Added taxonomy presentation in results

2018-06-29 12:53:29 +02:00 · 2018-06-29 12:53:29 +02:00 · 8d7cce6c77
commit 8d7cce6c77
parent d5d06fd7c5
11 changed files with 212 additions and 30 deletions
--- a/pom.xml
+++ b/pom.xml
@ -93,7 +93,7 @@
                <!-- JavaFX -->
                <groupId>com.zenjava</groupId>
                <artifactId>javafx-maven-plugin</artifactId>
-                <version>8.6.0</version>
+                <version>8.8.3</version>
                <configuration>
                    <mainClass>gui.GUIController</mainClass>
                    <verbose>true</verbose>
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@ -473,6 +473,7 @@ public class XML_processing {
 	public static boolean readXMLGigafida(String path, StatisticsNew stats) {
 		boolean inWord = false;
 		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
+		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
 		String lemma = "";
 		String msd = "";

@ -508,7 +509,10 @@ public class XML_processing {

 							if (tax != null) {
 								// keep only taxonomy properties
-								currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
+								String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+								currentFiletaxonomy.add(currentFiletaxonomyElement);
+								Tax taxonomy = new Tax();
+								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
 							}
 						}
 						break;
@ -519,7 +523,7 @@ public class XML_processing {
 						// "word" node value
 						if (inWord) {
 							String word = characters.getData();
-							sentence.add(new Word(word, lemma, msd));
+							sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
 							inWord = false;
 						}
 						break;
@ -570,6 +574,7 @@ public class XML_processing {

 						// fallback
 						else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
+							// join corpus and stats
 							fj(corpus, stats);
 							corpus.clear();

--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@ -45,6 +45,8 @@ public class Ngrams {
 					continue;
 				}

+				// UPDATE TAXONOMY HERE!!!
+                stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate);
 				stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
 			}
 		}
@ -60,7 +62,8 @@ public class Ngrams {
 		}

 		for (int i = 0; i < regex.size(); i++) {
-			if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
+			//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
+			if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
 				return false;
 			}
 		}
--- a/src/main/java/data/StatisticsNew.java
+++ b/src/main/java/data/StatisticsNew.java
@ -32,6 +32,7 @@ public class StatisticsNew {

 	private String resultTitle;
 	private Map<String, AtomicLong> result;
+	private Map<String, Map<String, AtomicLong>> taxonomyResult;
 	private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
 	private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
 	private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
@ -43,6 +44,20 @@ public class StatisticsNew {
 	public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
 		this.corpus = corpus;
 		this.filter = filter;
+		this.taxonomyResult = new ConcurrentHashMap<>();
+
+		// create table for counting word occurrences per taxonomy
+
+		if (this.filter.getTaxonomy().isEmpty()) {
+			for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
+				this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
+			}
+		} else {
+			for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
+				Tax taxonomy = new Tax();
+				this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
+			}
+		}

 		if (useDB) {
 			this.useDB = true;
@ -189,7 +204,7 @@ public class StatisticsNew {
 		}

 		stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
-		Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
+		Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
 		return true;
 	}

@ -260,6 +275,28 @@ public class StatisticsNew {
 		return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
 	}

+	/**
+	 * Updates the per-taxonomy frequency tables for one n-gram occurrence.
+	 *
+	 * For every taxonomy key held in {@code taxonomyResult}: if the first word of the n-gram
+	 * lists that taxonomy, the counter stored under {@code o} is created with 1 or incremented;
+	 * otherwise a counter of 0 is registered under {@code o} unless one already exists.
+	 *
+	 * @param o              string form of the n-gram, used as the counting key
+	 * @param ngramCandidate words of the n-gram; only the first word's taxonomy is consulted
+	 */
+	public void updateTaxonomyResults(String o, List<Word> ngramCandidate) {
+		// first word should have the same taxonomy as others; a missing/empty n-gram or a
+		// Word created without a taxonomy list (getTaxonomy() == null) counts as "no taxonomy"
+		List<String> wordTaxonomy = null;
+		if (ngramCandidate != null && !ngramCandidate.isEmpty()) {
+			wordTaxonomy = ngramCandidate.get(0).getTaxonomy();
+		}
+
+		for (Map.Entry<String, Map<String, AtomicLong>> entry : taxonomyResult.entrySet()) {
+			Map<String, AtomicLong> counts = entry.getValue();
+
+			if (wordTaxonomy != null && wordTaxonomy.contains(entry.getKey())) {
+				// start at 1 when the key is new, otherwise bump the counter that is already stored
+				AtomicLong existing = counts.putIfAbsent(o, new AtomicLong(1));
+				if (existing != null) {
+					existing.incrementAndGet();
+				}
+			} else {
+				// register the key with a zero count without touching an existing counter
+				counts.putIfAbsent(o, new AtomicLong(0));
+			}
+		}
+	}
+
 	public void updateResults(String o) {
 		// if not in map
 		AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
@ -377,22 +414,22 @@ public class StatisticsNew {
 			}

 			// taksonomija
-			if (!isEmpty(filter.getTaxonomy())) {
-				info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
-			}
+//			if (!isEmpty(filter.getTaxonomy())) {
+//				info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
+//			}


 		}

-//		if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
-//			ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
-//
-//			info.put("Taksonomija: ", "");
-//			String sep = "";
-//			for (String s : tax) {
-//				info.put(sep = sep + " ", s);
-//			}
-//		}
+		if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
+			ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
+
+			info.put("Taksonomija: ", "");
+			String sep = "";
+			for (String s : tax) {
+				info.put(sep = sep + " ", s);
+			}
+		}

 		if (corpus.getCorpusType() == CorpusType.SOLAR) {
 			HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
--- a/src/main/java/data/Tax.java
+++ b/src/main/java/data/Tax.java
@ -172,4 +172,13 @@ public class Tax {

 		return result;
 	}
+
+	/**
+	 * Looks up the long taxonomy name for a short taxonomy code.
+	 *
+	 * {@code GIGAFIDA_TAXONOMY} is consulted first, then {@code GOS_TAXONOMY}.
+	 *
+	 * @param shortName short taxonomy code used as the map key
+	 * @return the value mapped to {@code shortName}, or {@code null} when neither map contains it
+	 */
+	public static String getLongTaxonomyName(String shortName) {
+		if (GIGAFIDA_TAXONOMY.containsKey(shortName)) {
+			return GIGAFIDA_TAXONOMY.get(shortName);
+		}
+		return GOS_TAXONOMY.containsKey(shortName) ? GOS_TAXONOMY.get(shortName) : null;
+	}
 }
--- a/src/main/java/data/Word.java
+++ b/src/main/java/data/Word.java
@ -3,6 +3,7 @@ package data;
 import java.io.Serializable;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;

 import org.apache.commons.lang3.StringUtils;

@ -15,6 +16,7 @@ public class Word implements Serializable {
 	private String word;
 	private String lemma;
 	private String msd;
+	private List<String> taxonomy;
 	private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));

 	/**
@ -50,6 +52,22 @@ public class Word implements Serializable {
 		}
 	}

+	//private char besedna_vrsta;
+	/**
+	 * Creates a word that also carries a taxonomy list.
+	 *
+	 * @param word     surface form; lower-cased unless the normalized MSD is empty or starts with "Sl"
+	 * @param lemma    lemma, stored as given
+	 * @param msd      tag, stored after passing through {@code normalizeMsd}
+	 * @param taxonomy taxonomy list, stored by reference (not copied)
+	 */
+	public Word(String word, String lemma, String msd, List<String> taxonomy) {
+		this.lemma = lemma;
+		this.msd = normalizeMsd(msd);
+		this.taxonomy = taxonomy;
+
+		// keep the capital letter only for proper nouns (MSD starting with "Sl")
+		boolean hasMsd = !ValidationUtil.isEmpty(this.msd);
+		boolean properNoun = hasMsd && this.msd.startsWith("Sl");
+		this.word = (hasMsd && !properNoun) ? word.toLowerCase() : word;
+	}
+
 	public Word() {
 	}

@ -99,6 +117,10 @@ public class Word implements Serializable {
 		this.word = word;
 	}

+	/**
+	 * Returns the taxonomy list stored on this word.
+	 *
+	 * @return the stored list itself (not a copy); {@code null} if the word was created without one
+	 */
+	public List<String> getTaxonomy() {
+		return this.taxonomy;
+	}
+
 	public String getLemma() {
 		return lemma;
 	}
--- a/src/main/java/gui/CharacterAnalysisTab.java
+++ b/src/main/java/gui/CharacterAnalysisTab.java
@ -67,6 +67,9 @@ public class CharacterAnalysisTab {
 	@FXML
 	private Button computeNgramsB;

+	@FXML
+	private Button cancel;
+
 	@FXML
 	public ProgressBar ngramProgressBar;
 	@FXML
@ -192,6 +195,8 @@ public class CharacterAnalysisTab {
 		});

        helpH.setOnAction(e -> openHelpWebsite());
+
+		cancel.setVisible(false);
 	}

 	/**
@ -399,6 +404,10 @@ public class CharacterAnalysisTab {
 				for (File f : corpusFiles) {
 					readXML(f.toString(), statistic);
 					i++;
+					if (isCancelled()) {
+						updateMessage(CANCELING_NOTIFICATION);
+						break;
+					}
 					this.updateProgress(i, corpusFiles.size());
 					this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
 				}
@ -427,6 +436,7 @@ public class CharacterAnalysisTab {
 			ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
 			progressLabel.textProperty().unbind();
 			progressLabel.setText("");
+			cancel.setVisible(false);
 		});

 		task.setOnFailed(e -> {
@ -437,8 +447,27 @@ public class CharacterAnalysisTab {
 			ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
 			progressLabel.textProperty().unbind();
 			progressLabel.setText("");
+			cancel.setVisible(false);
 		});

+		task.setOnCancelled(e -> {
+			showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_CANCLED);
+			ngramProgressBar.progressProperty().unbind();
+			ngramProgressBar.setProgress(0.0);
+			ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
+			progressLabel.textProperty().unbind();
+			progressLabel.setText("");
+			cancel.setVisible(false);
+		});
+
+		// When cancel button is pressed cancel analysis
+		cancel.setOnAction(e -> {
+			task.cancel();
+			logger.info("cancel button");
+		});
+
+		cancel.setVisible(true);
+
 		final Thread thread = new Thread(task, "task");
 		thread.setDaemon(true);
 		thread.start();
--- a/src/main/java/gui/CorpusTab.java
+++ b/src/main/java/gui/CorpusTab.java
@ -83,7 +83,7 @@ public class CorpusTab {
 	private OneWordAnalysisTab oneWordTabController;
 	private CharacterAnalysisTab catController;
 	private FiltersForSolar ffsController;
-	//private WordFormationTab wfController;
+	private WordFormationTab wfController;
 	private WordLevelTab wlController;
 	private HostServices hostService;

@ -383,7 +383,7 @@ public class CorpusTab {
 			characterLevelTab.setDisable(false);
 			catController.setCorpus(corpus);
 			catController.init();
-			wordFormationTab.setDisable(false);
+			//wordFormationTab.setDisable(false);
 			wordLevelTab.setDisable(false);
 			//wfController.setCorpus(corpus);
 			//wfController.init();
--- a/src/main/java/gui/OneWordAnalysisTab.java
+++ b/src/main/java/gui/OneWordAnalysisTab.java
@ -164,6 +164,8 @@ public class OneWordAnalysisTab {
            logger.info("compute button");
        });
        helpH.setOnAction(e -> openHelpWebsite());
+
+        cancel.setVisible(false);
    }

    /**
@ -384,6 +386,7 @@ public class OneWordAnalysisTab {
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
+            cancel.setVisible(false);
        });

        task.setOnFailed(e -> {
@ -394,6 +397,7 @@ public class OneWordAnalysisTab {
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
+            cancel.setVisible(false);
        });

        task.setOnCancelled(e -> {
@ -403,6 +407,7 @@ public class OneWordAnalysisTab {
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
+            cancel.setVisible(false);
        });

        // When cancel button is pressed cancel analysis
@ -411,6 +416,7 @@ public class OneWordAnalysisTab {
            logger.info("cancel button");
        });

+        cancel.setVisible(true);
        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
--- a/src/main/java/gui/StringAnalysisTabNew2.java
+++ b/src/main/java/gui/StringAnalysisTabNew2.java
@ -71,6 +71,9 @@ public class StringAnalysisTabNew2 {
    @FXML
    private Button computeNgramsB;

+    @FXML
+    private Button cancel;
+
    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
@ -231,6 +234,8 @@ public class StringAnalysisTabNew2 {
        });

        helpH.setOnAction(e -> openHelpWebsite());
+
+        cancel.setVisible(false);
    }

    /**
@ -457,6 +462,10 @@ public class StringAnalysisTabNew2 {
                for (File f : corpusFiles) {
                    readXML(f.toString(), statistic);
                    i++;
+                    if (isCancelled()) {
+                        updateMessage(CANCELING_NOTIFICATION);
+                        break;
+                    }
                    this.updateProgress(i, corpusFiles.size());
                    this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
                }
@ -485,6 +494,7 @@ public class StringAnalysisTabNew2 {
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
+            cancel.setVisible(false);
        });

        task.setOnFailed(e -> {
@ -495,8 +505,27 @@ public class StringAnalysisTabNew2 {
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
+            cancel.setVisible(false);
        });

+        task.setOnCancelled(e -> {
+            showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_CANCLED);
+            ngramProgressBar.progressProperty().unbind();
+            ngramProgressBar.setProgress(0.0);
+            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
+            progressLabel.textProperty().unbind();
+            progressLabel.setText("");
+            cancel.setVisible(false);
+        });
+
+        // When cancel button is pressed cancel analysis
+        cancel.setOnAction(e -> {
+            task.cancel();
+            logger.info("cancel button");
+        });
+
+        cancel.setVisible(true);
+
        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
--- a/src/main/java/util/Export.java
+++ b/src/main/java/util/Export.java
@ -5,7 +5,11 @@ import static util.Util.*;
 import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicLong;

+import data.Filter;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import org.apache.commons.lang3.tuple.Pair;
@ -52,17 +56,29 @@ public class Export {
 		}
 	}

-	public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
+	public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
+								  Map<String, Map<String, AtomicLong>> taxonomyResults) {
 		//Delimiter used in CSV file
 		String NEW_LINE_SEPARATOR = "\n";
+		List<Object> FILE_HEADER_AL = new ArrayList<Object>();
 		Object[] FILE_HEADER;

 		//Count frequencies
-		int num_frequencies = 0;
+		long num_frequencies = 0;
 		for (Pair<String, Map<String, Long>> p : set) {
 			Map<String, Long> map = p.getRight();
-			for (Map.Entry<String, Long> e : map.entrySet()) {
-				num_frequencies += e.getValue();
+			if (map.isEmpty())
+				continue;
+			num_frequencies = Util.mapSumFrequencies(map);
+		}
+
+		Map<String, Long> num_taxonomy_frequencies = new ConcurrentHashMap<>();
+		for (String taxonomyKey : taxonomyResults.keySet()) {
+			num_taxonomy_frequencies.put(taxonomyKey, (long) 0);
+			for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){
+				long val = num_taxonomy_frequencies.get(taxonomyKey);
+				val += value.get();
+				num_taxonomy_frequencies.put(taxonomyKey, val);
 			}
 		}

@ -71,19 +87,36 @@ public class Export {
 		if (headerInfoBlock.containsKey("Analiza") && headerInfoBlock.get("Analiza").equals("Besede")) {
 			if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) {
 				headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
-				FILE_HEADER = new Object[]{"Različnica", "Skupna absolutna pogostost", "Delež glede na vse različnice"};
+				FILE_HEADER_AL.add("Različnica");
+				FILE_HEADER_AL.add("Skupna absolutna pogostost");
+				FILE_HEADER_AL.add("Delež glede na vse različnice");
 			} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) {
 				headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies));
-				FILE_HEADER = new Object[]{"Lema", "Skupna absolutna pogostost", "Delež glede na vse leme"};
+				FILE_HEADER_AL.add("Lema");
+				FILE_HEADER_AL.add("Skupna absolutna pogostost");
+				FILE_HEADER_AL.add("Delež glede na vse leme");
 			} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) {
 				headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies));
-				FILE_HEADER = new Object[]{"Oblikoskladenjska oznaka", "Skupna absolutna pogostost", "Delež glede na vse oblikoskladenjske oznake"};
+				FILE_HEADER_AL.add("Oblikoskladenjska oznaka");
+				FILE_HEADER_AL.add("Skupna absolutna pogostost");
+				FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake");
 			} else {
 				headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
-				FILE_HEADER = new Object[]{"Lema", "Skupna pogostost", "Delež glede na leme"};
+				FILE_HEADER_AL.add("Lema");
+				FILE_HEADER_AL.add("Skupna absolutna pogostost");
+				FILE_HEADER_AL.add("Delež glede na vse leme");
 			}
-		} else
+			FILE_HEADER_AL.add("Skupna relativna pogostost");
+			for (String key : taxonomyResults.keySet()) {
+				FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
+				FILE_HEADER_AL.add("Delež [" + key + "]");
+				FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
+			}
+			FILE_HEADER = new String[ FILE_HEADER_AL.size() ];
+			FILE_HEADER_AL.toArray(FILE_HEADER);
+		} else {
 			FILE_HEADER = new Object[]{"word", "frequency", "percent"};
+		}

 		String fileName = "";

@ -99,7 +132,7 @@ public class Export {
 			if (map.isEmpty())
 				continue;

-			long total = Util.mapSumFrequencies(map);
+//			long total = Util.mapSumFrequencies(map);

 			OutputStreamWriter fileWriter = null;
 			CSVPrinter csvFilePrinter = null;
@ -124,7 +157,16 @@ public class Export {
 					List dataEntry = new ArrayList<>();
 					dataEntry.add(e.getKey());
 					dataEntry.add(e.getValue().toString());
-					dataEntry.add(formatNumberAsPercent((double) e.getValue() / total));
+					dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
+					dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies));
+					for (String key : taxonomyResults.keySet()){
+						AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
+						dataEntry.add(frequency.toString());
+						dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key)));
+						dataEntry.add(String.format("%.2f", ((double) frequency.get() * 10000) / num_taxonomy_frequencies.get(key)));
+
+					}
+
 					csvFilePrinter.printRecord(dataEntry);
 				}
 			} catch (Exception e) {