diff --git a/pom.xml b/pom.xml index 1f4b81a..17a3e90 100644 --- a/pom.xml +++ b/pom.xml @@ -93,7 +93,7 @@ com.zenjava javafx-maven-plugin - 8.6.0 + 8.8.3 gui.GUIController true diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 94c94cc..b47c190 100644 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -473,6 +473,7 @@ public class XML_processing { public static boolean readXMLGigafida(String path, StatisticsNew stats) { boolean inWord = false; ArrayList currentFiletaxonomy = new ArrayList<>(); + ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; String msd = ""; @@ -508,7 +509,10 @@ public class XML_processing { if (tax != null) { // keep only taxonomy properties - currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", "")); + String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", ""); + currentFiletaxonomy.add(currentFiletaxonomyElement); + Tax taxonomy = new Tax(); + currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } } break; @@ -519,7 +523,7 @@ public class XML_processing { // "word" node value if (inWord) { String word = characters.getData(); - sentence.add(new Word(word, lemma, msd)); + sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong)); inWord = false; } break; @@ -570,6 +574,7 @@ public class XML_processing { // fallback else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { + // join corpus and stats fj(corpus, stats); corpus.clear(); diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index 4b0f930..429b5a6 100644 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -45,6 +45,8 @@ public class Ngrams { continue; } + // UPDATE TAXONOMY HERE!!! + stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate); stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); } } @@ -60,7 +62,8 @@ public class Ngrams { } for (int i = 0; i < regex.size(); i++) { - if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) { + //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) { + if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) { return false; } } diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index 618f85d..6c27265 100644 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -32,6 +32,7 @@ public class StatisticsNew { private String resultTitle; private Map result; + private Map> taxonomyResult; private Object[][] resultCustom; // for when calculating percentages that don't add up to 100% private Map> resultNestedSuffix; private Map> resultNestedPrefix; @@ -43,6 +44,20 @@ public class StatisticsNew { public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) { this.corpus = corpus; this.filter = filter; + this.taxonomyResult = new ConcurrentHashMap<>(); + + // create table for counting word occurances per taxonomies + + if (this.filter.getTaxonomy().isEmpty()) { + for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) { + this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>()); + } + } else { + for (int i = 0; i < this.filter.getTaxonomy().size(); i++) { + Tax taxonomy = new Tax(); + this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>()); + } + } if (useDB) { this.useDB = true; @@ -189,7 +204,7 @@ public class StatisticsNew { } stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit)))); - Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock()); + Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult); return true; } @@ -260,6 +275,28 @@ public class StatisticsNew { return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit); } + public void updateTaxonomyResults(String o, List ngramCandidate) { + for (String key : taxonomyResult.keySet()) { + // first word should have the same taxonomy as others + if (ngramCandidate.get(0).getTaxonomy().contains(key)) { + // if taxonomy not in map and in this word + AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1)); + + if (r != null) + taxonomyResult.get(key).get(o).incrementAndGet(); + } else { + // if taxonomy not in map and not in this word + AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0)); + } + } + + // if not in map + + + // else + + } + public void updateResults(String o) { // if not in map AtomicLong r = result.putIfAbsent(o, new AtomicLong(1)); @@ -377,22 +414,22 @@ public class StatisticsNew { } // taksonomija - if (!isEmpty(filter.getTaxonomy())) { - info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", ")); - } +// if (!isEmpty(filter.getTaxonomy())) { +// info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", ")); +// } } -// if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { -// ArrayList tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy()); -// -// info.put("Taksonomija: ", ""); -// String sep = ""; -// for (String s : tax) { -// info.put(sep = sep + " ", s); -// } -// } + if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + ArrayList tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy()); + + info.put("Taksonomija: ", ""); + String sep = ""; + for (String s : tax) { + info.put(sep = sep + " ", s); + } + } if (corpus.getCorpusType() == CorpusType.SOLAR) { HashMap> filters = corpus.getSolarFilters(); diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index c1d6251..6324fd0 100644 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -172,4 +172,13 @@ public class Tax { return result; } + + public static String getLongTaxonomyName(String shortName){ + if (GIGAFIDA_TAXONOMY.containsKey(shortName)) + return GIGAFIDA_TAXONOMY.get(shortName); + else if(GOS_TAXONOMY.containsKey(shortName)) + return GOS_TAXONOMY.get(shortName); + else + return null; + } } diff --git a/src/main/java/data/Word.java b/src/main/java/data/Word.java index 154dd24..5cff321 100644 --- a/src/main/java/data/Word.java +++ b/src/main/java/data/Word.java @@ -3,6 +3,7 @@ package data; import java.io.Serializable; import java.util.Arrays; import java.util.HashSet; +import java.util.List; import org.apache.commons.lang3.StringUtils; @@ -15,6 +16,7 @@ public class Word implements Serializable { private String word; private String lemma; private String msd; + private List taxonomy; private final HashSet VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u')); /** @@ -50,6 +52,22 @@ public class Word implements Serializable { } } + //private char besedna_vrsta; + public Word(String word, String lemma, String msd, List taxonomy) { + this.lemma = lemma; + this.msd = normalizeMsd(msd); + this.taxonomy = taxonomy; + + // veliko zacetnico ohranimo samo za lastna imena + if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S' + && this.msd.length() >= 2 + && this.msd.charAt(1) == 'l')) { + this.word = word.toLowerCase(); + } else { + this.word = word; + } + } + public Word() { } @@ -99,6 +117,10 @@ public class Word implements Serializable { this.word = word; } + public List getTaxonomy() { + return taxonomy; + } + public String getLemma() { return lemma; } diff --git a/src/main/java/gui/CharacterAnalysisTab.java b/src/main/java/gui/CharacterAnalysisTab.java index cd1f6e3..12327a7 100644 --- a/src/main/java/gui/CharacterAnalysisTab.java +++ b/src/main/java/gui/CharacterAnalysisTab.java @@ -67,6 +67,9 @@ public class CharacterAnalysisTab { @FXML private Button computeNgramsB; + @FXML + private Button cancel; + @FXML public ProgressBar ngramProgressBar; @FXML @@ -192,6 +195,8 @@ public class CharacterAnalysisTab { }); helpH.setOnAction(e -> openHelpWebsite()); + + cancel.setVisible(false); } /** @@ -399,6 +404,10 @@ public class CharacterAnalysisTab { for (File f : corpusFiles) { readXML(f.toString(), statistic); i++; + if (isCancelled()) { + updateMessage(CANCELING_NOTIFICATION); + break; + } this.updateProgress(i, corpusFiles.size()); this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName())); } @@ -427,6 +436,7 @@ public class CharacterAnalysisTab { ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); }); task.setOnFailed(e -> { @@ -437,8 +447,27 @@ public class CharacterAnalysisTab { ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); + }); + + task.setOnCancelled(e -> { + showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_CANCLED); + ngramProgressBar.progressProperty().unbind(); + ngramProgressBar.setProgress(0.0); + ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); + progressLabel.textProperty().unbind(); + progressLabel.setText(""); + cancel.setVisible(false); }); + // When cancel button is pressed cancel analysis + cancel.setOnAction(e -> { + task.cancel(); + logger.info("cancel button"); + }); + + cancel.setVisible(true); + final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/gui/CorpusTab.java b/src/main/java/gui/CorpusTab.java index 48ce9a0..5171b02 100644 --- a/src/main/java/gui/CorpusTab.java +++ b/src/main/java/gui/CorpusTab.java @@ -83,7 +83,7 @@ public class CorpusTab { private OneWordAnalysisTab oneWordTabController; private CharacterAnalysisTab catController; private FiltersForSolar ffsController; - //private WordFormationTab wfController; + private WordFormationTab wfController; private WordLevelTab wlController; private HostServices hostService; @@ -383,7 +383,7 @@ public class CorpusTab { characterLevelTab.setDisable(false); catController.setCorpus(corpus); catController.init(); - wordFormationTab.setDisable(false); + //wordFormationTab.setDisable(false); wordLevelTab.setDisable(false); //wfController.setCorpus(corpus); //wfController.init(); diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java index 2f7572d..0755ede 100755 --- a/src/main/java/gui/OneWordAnalysisTab.java +++ b/src/main/java/gui/OneWordAnalysisTab.java @@ -164,6 +164,8 @@ public class OneWordAnalysisTab { logger.info("compute button"); }); helpH.setOnAction(e -> openHelpWebsite()); + + cancel.setVisible(false); } /** @@ -384,6 +386,7 @@ public class OneWordAnalysisTab { ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); }); task.setOnFailed(e -> { @@ -394,6 +397,7 @@ public class OneWordAnalysisTab { ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); }); task.setOnCancelled(e -> { @@ -403,6 +407,7 @@ public class OneWordAnalysisTab { ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); }); // When cancel button is pressed cancel analysis @@ -411,6 +416,7 @@ public class OneWordAnalysisTab { logger.info("cancel button"); }); + cancel.setVisible(true); final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index 1db9293..b861d3b 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -71,6 +71,9 @@ public class StringAnalysisTabNew2 { @FXML private Button computeNgramsB; + @FXML + private Button cancel; + @FXML public ProgressBar ngramProgressBar; @FXML @@ -231,6 +234,8 @@ public class StringAnalysisTabNew2 { }); helpH.setOnAction(e -> openHelpWebsite()); + + cancel.setVisible(false); } /** @@ -457,6 +462,10 @@ public class StringAnalysisTabNew2 { for (File f : corpusFiles) { readXML(f.toString(), statistic); i++; + if (isCancelled()) { + updateMessage(CANCELING_NOTIFICATION); + break; + } this.updateProgress(i, corpusFiles.size()); this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName())); } @@ -485,6 +494,7 @@ public class StringAnalysisTabNew2 { ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); }); task.setOnFailed(e -> { @@ -495,8 +505,27 @@ public class StringAnalysisTabNew2 { ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK); progressLabel.textProperty().unbind(); progressLabel.setText(""); + cancel.setVisible(false); + }); + + task.setOnCancelled(e -> { + showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_CANCLED); + ngramProgressBar.progressProperty().unbind(); + ngramProgressBar.setProgress(0.0); + ngramProgressBar.setStyle(Settings.FX_ACCENT_OK); + progressLabel.textProperty().unbind(); + progressLabel.setText(""); + cancel.setVisible(false); }); + // When cancel button is pressed cancel analysis + cancel.setOnAction(e -> { + task.cancel(); + logger.info("cancel button"); + }); + + cancel.setVisible(true); + final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 9b5e3be..1627312 100644 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -5,7 +5,11 @@ import static util.Util.*; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicLong; +import data.Filter; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; import org.apache.commons.lang3.tuple.Pair; @@ -52,17 +56,29 @@ public class Export { } } - public static String SetToCSV(Set>> set, File resultsPath, LinkedHashMap headerInfoBlock) { + public static String SetToCSV(Set>> set, File resultsPath, LinkedHashMap headerInfoBlock, + Map> taxonomyResults) { //Delimiter used in CSV file String NEW_LINE_SEPARATOR = "\n"; + List FILE_HEADER_AL = new ArrayList(); Object[] FILE_HEADER; //Count frequencies - int num_frequencies = 0; + long num_frequencies = 0; for (Pair> p : set) { Map map = p.getRight(); - for (Map.Entry e : map.entrySet()) { - num_frequencies += e.getValue(); + if (map.isEmpty()) + continue; + num_frequencies = Util.mapSumFrequencies(map); + } + + Map num_taxonomy_frequencies = new ConcurrentHashMap<>(); + for (String taxonomyKey : taxonomyResults.keySet()) { + num_taxonomy_frequencies.put(taxonomyKey, (long) 0); + for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){ + long val = num_taxonomy_frequencies.get(taxonomyKey); + val += value.get(); + num_taxonomy_frequencies.put(taxonomyKey, val); } } @@ -71,19 +87,36 @@ public class Export { if (headerInfoBlock.containsKey("Analiza") && headerInfoBlock.get("Analiza").equals("Besede")) { if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) { headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); - FILE_HEADER = new Object[]{"Različnica", "Skupna absolutna pogostost", "Delež glede na vse različnice"}; + FILE_HEADER_AL.add("Različnica"); + FILE_HEADER_AL.add("Skupna absolutna pogostost"); + FILE_HEADER_AL.add("Delež glede na vse različnice"); } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) { headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies)); - FILE_HEADER = new Object[]{"Lema", "Skupna absolutna pogostost", "Delež glede na vse leme"}; + FILE_HEADER_AL.add("Lema"); + FILE_HEADER_AL.add("Skupna absolutna pogostost"); + FILE_HEADER_AL.add("Delež glede na vse leme"); } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies)); - FILE_HEADER = new Object[]{"Oblikoskladenjska oznaka", "Skupna absolutna pogostost", "Delež glede na vse oblikoskladenjske oznake"}; + FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); + FILE_HEADER_AL.add("Skupna absolutna pogostost"); + FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake"); } else { headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); - FILE_HEADER = new Object[]{"Lema", "Skupna pogostost", "Delež glede na leme"}; + FILE_HEADER_AL.add("Lema"); + FILE_HEADER_AL.add("Skupna absolutna pogostost"); + FILE_HEADER_AL.add("Delež glede na vse leme"); + } + FILE_HEADER_AL.add("Skupna relativna pogostost"); + for (String key : taxonomyResults.keySet()) { + FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); + FILE_HEADER_AL.add("Delež [" + key + "]"); + FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); } - } else + FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; + FILE_HEADER_AL.toArray(FILE_HEADER); + } else { FILE_HEADER = new Object[]{"word", "frequency", "percent"}; + } String fileName = ""; @@ -99,7 +132,7 @@ public class Export { if (map.isEmpty()) continue; - long total = Util.mapSumFrequencies(map); +// long total = Util.mapSumFrequencies(map); OutputStreamWriter fileWriter = null; CSVPrinter csvFilePrinter = null; @@ -124,7 +157,16 @@ public class Export { List dataEntry = new ArrayList<>(); dataEntry.add(e.getKey()); dataEntry.add(e.getValue().toString()); - dataEntry.add(formatNumberAsPercent((double) e.getValue() / total)); + dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); + dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies)); + for (String key : taxonomyResults.keySet()){ + AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); + dataEntry.add(frequency.toString()); + dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); + dataEntry.add(String.format("%.2f", ((double) frequency.get() * 10000) / num_taxonomy_frequencies.get(key))); + + } + csvFilePrinter.printRecord(dataEntry); } } catch (Exception e) {