diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 79b82ff..44f2dc8 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -892,7 +892,7 @@ public class XML_processing { if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { // ngram level: if not 0 must be less than or equal to number of words in this sentence. if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) { - return null; + return new ArrayList<>(); } // if we're calculating values for letters, omit words that are shorter than string length diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index 1e9ac94..df063c6 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -56,7 +56,8 @@ public class Ngrams { // String test = key; // } -// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; + if (stats.getFilter().getNotePunctuations()) + key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; MultipleHMKeys multipleKeys; @@ -165,6 +166,12 @@ public class Ngrams { // .collect(Collectors.toList())); // .substring(0, 1) return StringUtils.join(candidate, " "); + case NORMALIZED_WORD: + candidate.addAll(ngramCandidate + .stream() + .map(Word::getNormalizedWord) + .collect(Collectors.toList())); + return StringUtils.join(candidate, " "); } return StringUtils.join(candidate, " "); diff --git a/src/main/java/data/CalculateFor.java b/src/main/java/data/CalculateFor.java index 5146957..1dde1b2 100755 --- a/src/main/java/data/CalculateFor.java +++ b/src/main/java/data/CalculateFor.java @@ -2,6 +2,7 @@ package data; public enum CalculateFor { WORD("različnica"), + NORMALIZED_WORD("normalizirana različnica"), LEMMA("lema"), MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"), MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"), @@ -37,7 +38,56 @@ public enum CalculateFor { if (WORD_TYPE.toString().equals(cf)) { return WORD_TYPE; } + if (NORMALIZED_WORD.toString().equals(cf)) { + return NORMALIZED_WORD; + } } return null; } + + public String toHeaderString() { + switch(this){ + case WORD: + return "Različnica"; + case NORMALIZED_WORD: + return "Normalizirana različnica"; + case LEMMA: + return "Lema"; + case MORPHOSYNTACTIC_SPECS: + return "Oblikoskladenjska oznaka"; + case MORPHOSYNTACTIC_PROPERTY: + return "Oblikoskladenjska lastnost"; + case WORD_TYPE: + return "Besedna vrsta"; + case DIST_WORDS: + return "Različnica"; + case DIST_LEMMAS: + return "Lema"; + default: + return null; + } + } + + public String toPercentString() { + switch(this){ + case WORD: + return "Delež glede na vse različnice"; + case NORMALIZED_WORD: + return "Delež glede na vse normalizirane različnice"; + case LEMMA: + return "Delež glede na vse leme"; + case MORPHOSYNTACTIC_SPECS: + return "Delež glede na vse oblikoskladenjske oznake"; + case MORPHOSYNTACTIC_PROPERTY: + return "Delež glede na vse oblikoskladenjske lastnosti"; + case WORD_TYPE: + return "Delež glede na vse besedne vrste"; + case DIST_WORDS: + return "Delež glede na vse različnice"; + case DIST_LEMMAS: + return "Delež glede na vse leme"; + default: + return null; + } + } } diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java index be93d1b..18904d5 100755 --- a/src/main/java/data/MultipleHMKeys.java +++ b/src/main/java/data/MultipleHMKeys.java @@ -7,10 +7,10 @@ Created for when words are sorted by multiple keys, i.e. not just lemmas but lem */ public interface MultipleHMKeys { String getK1(); - String getK2(); - String getK3(); - String getK4(); - String getK5(); + default String getK2(){ return null; } + default String getK3(){ return null; } + default String getK4(){ return null; } + default String getK5(){ return null; } @Override int hashCode(); diff --git a/src/main/java/data/MultipleHMKeys1.java b/src/main/java/data/MultipleHMKeys1.java index 4698e3c..5c0898c 100755 --- a/src/main/java/data/MultipleHMKeys1.java +++ b/src/main/java/data/MultipleHMKeys1.java @@ -16,22 +16,6 @@ public final class MultipleHMKeys1 implements MultipleHMKeys { return k1; } - public String getK2() { - return null; - } - - public String getK3() { - return null; - } - - public String getK4() { - return null; - } - - public String getK5() { - return null; - } - @Override public int hashCode() { return k1.hashCode(); diff --git a/src/main/java/data/MultipleHMKeys2.java b/src/main/java/data/MultipleHMKeys2.java index 91388d5..9456e50 100755 --- a/src/main/java/data/MultipleHMKeys2.java +++ b/src/main/java/data/MultipleHMKeys2.java @@ -21,18 +21,6 @@ public final class MultipleHMKeys2 implements MultipleHMKeys { return k2; } - public String getK3() { - return null; - } - - public String getK4() { - return null; - } - - public String getK5() { - return null; - } - @Override public int hashCode() { return Objects.hash(k1, k2); diff --git a/src/main/java/data/MultipleHMKeys3.java b/src/main/java/data/MultipleHMKeys3.java index 5783ef3..8342273 100755 --- a/src/main/java/data/MultipleHMKeys3.java +++ b/src/main/java/data/MultipleHMKeys3.java @@ -26,14 +26,6 @@ public final class MultipleHMKeys3 implements MultipleHMKeys { return k3; } - public String getK4() { - return null; - } - - public String getK5() { - return null; - } - @Override public int hashCode() { return Objects.hash(k1, k2, k3); diff --git a/src/main/java/data/MultipleHMKeys4.java b/src/main/java/data/MultipleHMKeys4.java index 46e8d73..1d8ccac 100755 --- a/src/main/java/data/MultipleHMKeys4.java +++ b/src/main/java/data/MultipleHMKeys4.java @@ -31,10 +31,6 @@ public final class MultipleHMKeys4 implements MultipleHMKeys { return k4; } - public String getK5() { - return null; - } - @Override public int hashCode() { return Objects.hash(k1, k2, k3, k4); diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java index 5479291..2f10e59 100755 --- a/src/main/java/gui/OneWordAnalysisTab.java +++ b/src/main/java/gui/OneWordAnalysisTab.java @@ -87,8 +87,11 @@ public class OneWordAnalysisTab { private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka"); private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica"); + private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka", "normalizirana različnica"); private static final ObservableList alsoVisualizeItemsLemma = FXCollections.observableArrayList("besedna vrsta", "oblikoskladenjska oznaka"); - private static final ObservableList alsoVisualizeItemsDifferential = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka"); + private static final ObservableList alsoVisualizeItemsWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka"); + private static final ObservableList alsoVisualizeItemsWordGos = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "normalizirana različnica"); + private static final ObservableList alsoVisualizeItemsNormalizedWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "različnica"); private static final ObservableList alsoVisualizeItemsEmpty = FXCollections.observableArrayList(); // TODO: pass observables for taxonomy based on header scan @@ -103,37 +106,28 @@ public class OneWordAnalysisTab { // calculateForCB calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> { calculateFor = CalculateFor.factory(newValue); + + alsoVisualizeCCB.getItems().removeAll(); if(newValue.equals("lema")){ - alsoVisualizeCCB.getItems().removeAll(); alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsLemma); - alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> { - alsoVisualize = new ArrayList<>(); - ObservableList checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems(); - alsoVisualize.addAll(checkedItems); - logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ","))); - }); - alsoVisualizeCCB.getCheckModel().clearChecks(); - } else if(newValue.equals("različnica")){ - alsoVisualizeCCB.getItems().removeAll(); - alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsDifferential); - alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> { - alsoVisualize = new ArrayList<>(); - ObservableList checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems(); - alsoVisualize.addAll(checkedItems); - logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ","))); - }); - alsoVisualizeCCB.getCheckModel().clearChecks(); - } else { - alsoVisualizeCCB.getItems().removeAll(); + } else if(newValue.equals("različnica")) { + if (corpus.getCorpusType() == CorpusType.GOS) + alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsWordGos); + else + alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsWord); + } else if(newValue.equals("normalizirana različnica")) { + alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsNormalizedWord); + }else { alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsEmpty); - alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> { - alsoVisualize = new ArrayList<>(); - ObservableList checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems(); - alsoVisualize.addAll(checkedItems); - logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ","))); - }); - alsoVisualizeCCB.getCheckModel().clearChecks(); } + alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> { + alsoVisualize = new ArrayList<>(); + ObservableList checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems(); + alsoVisualize.addAll(checkedItems); + logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ","))); + }); + alsoVisualizeCCB.getCheckModel().clearChecks(); + logger.info("calculateForCB:", calculateFor.toString()); }); @@ -343,7 +337,10 @@ public class OneWordAnalysisTab { logger.info("mode: ", mode.toString()); if (mode == MODE.WORD) { - calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS); + if (corpus.getCorpusType() == CorpusType.GOS) + calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_GOS); + else + calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS); } else if (mode == MODE.LETTER) { calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS); diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index da47e0c..766edee 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -108,6 +108,7 @@ public class StringAnalysisTabNew2 { private HostServices hostService; private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka"); +// private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka", "normalizirana različnica"); private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica"); @@ -420,7 +421,11 @@ public class StringAnalysisTabNew2 { if (mode == MODE.WORD) { paneWords.setVisible(true); paneLetters.setVisible(false); +// if (corpus.getCorpusType() == CorpusType.GOS) +// calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_GOS); +// else calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS); + } else if (mode == MODE.LETTER) { paneWords.setVisible(false); paneLetters.setVisible(true); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 00a8ef3..704ba9a 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -87,83 +87,93 @@ public class Export { //CSV file header - if (headerInfoBlock.containsKey("Analiza") && (headerInfoBlock.get("Analiza").equals("Besede") || headerInfoBlock.get("Analiza").equals("Besedni nizi"))) { - if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) { - headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); - if (headerInfoBlock.get("Analiza").equals("Besede")){ - FILE_HEADER_AL.add("Različnica"); - } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { - FILE_HEADER_AL.add("Različnice"); - } - } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) { - headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies)); - if (headerInfoBlock.get("Analiza").equals("Besede")){ - FILE_HEADER_AL.add("Lema"); - FILE_HEADER_AL.add("Lema male črke"); - } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { - FILE_HEADER_AL.add("Leme"); - FILE_HEADER_AL.add("Leme male črke"); - } - } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { - headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies)); - if (headerInfoBlock.get("Analiza").equals("Besede")){ - FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); - } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { - FILE_HEADER_AL.add("Oblikoskladenjska oznake"); - } - } else { - headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); - FILE_HEADER_AL.add("Lema"); - FILE_HEADER_AL.add("Lema male črke"); - } + FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString()); + if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) + FILE_HEADER_AL.add("Lema male črke"); + +// if (headerInfoBlock.containsKey("Analiza") && (headerInfoBlock.get("Analiza").equals("Besede") || headerInfoBlock.get("Analiza").equals("Besedni nizi"))) { +// if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) { +// headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); +// if (headerInfoBlock.get("Analiza").equals("Besede")){ +// FILE_HEADER_AL.add("Različnica"); +// } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { +// FILE_HEADER_AL.add("Različnice"); +// } +// } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) { +// headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies)); +// if (headerInfoBlock.get("Analiza").equals("Besede")){ +// FILE_HEADER_AL.add("Lema"); +// FILE_HEADER_AL.add("Lema male črke"); +// } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { +// FILE_HEADER_AL.add("Leme"); +// FILE_HEADER_AL.add("Leme male črke"); +// } +// } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { +// headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies)); +// if (headerInfoBlock.get("Analiza").equals("Besede")){ +// FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); +// } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { +// FILE_HEADER_AL.add("Oblikoskladenjska oznake"); +// } +// } else { +// headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); +// FILE_HEADER_AL.add("Lema"); +// FILE_HEADER_AL.add("Lema male črke"); +// } // for (Map value : taxonomyResults.values()) { - for (CalculateFor otherKey : filter.getMultipleKeys()){ - if(otherKey.equals(CalculateFor.LEMMA)){ - FILE_HEADER_AL.add("Lema"); - FILE_HEADER_AL.add("Lema male črke"); - } - if(otherKey.equals(CalculateFor.WORD_TYPE)){ - FILE_HEADER_AL.add("Besedna vrsta"); - } - if(otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); - } - } + for (CalculateFor otherKey : filter.getMultipleKeys()) { + FILE_HEADER_AL.add(otherKey.toHeaderString()); + if (otherKey.equals(CalculateFor.LEMMA)) + FILE_HEADER_AL.add("Lema male črke"); + } + +// if(otherKey.equals(CalculateFor.LEMMA)){ +// FILE_HEADER_AL.add("Lema"); +// FILE_HEADER_AL.add("Lema male črke"); +// } +// if(otherKey.equals(CalculateFor.WORD_TYPE)){ +// FILE_HEADER_AL.add("Besedna vrsta"); +// } +// if(otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); +// } +// if(otherKey.equals(CalculateFor.NORMALIZED_WORD)){ +// FILE_HEADER_AL.add("Normalizirana različnica"); +// } +// } // break; // } - - if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) { - FILE_HEADER_AL.add("Skupna absolutna pogostost"); - FILE_HEADER_AL.add("Delež glede na vse različnice"); - } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) { - FILE_HEADER_AL.add("Skupna absolutna pogostost"); - FILE_HEADER_AL.add("Delež glede na vse leme"); - } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { - FILE_HEADER_AL.add("Skupna absolutna pogostost"); - FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake"); - } else { - FILE_HEADER_AL.add("Skupna absolutna pogostost"); - FILE_HEADER_AL.add("Delež glede na vse leme"); - } - FILE_HEADER_AL.add("Skupna relativna pogostost"); - for (String key : taxonomyResults.keySet()) { - if(!key.equals("Total")) { - FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); - FILE_HEADER_AL.add("Delež [" + key + "]"); - FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); - } + FILE_HEADER_AL.add("Skupna absolutna pogostost"); + FILE_HEADER_AL.add(filter.getCalculateFor().toPercentString()); + +// if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) { +// FILE_HEADER_AL.add("Delež glede na vse različnice"); +// } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) { +// FILE_HEADER_AL.add("Delež glede na vse leme"); +// } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { +// FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake"); +// } else { +// FILE_HEADER_AL.add("Delež glede na vse leme"); +// } + FILE_HEADER_AL.add("Skupna relativna pogostost"); + for (String key : taxonomyResults.keySet()) { + if(!key.equals("Total")) { + FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); + FILE_HEADER_AL.add("Delež [" + key + "]"); + FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); } - FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; - FILE_HEADER_AL.toArray(FILE_HEADER); - } else { - FILE_HEADER = new Object[]{"word", "frequency", "percent"}; } + FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; + FILE_HEADER_AL.toArray(FILE_HEADER); +// } else { +// FILE_HEADER = new Object[]{"word", "frequency", "percent"}; +// } String fileName = ""; diff --git a/src/main/resources/gui/StringAnalysisTabNew2.fxml b/src/main/resources/gui/StringAnalysisTabNew2.fxml index ba88540..b0ae28e 100755 --- a/src/main/resources/gui/StringAnalysisTabNew2.fxml +++ b/src/main/resources/gui/StringAnalysisTabNew2.fxml @@ -15,7 +15,28 @@ -