diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java
index 6bcfc3c..d86feba 100644
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -6,14 +6,11 @@ import java.util.List;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
+import data.*;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
-import data.CalculateFor;
-import data.Sentence;
-import data.StatisticsNew;
-import data.Word;
 import gui.ValidationUtil;
 
 public class Ngrams {
@@ -45,9 +42,26 @@ public class Ngrams {
                 continue;
             }
 
+            // generate proper MultipleHMKeys depending on filter data
+            String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
+            String lemma = "";
+            String wordType = "";
+            String msd = "";
+            for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
+                if(otherKey.toString().equals("lema")){
+                    lemma = wordToString(ngramCandidate, otherKey);
+                } else if(otherKey.toString().equals("besedna vrsta")){
+                    wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
+                } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
+                    msd = wordToString(ngramCandidate, otherKey);
+                }
+            }
+
+            MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
+
             // UPDATE TAXONOMY HERE!!!
-            stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate.get(0).getTaxonomy());
-            stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
+            stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
+//            stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
         }
     }
 }
@@ -129,7 +143,9 @@ public class Ngrams {
 
         for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
             // TODO: locila?
-            stats.updateTaxonomyResults(word.substring(i, i + stats.getFilter().getStringLength()), taxonomy);
+
+            MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));
+            stats.updateTaxonomyResults(multipleKeys, taxonomy);
 
 //            stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
diff --git a/src/main/java/data/Filter.java b/src/main/java/data/Filter.java
index 733cb67..14a2cee 100644
--- a/src/main/java/data/Filter.java
+++ b/src/main/java/data/Filter.java
@@ -24,7 +24,8 @@ public class Filter {
         TAXONOMY,
         MSD,
         HAS_MSD,
-        SOLAR_FILTERS
+        SOLAR_FILTERS,
+        MULTIPLE_KEYS
     }
 
     public Filter() {
@@ -141,4 +142,23 @@ public class Filter {
     public HashMap<String, HashSet<String>> getSolarFilters() {
         return (HashMap<String, HashSet<String>>) filter.get(SOLAR_FILTERS);
     }
+
+    public void setMultipleKeys(ArrayList<String> keys) {
+        ArrayList<CalculateFor> newKeys = new ArrayList<>();
+        if (keys != null) {
+            for (String key : keys) {
+                newKeys.add(CalculateFor.factory(key));
+            }
+        }
+
+        filter.put(MULTIPLE_KEYS, newKeys);
+    }
+
+    public ArrayList<CalculateFor> getMultipleKeys() {
+        if (filter.containsKey(MULTIPLE_KEYS) && filter.get(MULTIPLE_KEYS) != null) {
+            return (ArrayList<CalculateFor>) filter.get(MULTIPLE_KEYS);
+        } else {
+            return new ArrayList<>();
+        }
+    }
 }
diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java
index 61e06a0..e816382 100644
--- a/src/main/java/data/MultipleHMKeys.java
+++ b/src/main/java/data/MultipleHMKeys.java
@@ -2,48 +2,54 @@ package data;
 /*
 Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously.
  */
-final class MultipleHMKeys {
-    private final String key1, key2, key3;
+public final class MultipleHMKeys {
+    private final String key, lemma, wordType, msd;
 
-    public MultipleHMKeys(String key1) {
-        this.key1 = key1;
-        this.key2 = null;
-        this.key3 = null;
+    public MultipleHMKeys(String key) {
+        this.key = key;
+        this.lemma = "";
+        this.wordType = "";
+        this.msd = "";
     }
 
-    public MultipleHMKeys(String key1, String key2) {
-        this.key1 = key1;
-        this.key2 = key2;
-        this.key3 = null;
+    public MultipleHMKeys(String key, String lemma, String wordType, String msd) {
+        this.key = key;
+        this.lemma = lemma;
+        this.wordType = wordType;
+        this.msd = msd;
    }
 
-    public MultipleHMKeys(String key1, String key2, String key3) {
-        this.key1 = key1;
-        this.key2 = key2;
-        this.key3 = key3;
+    public String getKey() {
+        return key;
    }
 
-    public String getKey1() {
-        return key1;
+    public String getLemma() {
+        return lemma;
    }
 
-    public String getKey2() {
-        return key2;
+    public String getWordType() {
+        return wordType;
    }
 
-    public String getKey3() {
-        return key3;
+    public String getMsd() {
+        return msd;
    }
 
    @Override
    public int hashCode() {
-        return key1.hashCode() ^ key2.hashCode() ^ key3.hashCode();
+//        if(key2 == null){
+//            return key1.hashCode();
+//        } else if (key3 == null){
+//            return key1.hashCode() ^ key2.hashCode();
+//        }
+        return key.hashCode() ^ lemma.hashCode() ^ wordType.hashCode() ^ msd.hashCode();
    }
 
    @Override
    public boolean equals(Object obj) {
-        return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key1.equals(key1)
-            && ((MultipleHMKeys) obj).key2.equals(key2)
-            && ((MultipleHMKeys) obj).key3.equals(key3);
+        return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key.equals(key)
+            && ((MultipleHMKeys) obj).lemma.equals(lemma)
+            && ((MultipleHMKeys) obj).wordType.equals(wordType)
+            && ((MultipleHMKeys) obj).msd.equals(msd);
    }
 }
diff --git a/src/main/java/data/Statistics.java b/src/main/java/data/Statistics.java
index 8ec972d..bc76cc9 100644
--- a/src/main/java/data/Statistics.java
+++ b/src/main/java/data/Statistics.java
@@ -222,7 +222,7 @@
 //        return sortedM;
 //    }
 
-    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
+    private Map<MultipleHMKeys, Long> getSortedResult(Map<MultipleHMKeys, AtomicLong> map, int limit) {
         return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
     }
 
diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java
index 4da3226..115a57b 100644
--- a/src/main/java/data/StatisticsNew.java
+++ b/src/main/java/data/StatisticsNew.java
@@ -32,10 +32,10 @@ public class StatisticsNew {
     private String resultTitle;
     private Map<String, AtomicLong> result;
-    private Map<String, ConcurrentHashMap<String, AtomicLong>> taxonomyResult;
+    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> taxonomyResult;
     private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
-    private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
-    private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
+    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedSuffix;
+    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedPrefix;
     private boolean useDB;
     private RDB db;
     private boolean analysisProducedResults;
@@ -194,7 +194,7 @@
     }
 
     public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
-        Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
+        Set<Pair<String, Map<MultipleHMKeys, Long>>> stats = new HashSet<>();
 
         if (useDB) {
             result = db.getDump();
@@ -223,13 +223,14 @@
         }
 
         Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
-        if (!isEmpty(resultNestedSuffix)) {
-            results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
-        }
-
-        if (!isEmpty(resultNestedPrefix)) {
-            results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
-        }
+        // UNCOMMENT!!!!!!
+//        if (!isEmpty(resultNestedSuffix)) {
+//            results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
+//        }
+//
+//        if (!isEmpty(resultNestedPrefix)) {
+//            results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
+//        }
 
         // if no results and nothing to save, return false
         if (!(results.size() > 0)) {
@@ -266,8 +267,8 @@ public class StatisticsNew {
         return true;
     }
 
-    private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
-        Map<String, Map<String, Long>> sorted = new HashMap<>();
+    private Map<String, Map<MultipleHMKeys, Long>> sortNestedMap(Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> nestedMap, int limit) {
+        Map<String, Map<MultipleHMKeys, Long>> sorted = new HashMap<>();
         for (String s : nestedMap.keySet()) {
             sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
         }
@@ -277,11 +278,11 @@
     }
 
-    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
+    private Map<MultipleHMKeys, Long> getSortedResult(Map<MultipleHMKeys, AtomicLong> map, int limit) {
         return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
     }
 
-    public void updateTaxonomyResults(String o, List<String> taxonomy) {
+    public void updateTaxonomyResults(MultipleHMKeys o, List<String> taxonomy) {
         for (String key : taxonomyResult.keySet()) {
             // first word should have the same taxonomy as others
             if (taxonomy.contains(key) || key.equals("Total")) {
@@ -335,9 +336,11 @@
     }
 
     public void updateResultsNestedSuffix(String key, String stringValue) {
+        MultipleHMKeys mkStringValue = new MultipleHMKeys(stringValue);
+
         if (resultNestedSuffix.containsKey(key)) {
             // if not in map
-            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
+            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
 
             // else
             if (r != null) {
@@ -345,7 +348,7 @@
             }
         } else {
             resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
-            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
+            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
 
             if (r != null) {
                 resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
@@ -354,9 +357,11 @@
     }
 
     public void updateResultsNestedPrefix(String key, String stringValue) {
+        MultipleHMKeys mkStringValue = new MultipleHMKeys(stringValue);
+
         if (resultNestedPrefix.containsKey(key)) {
             // if not in map
-            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
+            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
 
             // else
             if (r != null) {
@@ -364,7 +369,7 @@
             }
         } else {
             resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
-            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
+            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
 
             if (r != null) {
                 resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java
index 0755ede..fd133bf 100755
--- a/src/main/java/gui/OneWordAnalysisTab.java
+++ b/src/main/java/gui/OneWordAnalysisTab.java
@@ -82,6 +82,7 @@ public class OneWordAnalysisTab {
     private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
     private static final ObservableList<String> alsoVisualizeItemsLemma = FXCollections.observableArrayList("besedna vrsta", "oblikoskladenjska oznaka");
     private static final ObservableList<String> alsoVisualizeItemsDifferential = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka");
+    private static final ObservableList<String> alsoVisualizeItemsEmpty = FXCollections.observableArrayList();
 
     // TODO: pass observables for taxonomy based on header scan
     // after header scan
@@ -95,6 +96,37 @@ public class OneWordAnalysisTab {
         // calculateForCB
         calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
             calculateFor = CalculateFor.factory(newValue);
+            if(newValue.equals("lema")){
+                alsoVisualizeCCB.getItems().removeAll();
+                alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsLemma);
+                alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
+                    alsoVisualize = new ArrayList<>();
+                    ObservableList<String> checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems();
+                    alsoVisualize.addAll(checkedItems);
+                    logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ",")));
+                });
+                alsoVisualizeCCB.getCheckModel().clearChecks();
+            } else if(newValue.equals("različnica")){
+                alsoVisualizeCCB.getItems().removeAll();
+                alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsDifferential);
+                alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
+                    alsoVisualize = new ArrayList<>();
+                    ObservableList<String> checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems();
+                    alsoVisualize.addAll(checkedItems);
+                    logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ",")));
+                });
+                alsoVisualizeCCB.getCheckModel().clearChecks();
+            } else {
+                alsoVisualizeCCB.getItems().removeAll();
+                alsoVisualizeCCB.getItems().setAll(alsoVisualizeItemsEmpty);
+                alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
+                    alsoVisualize = new ArrayList<>();
+                    ObservableList<String> checkedItems = alsoVisualizeCCB.getCheckModel().getCheckedItems();
+                    alsoVisualize.addAll(checkedItems);
+                    logger.info(String.format("Selected also visualize items: %s", StringUtils.join(checkedItems, ",")));
+                });
+                alsoVisualizeCCB.getCheckModel().clearChecks();
+            }
             logger.info("calculateForCB:", calculateFor.toString());
         });
 
@@ -294,6 +326,7 @@ public class OneWordAnalysisTab {
         filter.setIsCvv(false);
         filter.setSolarFilters(solarFiltersMap);
         filter.setStringLength(1);
+        filter.setMultipleKeys(alsoVisualize);
 
         String message = Validation.validateForStringLevel(filter);
         if (message == null) {
diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java
index 856018a..9341b23 100644
--- a/src/main/java/util/Export.java
+++ b/src/main/java/util/Export.java
@@ -10,6 +10,7 @@ import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.atomic.AtomicLong;
 
 import data.Filter;
+import data.MultipleHMKeys;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import org.apache.commons.lang3.tuple.Pair;
@@ -20,22 +21,22 @@ import data.Enums.WordLevelType;
 
 @SuppressWarnings("unchecked")
 public class Export {
-    public static void SetToJSON(Set<Pair<String, Map<String, Long>>> set) {
+    public static void SetToJSON(Set<Pair<String, Map<MultipleHMKeys, Long>>> set) {
         JSONArray wrapper = new JSONArray();
 
-        for (Pair<String, Map<String, Long>> p : set) {
+        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
             JSONArray data_wrapper = new JSONArray();
             JSONObject metric = new JSONObject();
 
             String title = p.getLeft();
-            Map<String, Long> map = p.getRight();
+            Map<MultipleHMKeys, Long> map = p.getRight();
 
             if (map.isEmpty())
                 continue;
 
             long total = Util.mapSumFrequencies(map);
 
-            for (Map.Entry<String, Long> e : map.entrySet()) {
+            for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
                 JSONObject data_entry = new JSONObject();
                 data_entry.put("word", e.getKey());
                 data_entry.put("frequency", e.getValue());
@@ -56,8 +57,8 @@ public class Export {
         }
     }
 
-    public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
-                                  Map<String, Map<String, AtomicLong>> taxonomyResults) {
+    public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
+                                  Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResults) {
         //Delimiter used in CSV file
         String NEW_LINE_SEPARATOR = "\n";
         List FILE_HEADER_AL = new ArrayList();
@@ -65,8 +66,8 @@ public class Export {
 
         //Count frequencies
         long num_frequencies = 0;
-        for (Pair<String, Map<String, Long>> p : set) {
-            Map<String, Long> map = p.getRight();
+        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
+            Map<MultipleHMKeys, Long> map = p.getRight();
             if (map.isEmpty())
                 continue;
             num_frequencies = Util.mapSumFrequencies(map);
@@ -88,21 +89,48 @@
         if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) {
             headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
             FILE_HEADER_AL.add("Različnica");
-            FILE_HEADER_AL.add("Skupna absolutna pogostost");
-            FILE_HEADER_AL.add("Delež glede na vse različnice");
         } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) {
             headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies));
             FILE_HEADER_AL.add("Lema");
-            FILE_HEADER_AL.add("Skupna absolutna pogostost");
-            FILE_HEADER_AL.add("Delež glede na vse leme");
         } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) {
             headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies));
             FILE_HEADER_AL.add("Oblikoskladenjska oznaka");
-            FILE_HEADER_AL.add("Skupna absolutna pogostost");
-            FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake");
         } else {
             headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
             FILE_HEADER_AL.add("Lema");
+        }
+
+
+        for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
+            for (MultipleHMKeys key : value.keySet()){
+                if(!key.getLemma().equals("")){
+                    FILE_HEADER_AL.add("Lema");
+                }
+                if(!key.getWordType().equals("")){
+                    FILE_HEADER_AL.add("Besedna vrsta");
+                }
+                if(!key.getMsd().equals("")){
+                    FILE_HEADER_AL.add("Oblikoskladenjska oznaka");
+                }
+                break;
+            }
+
+            break;
+        }
+
+
+
+
+        if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) {
+            FILE_HEADER_AL.add("Skupna absolutna pogostost");
+            FILE_HEADER_AL.add("Delež glede na vse različnice");
+        } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) {
+            FILE_HEADER_AL.add("Skupna absolutna pogostost");
+            FILE_HEADER_AL.add("Delež glede na vse leme");
+        } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) {
+            FILE_HEADER_AL.add("Skupna absolutna pogostost");
+            FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake");
+        } else {
             FILE_HEADER_AL.add("Skupna absolutna pogostost");
             FILE_HEADER_AL.add("Delež glede na vse leme");
         }
@@ -122,14 +150,14 @@
 
         String fileName = "";
 
-        for (Pair<String, Map<String, Long>> p : set) {
+        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
             String title = p.getLeft();
 
             fileName = title.replace(": ", "-");
             fileName = fileName.replace(" ", "_").concat(".csv");
             fileName = resultsPath.toString().concat(File.separator).concat(fileName);
 
-            Map<String, Long> map = p.getRight();
+            Map<MultipleHMKeys, Long> map = p.getRight();
 
             if (map.isEmpty())
                 continue;
@@ -155,9 +183,18 @@
             //Create CSV file header
             csvFilePrinter.printRecord(FILE_HEADER);
 
-            for (Map.Entry<String, Long> e : map.entrySet()) {
+            for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
                 List<String> dataEntry = new ArrayList<>();
-                dataEntry.add(e.getKey());
+                dataEntry.add(e.getKey().getKey());
+                if(!e.getKey().getLemma().equals("")){
+                    dataEntry.add(e.getKey().getLemma());
+                }
+                if(!e.getKey().getWordType().equals("")){
+                    dataEntry.add(e.getKey().getWordType());
+                }
+                if(!e.getKey().getMsd().equals("")){
+                    dataEntry.add(e.getKey().getMsd());
+                }
                 dataEntry.add(e.getValue().toString());
                 dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
                 dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies));
diff --git a/src/main/java/util/Util.java b/src/main/java/util/Util.java
index fa35d02..ff6f24a 100644
--- a/src/main/java/util/Util.java
+++ b/src/main/java/util/Util.java
@@ -9,6 +9,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Stream;
 
+import data.MultipleHMKeys;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
@@ -85,11 +86,11 @@ public class Util {
     * Generic map converter -> since AtomicLongs aren't as comparable.
     * Converts ConcurrentHashMap to HashMap
     */
-    public static Map<String, Long> atomicInt2StringAndInt(Map<String, AtomicLong> map) {
-        Map<String, Long> m = new HashMap();
+    public static Map<MultipleHMKeys, Long> atomicInt2StringAndInt(Map<MultipleHMKeys, AtomicLong> map) {
+        Map<MultipleHMKeys, Long> m = new HashMap();
 
         for (Map.Entry e : map.entrySet()) {
-            m.put(e.getKey().toString(), ((AtomicLong) e.getValue()).longValue());
+            m.put(e.getKey(), ((AtomicLong) e.getValue()).longValue());
         }
 
         return m;
@@ -148,7 +149,7 @@
         System.out.println();
     }
 
-    static long mapSumFrequencies(Map<String, Long> map) {
+    static long mapSumFrequencies(Map<MultipleHMKeys, Long> map) {
         long sum = 0;
 
         for (long value : map.values()) {