From a4df732678e3868fe64fe6ffb9c892234e319cf3 Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 8 Nov 2018 11:37:16 +0100 Subject: [PATCH] Added the rest of collocabilities --- .gitignore | 1 + src/main/java/alg/ngram/Ngrams.java | 45 ++++++++++++- src/main/java/data/Collocability.java | 43 +++++++++++-- src/main/java/data/StatisticsNew.java | 67 +++++++++++++++++--- src/main/java/gui/StringAnalysisTabNew2.java | 2 +- src/main/java/gui/WordLevelTab.java | 4 +- src/main/java/util/Export.java | 59 ++++++++++++++++- 7 files changed, 202 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index de9fa21..5c9ea6b 100755 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ release.properties dependency-reduced-pom.xml buildNumber.properties .mvn/timing.properties +corpus-analyzer.iml # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) !/.mvn/wrapper/maven-wrapper.jar diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index ffd32c6..19b160b 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -52,10 +52,53 @@ public class Ngrams { // generate proper MultipleHMKeys depending on filter data String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts()); - if(key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){ + if(stats.getFilter().getPrefixLength() != null && stats.getFilter().getSuffixLength() != null && + key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){ continue; } + if(stats.getFilter().getPrefixList() != null && stats.getFilter().getSuffixList() != null && + (stats.getFilter().getPrefixList().size() > 0 || stats.getFilter().getSuffixList().size() > 0)){ + + String correctPrefix = ""; + // go over all prefixes in PrefixList and look for them in words + for(String pf : stats.getFilter().getPrefixList()){ + if (pf.length() <= key.length() && pf.equals(key.substring(0, pf.length()))){ + correctPrefix = pf; + break; + } + } + + String correctSuffix = ""; + // go over all prefixes in SuffixList and look for them in words + for(String sf : stats.getFilter().getSuffixList()){ + if (sf.length() <= key.length() && sf.equals(key.substring(key.length() - sf.length()))){ + correctSuffix = sf; + break; + } + } + +// boolean a = (correctPrefix.equals("") && !correctSuffix.equals("")); +// boolean b = (!correctPrefix.equals("") && correctSuffix.equals("")); +// boolean c = (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()); +// boolean d = !((correctPrefix.equals("") && !correctSuffix.equals("")) || +// (!correctPrefix.equals("") && correctSuffix.equals("")) || +// (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length())); + + if(!((stats.getFilter().getPrefixList().size() == 0 && !correctSuffix.equals("")) || + (!correctPrefix.equals("") && stats.getFilter().getSuffixList().size() == 0) || + (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){ + continue; + } + +// if(!((correctPrefix.equals("") && !correctSuffix.equals("")) || +// (!correctPrefix.equals("") && correctSuffix.equals("")) || +// (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){ +// continue; +// } + + } + // if last letter is ',' erase it // if (key.equals("")){ diff --git a/src/main/java/data/Collocability.java b/src/main/java/data/Collocability.java index 9d7c21f..159f15c 100755 --- a/src/main/java/data/Collocability.java +++ b/src/main/java/data/Collocability.java @@ -1,7 +1,12 @@ package data; public enum Collocability { - DICE("Dice"); + DICE("Dice"), + TSCORE("t-score"), + MI("MI"), + MI3("MI3"), + LOGDICE("logDice"), + SIMPLELL("simple LL"); private final String name; @@ -17,7 +22,17 @@ public enum Collocability { if (cf != null) { if (DICE.toString().equals(cf)) { return DICE; - } + } else if (TSCORE.toString().equals(cf)) { + return TSCORE; + } else if (MI.toString().equals(cf)) { + return MI; + } else if (MI3.toString().equals(cf)) { + return MI3; + } else if (LOGDICE.toString().equals(cf)) { + return LOGDICE; + } else if (SIMPLELL.toString().equals(cf)) { + return SIMPLELL; + } } return null; } @@ -25,7 +40,17 @@ public enum Collocability { public String toMetadataString() { switch(this){ case DICE: - return "Kolokabilnost - Dice:"; + return "Dice"; + case TSCORE: + return "t-score"; + case MI: + return "MI"; + case MI3: + return "MI3"; + case LOGDICE: + return "logDice"; + case SIMPLELL: + return "simple LL"; default: return null; } @@ -34,7 +59,17 @@ public enum Collocability { public String toHeaderString() { switch(this){ case DICE: - return "Kolokabilnost - Dice"; + return "Dice"; + case TSCORE: + return "t-score"; + case MI: + return "MI"; + case MI3: + return "MI3"; + case LOGDICE: + return "logDice"; + case SIMPLELL: + return "simple LL"; default: return null; } diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index ada8244..eca6eb2 100755 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -428,9 +428,14 @@ public class StatisticsNew { Integer ngramLevel = filter.getNgramValue(); if (ngramLevel == 0) info.put("Analiza", "Črke"); - else if (ngramLevel == 1) - info.put("Analiza", "Besede"); - else + else if (ngramLevel == 1) { + // if suffixes or prefixes are not null print word parts + if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { + info.put("Analiza", "Besedni deli"); + } else { + info.put("Analiza", "Besede"); + } + } else info.put("Analiza", filter.getAl().toString()); } else { info.put("Analiza", filter.getAl().toString()); @@ -492,22 +497,68 @@ public class StatisticsNew { public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) { Map> oneWordTaxonomyResult = oneWordStatistics.getTaxonomyResult(); - Map collocabilityMap = new ConcurrentHashMap<>(); + Map> collocabilityMap = new ConcurrentHashMap<>(); + + for(Collocability c : filter.getCollocability()){ + collocabilityMap.put(c, new ConcurrentHashMap<>()); + } + + // count number of all words + long N = 0; + for(AtomicLong a : oneWordTaxonomyResult.get("Total").values()){ + N += a.longValue(); + } for(MultipleHMKeys hmKey : taxonomyResult.get("Total").keySet()) { // String[] splitedString = hmKey.getK1().split("\\s+"); long sum_fwi =0L; + long mul_fwi =1L; for(MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()){ - System.out.println(smallHmKey.getK1()); +// System.out.println(smallHmKey.getK1()); sum_fwi += oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue(); + mul_fwi *= oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue(); + } +// String t = hmKey.getK1(); +// if(hmKey.getK1().equals("v Slovenija")){ +// System.out.println("TEST"); +// +// } + double O = (double)taxonomyResult.get("Total").get(hmKey).longValue(); + double n = (double)filter.getNgramValue(); + double E = (double)mul_fwi / Math.pow(N, n - 1); + if (collocabilityMap.keySet().contains(Collocability.DICE)){ + double dice_value = n * O / sum_fwi; + collocabilityMap.get(Collocability.DICE).put(hmKey, dice_value); + } + if (collocabilityMap.keySet().contains(Collocability.TSCORE)){ + double t_score = (O - E) / Math.sqrt(O); + collocabilityMap.get(Collocability.TSCORE).put(hmKey, t_score); } - double dice_value = (double) filter.getNgramValue() * (double)taxonomyResult.get("Total").get(hmKey).longValue() / sum_fwi; - collocabilityMap.put(hmKey, dice_value); + if (collocabilityMap.keySet().contains(Collocability.MI)){ + double MI = Math.log(O / E) / Math.log(2); + collocabilityMap.get(Collocability.MI).put(hmKey, MI); + } + if (collocabilityMap.keySet().contains(Collocability.MI3)){ + double MI3 = Math.log(Math.pow(O, 3.0) / E) / Math.log(2); + collocabilityMap.get(Collocability.MI3).put(hmKey, MI3); + } + if (collocabilityMap.keySet().contains(Collocability.LOGDICE)){ + double dice_value = n * O / sum_fwi; + double log_dice = 14 + Math.log(dice_value) / Math.log(2); + collocabilityMap.get(Collocability.LOGDICE).put(hmKey, log_dice); + } + if (collocabilityMap.keySet().contains(Collocability.SIMPLELL)){ + double simple_ll = 2 * (O * Math.log10(O / E) - (O - E)); + collocabilityMap.get(Collocability.SIMPLELL).put(hmKey, simple_ll); + } + } + + for(Collocability c : collocabilityMap.keySet()){ + collocability.put(c, collocabilityMap.get(c)); } - collocability.put(filter.getCollocability().get(0), collocabilityMap); } public Map> getCollocability(){ diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index a183567..6c1eacd 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -133,7 +133,7 @@ public class StringAnalysisTabNew2 { private static final ObservableList alsoVisualizeItemsWordGos = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "normalizirana različnica"); private static final ObservableList alsoVisualizeItemsNormalizedWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka"); private static final ObservableList alsoVisualizeItemsMsd = FXCollections.observableArrayList("besedna vrsta"); - private static final ObservableList COLLOCABILITY_ITEMS = FXCollections.observableArrayList("Dice"); + private static final ObservableList COLLOCABILITY_ITEMS = FXCollections.observableArrayList("Dice", "t-score", "MI", "MI3", "logDice", "simple LL"); private static final ObservableList alsoVisualizeItemsEmpty = FXCollections.observableArrayList(); diff --git a/src/main/java/gui/WordLevelTab.java b/src/main/java/gui/WordLevelTab.java index 560b59d..ade959a 100755 --- a/src/main/java/gui/WordLevelTab.java +++ b/src/main/java/gui/WordLevelTab.java @@ -108,10 +108,10 @@ public class WordLevelTab { private boolean useDb; private HostServices hostService; - private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka"); + private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica"); private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica"); - private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka", "normalizirana različnica"); + private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "normalizirana različnica"); private static final ObservableList alsoVisualizeItemsLemma = FXCollections.observableArrayList("besedna vrsta", "oblikoskladenjska oznaka"); private static final ObservableList alsoVisualizeItemsWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka"); private static final ObservableList alsoVisualizeItemsWordGos = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "normalizirana različnica"); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 4e492e7..ee686f9 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -94,6 +94,17 @@ public class Export { FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString()); if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) FILE_HEADER_AL.add("Lema male črke"); + + if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) { + if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) { + FILE_HEADER_AL.add("Predpona"); + } + FILE_HEADER_AL.add("Preostali del besede"); + if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) { + FILE_HEADER_AL.add("Pripona"); + } + } + headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); for (CalculateFor otherKey : filter.getMultipleKeys()) { @@ -109,7 +120,9 @@ public class Export { FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)"); if (filter.getCollocability().size() > 0){ - FILE_HEADER_AL.add(filter.getCollocability().get(0).toHeaderString()); + for (Collocability c : filter.getCollocability()) { + FILE_HEADER_AL.add(c.toHeaderString()); + } } for (String key : taxonomyResults.keySet()) { @@ -167,6 +180,45 @@ public class Export { dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter)); } + if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { + if(filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) { + if (filter.getPrefixLength() > 0) { + dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength())); + } + dataEntry.add(((String) dataEntry.get(0)).substring(filter.getPrefixLength(), ((String) dataEntry.get(0)).length() - filter.getSuffixLength())); + if (filter.getSuffixLength() > 0) { + dataEntry.add(((String) dataEntry.get(0)).substring(((String) dataEntry.get(0)).length() - filter.getSuffixLength())); + } + } else { + String key = (String) dataEntry.get(0); + // real prefix + String rpf = ""; + for(String pf : filter.getPrefixList()){ + if (pf.equals(key.substring(0, pf.length()))){ + rpf = pf; + break; + } + } + + // real suffix + String rsf = ""; + for(String sf : filter.getSuffixList()){ + if (sf.equals(key.substring(key.length() - sf.length()))){ + rsf = sf; + break; + } + } + + if (filter.getPrefixList().size() > 0) { + dataEntry.add(rpf); + } + dataEntry.add(key.substring(rpf.length(), key.length() - rsf.length())); + if (filter.getSuffixList().size() > 0) { + dataEntry.add(rsf); + } + } + } + int i = 0; for (CalculateFor otherKey : filter.getMultipleKeys()){ switch(i){ @@ -207,7 +259,9 @@ public class Export { } if (filter.getCollocability().size() > 0){ - dataEntry.add(String.format("%.4f", statistics.getCollocability().get(filter.getCollocability().get(0)).get(e.getKey()))); + for (Collocability c : filter.getCollocability()) { + dataEntry.add(statistics.getCollocability().get(c).get(e.getKey())); + } } // Write msd separated per letters at the end of each line in csv @@ -419,6 +473,5 @@ public class Export { List values = new ArrayList(); csvFilePrinter.printRecord(values); csvFilePrinter.printRecord(values); - } }