Added the rest of the collocability measures
This commit is contained in:
parent
abc15360d3
commit
a4df732678
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -11,6 +11,7 @@ release.properties
|
|||
dependency-reduced-pom.xml
|
||||
buildNumber.properties
|
||||
.mvn/timing.properties
|
||||
corpus-analyzer.iml
|
||||
|
||||
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
|
||||
!/.mvn/wrapper/maven-wrapper.jar
|
||||
|
|
|
@ -52,8 +52,51 @@ public class Ngrams {
|
|||
// generate proper MultipleHMKeys depending on filter data
|
||||
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
|
||||
|
||||
if(key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){
|
||||
if(stats.getFilter().getPrefixLength() != null && stats.getFilter().getSuffixLength() != null &&
|
||||
key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){
|
||||
continue;
|
||||
}
|
||||
|
||||
if(stats.getFilter().getPrefixList() != null && stats.getFilter().getSuffixList() != null &&
|
||||
(stats.getFilter().getPrefixList().size() > 0 || stats.getFilter().getSuffixList().size() > 0)){
|
||||
|
||||
String correctPrefix = "";
|
||||
// go over all prefixes in PrefixList and look for them in words
|
||||
for(String pf : stats.getFilter().getPrefixList()){
|
||||
if (pf.length() <= key.length() && pf.equals(key.substring(0, pf.length()))){
|
||||
correctPrefix = pf;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
String correctSuffix = "";
|
||||
// go over all suffixes in SuffixList and look for them in words
|
||||
for(String sf : stats.getFilter().getSuffixList()){
|
||||
if (sf.length() <= key.length() && sf.equals(key.substring(key.length() - sf.length()))){
|
||||
correctSuffix = sf;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// boolean a = (correctPrefix.equals("") && !correctSuffix.equals(""));
|
||||
// boolean b = (!correctPrefix.equals("") && correctSuffix.equals(""));
|
||||
// boolean c = (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length());
|
||||
// boolean d = !((correctPrefix.equals("") && !correctSuffix.equals("")) ||
|
||||
// (!correctPrefix.equals("") && correctSuffix.equals("")) ||
|
||||
// (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()));
|
||||
|
||||
if(!((stats.getFilter().getPrefixList().size() == 0 && !correctSuffix.equals("")) ||
|
||||
(!correctPrefix.equals("") && stats.getFilter().getSuffixList().size() == 0) ||
|
||||
(!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){
|
||||
continue;
|
||||
}
|
||||
|
||||
// if(!((correctPrefix.equals("") && !correctSuffix.equals("")) ||
|
||||
// (!correctPrefix.equals("") && correctSuffix.equals("")) ||
|
||||
// (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){
|
||||
// continue;
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
// if last letter is ',' erase it
|
||||
|
|
|
@ -1,7 +1,12 @@
|
|||
package data;
|
||||
|
||||
public enum Collocability {
|
||||
DICE("Dice");
|
||||
DICE("Dice"),
|
||||
TSCORE("t-score"),
|
||||
MI("MI"),
|
||||
MI3("MI3"),
|
||||
LOGDICE("logDice"),
|
||||
SIMPLELL("simple LL");
|
||||
|
||||
private final String name;
|
||||
|
||||
|
@ -17,6 +22,16 @@ public enum Collocability {
|
|||
if (cf != null) {
|
||||
if (DICE.toString().equals(cf)) {
|
||||
return DICE;
|
||||
} else if (TSCORE.toString().equals(cf)) {
|
||||
return TSCORE;
|
||||
} else if (MI.toString().equals(cf)) {
|
||||
return MI;
|
||||
} else if (MI3.toString().equals(cf)) {
|
||||
return MI3;
|
||||
} else if (LOGDICE.toString().equals(cf)) {
|
||||
return LOGDICE;
|
||||
} else if (SIMPLELL.toString().equals(cf)) {
|
||||
return SIMPLELL;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -25,7 +40,17 @@ public enum Collocability {
|
|||
public String toMetadataString() {
|
||||
switch(this){
|
||||
case DICE:
|
||||
return "Kolokabilnost - Dice:";
|
||||
return "Dice";
|
||||
case TSCORE:
|
||||
return "t-score";
|
||||
case MI:
|
||||
return "MI";
|
||||
case MI3:
|
||||
return "MI3";
|
||||
case LOGDICE:
|
||||
return "logDice";
|
||||
case SIMPLELL:
|
||||
return "simple LL";
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
|
@ -34,7 +59,17 @@ public enum Collocability {
|
|||
public String toHeaderString() {
|
||||
switch(this){
|
||||
case DICE:
|
||||
return "Kolokabilnost - Dice";
|
||||
return "Dice";
|
||||
case TSCORE:
|
||||
return "t-score";
|
||||
case MI:
|
||||
return "MI";
|
||||
case MI3:
|
||||
return "MI3";
|
||||
case LOGDICE:
|
||||
return "logDice";
|
||||
case SIMPLELL:
|
||||
return "simple LL";
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -428,9 +428,14 @@ public class StatisticsNew {
|
|||
Integer ngramLevel = filter.getNgramValue();
|
||||
if (ngramLevel == 0)
|
||||
info.put("Analiza", "Črke");
|
||||
else if (ngramLevel == 1)
|
||||
else if (ngramLevel == 1) {
|
||||
// if suffixes or prefixes are not null print word parts
|
||||
if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
|
||||
info.put("Analiza", "Besedni deli");
|
||||
} else {
|
||||
info.put("Analiza", "Besede");
|
||||
else
|
||||
}
|
||||
} else
|
||||
info.put("Analiza", filter.getAl().toString());
|
||||
} else {
|
||||
info.put("Analiza", filter.getAl().toString());
|
||||
|
@ -492,22 +497,68 @@ public class StatisticsNew {
|
|||
public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) {
|
||||
Map<String, Map<MultipleHMKeys, AtomicLong>> oneWordTaxonomyResult = oneWordStatistics.getTaxonomyResult();
|
||||
|
||||
Map<MultipleHMKeys, Double> collocabilityMap = new ConcurrentHashMap<>();
|
||||
Map<Collocability, Map<MultipleHMKeys, Double>> collocabilityMap = new ConcurrentHashMap<>();
|
||||
|
||||
for(Collocability c : filter.getCollocability()){
|
||||
collocabilityMap.put(c, new ConcurrentHashMap<>());
|
||||
}
|
||||
|
||||
// count number of all words
|
||||
long N = 0;
|
||||
for(AtomicLong a : oneWordTaxonomyResult.get("Total").values()){
|
||||
N += a.longValue();
|
||||
}
|
||||
|
||||
for(MultipleHMKeys hmKey : taxonomyResult.get("Total").keySet()) {
|
||||
// String[] splitedString = hmKey.getK1().split("\\s+");
|
||||
|
||||
long sum_fwi =0L;
|
||||
long mul_fwi =1L;
|
||||
|
||||
for(MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()){
|
||||
System.out.println(smallHmKey.getK1());
|
||||
// System.out.println(smallHmKey.getK1());
|
||||
sum_fwi += oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue();
|
||||
mul_fwi *= oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue();
|
||||
}
|
||||
// String t = hmKey.getK1();
|
||||
// if(hmKey.getK1().equals("v Slovenija")){
|
||||
// System.out.println("TEST");
|
||||
//
|
||||
// }
|
||||
double O = (double)taxonomyResult.get("Total").get(hmKey).longValue();
|
||||
double n = (double)filter.getNgramValue();
|
||||
double E = (double)mul_fwi / Math.pow(N, n - 1);
|
||||
if (collocabilityMap.keySet().contains(Collocability.DICE)){
|
||||
double dice_value = n * O / sum_fwi;
|
||||
collocabilityMap.get(Collocability.DICE).put(hmKey, dice_value);
|
||||
}
|
||||
if (collocabilityMap.keySet().contains(Collocability.TSCORE)){
|
||||
double t_score = (O - E) / Math.sqrt(O);
|
||||
collocabilityMap.get(Collocability.TSCORE).put(hmKey, t_score);
|
||||
}
|
||||
if (collocabilityMap.keySet().contains(Collocability.MI)){
|
||||
double MI = Math.log(O / E) / Math.log(2);
|
||||
collocabilityMap.get(Collocability.MI).put(hmKey, MI);
|
||||
}
|
||||
if (collocabilityMap.keySet().contains(Collocability.MI3)){
|
||||
double MI3 = Math.log(Math.pow(O, 3.0) / E) / Math.log(2);
|
||||
collocabilityMap.get(Collocability.MI3).put(hmKey, MI3);
|
||||
}
|
||||
if (collocabilityMap.keySet().contains(Collocability.LOGDICE)){
|
||||
double dice_value = n * O / sum_fwi;
|
||||
double log_dice = 14 + Math.log(dice_value) / Math.log(2);
|
||||
collocabilityMap.get(Collocability.LOGDICE).put(hmKey, log_dice);
|
||||
}
|
||||
if (collocabilityMap.keySet().contains(Collocability.SIMPLELL)){
|
||||
double simple_ll = 2 * (O * Math.log10(O / E) - (O - E));
|
||||
collocabilityMap.get(Collocability.SIMPLELL).put(hmKey, simple_ll);
|
||||
}
|
||||
double dice_value = (double) filter.getNgramValue() * (double)taxonomyResult.get("Total").get(hmKey).longValue() / sum_fwi;
|
||||
collocabilityMap.put(hmKey, dice_value);
|
||||
}
|
||||
|
||||
collocability.put(filter.getCollocability().get(0), collocabilityMap);
|
||||
for(Collocability c : collocabilityMap.keySet()){
|
||||
collocability.put(c, collocabilityMap.get(c));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Map<Collocability, Map<MultipleHMKeys, Double>> getCollocability(){
|
||||
|
|
|
@ -133,7 +133,7 @@ public class StringAnalysisTabNew2 {
|
|||
private static final ObservableList<String> alsoVisualizeItemsWordGos = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "normalizirana različnica");
|
||||
private static final ObservableList<String> alsoVisualizeItemsNormalizedWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> alsoVisualizeItemsMsd = FXCollections.observableArrayList("besedna vrsta");
|
||||
private static final ObservableList<String> COLLOCABILITY_ITEMS = FXCollections.observableArrayList("Dice");
|
||||
private static final ObservableList<String> COLLOCABILITY_ITEMS = FXCollections.observableArrayList("Dice", "t-score", "MI", "MI3", "logDice", "simple LL");
|
||||
private static final ObservableList<String> alsoVisualizeItemsEmpty = FXCollections.observableArrayList();
|
||||
|
||||
|
||||
|
|
|
@ -108,10 +108,10 @@ public class WordLevelTab {
|
|||
private boolean useDb;
|
||||
private HostServices hostService;
|
||||
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka", "normalizirana različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_GOS = FXCollections.observableArrayList("lema", "različnica", "normalizirana različnica");
|
||||
private static final ObservableList<String> alsoVisualizeItemsLemma = FXCollections.observableArrayList("besedna vrsta", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> alsoVisualizeItemsWord = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> alsoVisualizeItemsWordGos = FXCollections.observableArrayList("lema", "besedna vrsta", "oblikoskladenjska oznaka", "normalizirana različnica");
|
||||
|
|
|
@ -94,6 +94,17 @@ public class Export {
|
|||
FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
|
||||
if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
|
||||
FILE_HEADER_AL.add("Lema male črke");
|
||||
|
||||
if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
|
||||
if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) {
|
||||
FILE_HEADER_AL.add("Predpona");
|
||||
}
|
||||
FILE_HEADER_AL.add("Preostali del besede");
|
||||
if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) {
|
||||
FILE_HEADER_AL.add("Pripona");
|
||||
}
|
||||
}
|
||||
|
||||
headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies));
|
||||
|
||||
for (CalculateFor otherKey : filter.getMultipleKeys()) {
|
||||
|
@ -109,7 +120,9 @@ public class Export {
|
|||
FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");
|
||||
|
||||
if (filter.getCollocability().size() > 0){
|
||||
FILE_HEADER_AL.add(filter.getCollocability().get(0).toHeaderString());
|
||||
for (Collocability c : filter.getCollocability()) {
|
||||
FILE_HEADER_AL.add(c.toHeaderString());
|
||||
}
|
||||
}
|
||||
|
||||
for (String key : taxonomyResults.keySet()) {
|
||||
|
@ -167,6 +180,45 @@ public class Export {
|
|||
dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
|
||||
}
|
||||
|
||||
if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
|
||||
if(filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) {
|
||||
if (filter.getPrefixLength() > 0) {
|
||||
dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength()));
|
||||
}
|
||||
dataEntry.add(((String) dataEntry.get(0)).substring(filter.getPrefixLength(), ((String) dataEntry.get(0)).length() - filter.getSuffixLength()));
|
||||
if (filter.getSuffixLength() > 0) {
|
||||
dataEntry.add(((String) dataEntry.get(0)).substring(((String) dataEntry.get(0)).length() - filter.getSuffixLength()));
|
||||
}
|
||||
} else {
|
||||
String key = (String) dataEntry.get(0);
|
||||
// real prefix
|
||||
String rpf = "";
|
||||
for(String pf : filter.getPrefixList()){
|
||||
if (pf.equals(key.substring(0, pf.length()))){
|
||||
rpf = pf;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// real suffix
|
||||
String rsf = "";
|
||||
for(String sf : filter.getSuffixList()){
|
||||
if (sf.equals(key.substring(key.length() - sf.length()))){
|
||||
rsf = sf;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (filter.getPrefixList().size() > 0) {
|
||||
dataEntry.add(rpf);
|
||||
}
|
||||
dataEntry.add(key.substring(rpf.length(), key.length() - rsf.length()));
|
||||
if (filter.getSuffixList().size() > 0) {
|
||||
dataEntry.add(rsf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
for (CalculateFor otherKey : filter.getMultipleKeys()){
|
||||
switch(i){
|
||||
|
@ -207,7 +259,9 @@ public class Export {
|
|||
}
|
||||
|
||||
if (filter.getCollocability().size() > 0){
|
||||
dataEntry.add(String.format("%.4f", statistics.getCollocability().get(filter.getCollocability().get(0)).get(e.getKey())));
|
||||
for (Collocability c : filter.getCollocability()) {
|
||||
dataEntry.add(statistics.getCollocability().get(c).get(e.getKey()));
|
||||
}
|
||||
}
|
||||
|
||||
// Write msd separated per letters at the end of each line in csv
|
||||
|
@ -419,6 +473,5 @@ public class Export {
|
|||
List values = new ArrayList();
|
||||
csvFilePrinter.printRecord(values);
|
||||
csvFilePrinter.printRecord(values);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user