Copied some functionality from OneWord to StringAnalysis and fixed it

This commit is contained in:
2018-08-22 09:11:14 +02:00
parent e140a9538b
commit a8d147de52
12 changed files with 289 additions and 89 deletions

View File

@@ -67,22 +67,43 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
if (stats.getFilter().getNotePunctuations())
k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)));
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
if (stats.getFilter().getNotePunctuations()) {
k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
}
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)));
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
if (stats.getFilter().getNotePunctuations()) {
k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4;
}
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)),
wordToString(ngramCandidate, otherKeys.get(3)));
String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
if (stats.getFilter().getNotePunctuations()) {
k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4;
k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5;
}
multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5);
break;
default:
multipleKeys = null;
@@ -265,7 +286,7 @@ public class Ngrams {
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(sentence.get(j));
validateAndCountSkipgramCandidate(currentLoop, stats);
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
@@ -274,7 +295,7 @@ public class Ngrams {
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(sentence.get(k));
validateAndCountSkipgramCandidate(currentLoop, stats);
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
@@ -284,7 +305,7 @@ public class Ngrams {
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
currentLoop.add(sentence.get(l));
validateAndCountSkipgramCandidate(currentLoop, stats);
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
@@ -295,7 +316,7 @@ public class Ngrams {
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
currentLoop.add(sentence.get(m));
validateAndCountSkipgramCandidate(currentLoop, stats);
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
}
}
}
@@ -308,13 +329,80 @@ public class Ngrams {
}
}
/**
 * Validates a skipgram candidate and, if it is accepted, counts it in the statistics.
 *
 * <p>A candidate is accepted when the filter has no MSD regex set, or when it has one and
 * the candidate passes it. For an accepted candidate the primary key is built with
 * {@code wordToString} for the filter's "calculate for" setting, and one further key part
 * is built for every entry returned by {@code getMultipleKeys()}. When the filter's
 * {@code getNotePunctuations()} flag is set, a single trailing comma is removed from every
 * key part. The combined key is then passed to {@code stats.updateTaxonomyResults}.
 *
 * @param skipgramCandidate words that make up the skipgram candidate
 * @param stats             statistics object that supplies the filter and receives the count
 * @param taxonomy          taxonomy passed through unchanged to {@code updateTaxonomyResults}
 */
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
    // count if no regex is set or if it is & candidate passes it
    if (stats.getFilter().hasMsd() && !passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
        return;
    }

    ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
    // The flag is the same for every key part, so look it up once.
    boolean notePunctuations = stats.getFilter().getNotePunctuations();

    String key = dropTrailingComma(
            wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), notePunctuations);

    MultipleHMKeys multipleKeys;
    // create MultipleHMKeys for different amount of other keys
    switch (otherKeys.size()) {
        case 0:
            multipleKeys = new MultipleHMKeys1(key);
            break;
        case 1:
            multipleKeys = new MultipleHMKeys2(key,
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(0)), notePunctuations));
            break;
        case 2:
            multipleKeys = new MultipleHMKeys3(key,
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(0)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(1)), notePunctuations));
            break;
        case 3:
            multipleKeys = new MultipleHMKeys4(key,
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(0)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(1)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(2)), notePunctuations));
            break;
        case 4:
            multipleKeys = new MultipleHMKeys5(key,
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(0)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(1)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(2)), notePunctuations),
                    dropTrailingComma(wordToString(skipgramCandidate, otherKeys.get(3)), notePunctuations));
            break;
        default:
            // More than four additional keys: a null key is handed on, as before.
            // NOTE(review): confirm that updateTaxonomyResults tolerates a null key.
            multipleKeys = null;
    }

    stats.updateTaxonomyResults(multipleKeys, taxonomy);
}

/**
 * Removes one trailing comma from a key part when punctuation noting is enabled.
 *
 * @param value            key part to clean up; may be empty
 * @param notePunctuations whether trailing commas should be removed at all
 * @return {@code value} without its last character if the flag is set and the value ends
 *         with {@code ','}; otherwise {@code value} unchanged
 */
private static String dropTrailingComma(String value, boolean notePunctuations) {
    // endsWith is false for the empty string, so no separate emptiness check is needed.
    if (notePunctuations && value.endsWith(",")) {
        return value.substring(0, value.length() - 1);
    }
    return value;
}
}
}