Added some optimizations and new taxonomy names

This commit is contained in:
2018-08-31 07:57:58 +02:00
parent 1c00f1a283
commit 426a9ccc46
21 changed files with 1345 additions and 1182 deletions

View File

@@ -43,12 +43,12 @@ public class Ngrams {
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
// if msd regex is set and this candidate doesn't pass it, skip this iteration
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
continue;
}
// generate proper MultipleHMKeys depending on filter data
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
// if last letter is ',' erase it
@@ -67,14 +67,14 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -82,9 +82,9 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -93,10 +93,10 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -137,7 +137,7 @@ public class Ngrams {
/**
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
@@ -145,7 +145,7 @@ public class Ngrams {
for (int i = 0; i < regex.size(); i++) {
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
return false;
}
}
@@ -153,33 +153,33 @@ public class Ngrams {
return true;
}
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
switch (calculateFor) {
case LEMMA:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getLemma)
.map(w -> w.getLemma(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.map(w -> w.getWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.map(w -> w.getMsd(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
.collect(Collectors.toList()));
// candidate.addAll(ngramCandidate
// .stream()
@@ -190,7 +190,7 @@ public class Ngrams {
case NORMALIZED_WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getNormalizedWord)
.map(w -> w.getNormalizedWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
}
@@ -208,14 +208,14 @@ public class Ngrams {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
List<String> taxonomy = s.getTaxonomy();
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
// skip this iteration if:
// - word doesn't contain a proper version (missing lemma for example)
// - msd regex is given but this word's msd doesn't match it, skip this iteration
// - given substring length is larger than the word length
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
@@ -331,7 +331,7 @@ public class Ngrams {
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
// String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
// key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// stats.updateTaxonomyResults(new MultipleHMKeys1(key),
@@ -340,7 +340,7 @@ public class Ngrams {
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
// if last letter is ',' erase it
@@ -359,14 +359,14 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -374,9 +374,9 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -385,10 +385,10 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;