Added some performance measures

This commit is contained in:
2018-08-09 09:21:06 +02:00
parent 179f09c4bd
commit 9b5fa4616b
24 changed files with 734 additions and 379 deletions

View File

@@ -3,9 +3,11 @@ package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
@@ -28,6 +30,9 @@ public class Ngrams {
}
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
for (Sentence s : corpus) {
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
@@ -46,29 +51,62 @@ public class Ngrams {
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
// if last letter is ',' erase it
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
String lemma = "";
String wordType = "";
String msd = "";
for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
if(otherKey.toString().equals("lema")){
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
lemma = wordToString(ngramCandidate, otherKey);
} else if(otherKey.toString().equals("besedna vrsta")){
wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
msd = wordToString(ngramCandidate, otherKey);
}
// if (key.equals("")){
// String test = key;
// }
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
MultipleHMKeys multipleKeys;
// create MultipleHMKeys for different amount of other keys
switch (otherKeys.size()) {
case 0:
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
break;
case 2:
multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)));
break;
case 3:
multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)));
break;
case 4:
multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)),
wordToString(ngramCandidate, otherKeys.get(3)));
break;
default:
multipleKeys = null;
}
// String lemma = "";
// String wordType = "";
// String msd = "";
// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
// if(otherKey.toString().equals("lema")){
// lemma = wordToString(ngramCandidate, otherKey);
// } else if(otherKey.toString().equals("besedna vrsta")){
// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
// msd = wordToString(ngramCandidate, otherKey);
// }
// }
//
// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
// UPDATE TAXONOMY HERE!!!
stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
}
}
@@ -102,26 +140,31 @@ public class Ngrams {
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.collect(Collectors.toList()));
break;
// candidate.addAll(ngramCandidate
// .stream()
// .map(w -> Character.toString(w.getMsd().charAt(0)))
// .collect(Collectors.toList()));
// .substring(0, 1)
return StringUtils.join(candidate, " ");
}
return StringUtils.join(candidate, " ");
@@ -136,7 +179,7 @@ public class Ngrams {
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
List<String> taxonomy = w.getTaxonomy();
List<String> taxonomy = s.getTaxonomy();
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
// skip this iteration if:
@@ -152,7 +195,7 @@ public class Ngrams {
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
// TODO: locila?
MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
stats.updateTaxonomyResults(multipleKeys, taxonomy);
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -183,8 +226,7 @@ public class Ngrams {
String punctuation = ",";
return new Word(sentence.get(i).getWord() + punctuation,
sentence.get(i).getLemma() + punctuation,
sentence.get(i).getMsd() + punctuation,
sentence.get(i).getTaxonomy());
sentence.get(i).getMsd() + punctuation);
}
}
return sentence.get(i);
@@ -204,6 +246,10 @@ public class Ngrams {
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
if (sentence == null){
continue;
}
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
if (ngram == 2 && j < sentence.size()) {
@@ -260,7 +306,7 @@ public class Ngrams {
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""),
stats.updateTaxonomyResults(new MultipleHMKeys1(key),
stats.getCorpus().getTaxonomy());
}
}