Added filter parameters to CSV + created names of columns for MSDs + [partly] fixed number of words parameter

This commit is contained in:
2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions

View File

@@ -36,6 +36,8 @@ public class Ngrams {
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
for (Sentence s : corpus) {
// stats.updateUniGramOccurrences(s.getWords().size());
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
continue;
@@ -176,6 +178,8 @@ public class Ngrams {
// UPDATE TAXONOMY HERE!!!
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -261,16 +265,34 @@ public class Ngrams {
*/
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
// stats.updateUniGramOccurrences(s.getWords().size());
for (Word w : s.getWords()) {
List<String> taxonomy = s.getTaxonomy();
//// List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
List<Word> ngramCandidate = new ArrayList<>();
ngramCandidate.add(w);
//
// // if msd regex is set and this candidate doesn't pass it, skip this iteration
// if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
// continue;
// }
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
// skip this iteration if:
// - word doesn't contain a proper version (missing lemma for example)
// - msd regex is given but this word's msd doesn't match it
// - given substring length is larger than the word length
// boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
// boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
// String t3 = stats.getFilter().getMsd().get(0).pattern();
// ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
// boolean t5 = word.length() < stats.getFilter().getStringLength();
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
@@ -330,6 +352,8 @@ public class Ngrams {
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
// stats.updateUniGramOccurrences(s.getWords().size());
if (sentence == null){
continue;
}