Added filter parameters to the CSV output, created column names for MSDs, and partly fixed the number-of-words parameter

This commit is contained in:
2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions

View File

@@ -535,6 +535,7 @@ public class XML_processing {
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -635,13 +636,19 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
stats.updateUniGramOccurrences(sentence.size());
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// taxonomyMatch = true;
// and start a new one
sentence = new ArrayList<>();
@@ -666,7 +673,9 @@ public class XML_processing {
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
return false;
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}

View File

@@ -36,6 +36,8 @@ public class Ngrams {
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
for (Sentence s : corpus) {
// stats.updateUniGramOccurrences(s.getWords().size());
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
continue;
@@ -176,6 +178,8 @@ public class Ngrams {
// UPDATE TAXONOMY HERE!!!
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -261,16 +265,34 @@ public class Ngrams {
*/
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
// stats.updateUniGramOccurrences(s.getWords().size());
for (Word w : s.getWords()) {
List<String> taxonomy = s.getTaxonomy();
//// List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
List<Word> ngramCandidate = new ArrayList<>();
ngramCandidate.add(w);
//
// // if msd regex is set and this candidate doesn't pass it, skip this iteration
// if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
// continue;
// }
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
// skip this iteration if any of the following holds:
// - the word has no usable form for the selected calculation (e.g. a missing lemma)
// - an msd regex is given and this word's msd does not match it
// - the word is shorter than the requested substring length
// boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
// boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
// String t3 = stats.getFilter().getMsd().get(0).pattern();
// ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
// boolean t5 = word.length() < stats.getFilter().getStringLength();
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
@@ -330,6 +352,8 @@ public class Ngrams {
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
// stats.updateUniGramOccurrences(s.getWords().size());
if (sentence == null){
continue;
}