Added filter parameters to CSV + created column names for MSDs + [partly] fixed the number-of-words parameter

This commit is contained in:
2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions

View File

@@ -535,6 +535,7 @@ public class XML_processing {
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -635,13 +636,19 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
stats.updateUniGramOccurrences(sentence.size());
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// taxonomyMatch = true;
// and start a new one
sentence = new ArrayList<>();
@@ -666,7 +673,9 @@ public class XML_processing {
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
return false;
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}