Added filter parameters to the CSV output, created column names for MSDs, and [partly] fixed the number-of-words parameter
This commit is contained in:
@@ -535,6 +535,7 @@ public class XML_processing {
|
||||
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
boolean inPunctuation = false;
|
||||
boolean taxonomyMatch = true;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
@@ -635,13 +636,19 @@ public class XML_processing {
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// count all UniGramOccurrences in sentence for statistics
|
||||
stats.updateUniGramOccurrences(sentence.size());
|
||||
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
// taxonomyMatch = true;
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
@@ -666,7 +673,9 @@ public class XML_processing {
|
||||
|
||||
if (currentFiletaxonomy.isEmpty()) {
|
||||
// taxonomies don't match so stop
|
||||
return false;
|
||||
// return false;
|
||||
taxonomyMatch = false;
|
||||
// System.out.println("TEST");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user