Added filter parameters to the CSV output + created column names for MSDs + partly fixed the number-of-words parameter

2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -535,6 +535,7 @@ public class XML_processing {
 	public static boolean readXMLGigafida(String path, StatisticsNew stats) {
 		boolean inWord = false;
 		boolean inPunctuation = false;
+		boolean taxonomyMatch = true;
 		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
 		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
 		String lemma = "";
@@ -635,13 +636,19 @@ public class XML_processing {

 						// parser reached end of the current sentence
 						if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
+                            // count all UniGramOccurrences in sentence for statistics
+                            stats.updateUniGramOccurrences(sentence.size());
+
 							// add sentence to corpus if it passes filters
 							sentence = runFilters(sentence, stats.getFilter());

-							if (!ValidationUtil.isEmpty(sentence)) {
+
+
+							if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
 								corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
 							}

+//							taxonomyMatch = true;
 							// and start a new one
 							sentence = new ArrayList<>();

@@ -666,7 +673,9 @@ public class XML_processing {

 								if (currentFiletaxonomy.isEmpty()) {
 									// taxonomies don't match so stop
-									return false;
+//									return false;
+                                    taxonomyMatch = false;
+//									System.out.println("TEST");
 								}
 							}
 						}