Added filter parameters to CSV + created column names for MSDs + [partly] fixed the number-of-words parameter

2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -535,6 +535,7 @@ public class XML_processing {
 	public static boolean readXMLGigafida(String path, StatisticsNew stats) {
 		boolean inWord = false;
 		boolean inPunctuation = false;
+		boolean taxonomyMatch = true;
 		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
 		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
 		String lemma = "";
@@ -635,13 +636,19 @@ public class XML_processing {

 						// parser reached end of the current sentence
 						if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
+                            // count all UniGramOccurrences in sentence for statistics
+                            stats.updateUniGramOccurrences(sentence.size());
+
 							// add sentence to corpus if it passes filters
 							sentence = runFilters(sentence, stats.getFilter());

-							if (!ValidationUtil.isEmpty(sentence)) {
+
+
+							if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
 								corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
 							}

+//							taxonomyMatch = true;
 							// and start a new one
 							sentence = new ArrayList<>();

@@ -666,7 +673,9 @@ public class XML_processing {

 								if (currentFiletaxonomy.isEmpty()) {
 									// taxonomies don't match so stop
-									return false;
+//									return false;
+                                    taxonomyMatch = false;
+//									System.out.println("TEST");
 								}
 							}
 						}
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -36,6 +36,8 @@ public class Ngrams {
 		ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

 		for (Sentence s : corpus) {
+//            stats.updateUniGramOccurrences(s.getWords().size());
+
 			// skip sentences shorter than specified ngram length
 			if (s.getWords().size() < stats.getFilter().getNgramValue()) {
 				continue;
@@ -176,6 +178,8 @@ public class Ngrams {



+
+
 				// UPDATE TAXONOMY HERE!!!
                stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
 //				stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -261,16 +265,34 @@ public class Ngrams {
 	 */
 	private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
 		for (Sentence s : corpus) {
+//            stats.updateUniGramOccurrences(s.getWords().size());
 			for (Word w : s.getWords()) {
 				List<String> taxonomy = s.getTaxonomy();
+
+////				List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
+				List<Word> ngramCandidate = new ArrayList<>();
+				ngramCandidate.add(w);
+//
+//				// if msd regex is set and this candidate doesn't pass it, skip this iteration
+//				if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
+//					continue;
+//				}
+
 				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());

 				// skip this iteration if:
 				// - word doesn't contain a proper version (missing lemma for example)
 				// - msd regex is given but this word's msd doesn't match it, skip this iteration
 				// - given substring length is larger than the word length
+
+//                boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
+//                boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
+//                String t3 = stats.getFilter().getMsd().get(0).pattern();
+//                ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
+//                boolean t5 = word.length() < stats.getFilter().getStringLength();
+
 				if (ValidationUtil.isEmpty(word)
-						|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
+						|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
 						|| word.length() < stats.getFilter().getStringLength()) {
 					continue;
 				}
@@ -330,6 +352,8 @@ public class Ngrams {
 		for (Sentence s : corpus) {
 			List<Word> sentence = s.getWords();

+//			stats.updateUniGramOccurrences(s.getWords().size());
+
 			if (sentence == null){
 				continue;
 			}