Added filter parameters to the CSV output; created column names for MSDs; partly fixed the number-of-words parameter

2018-11-13 13:57:49 +01:00
parent a4df732678
commit cbfe3e6025
9 changed files with 502 additions and 219 deletions
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -36,6 +36,8 @@ public class Ngrams {
 		ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

 		for (Sentence s : corpus) {
+//            stats.updateUniGramOccurrences(s.getWords().size());
+
 			// skip sentences shorter than specified ngram length
 			if (s.getWords().size() < stats.getFilter().getNgramValue()) {
 				continue;
@@ -176,6 +178,8 @@ public class Ngrams {



+
+
 				// UPDATE TAXONOMY HERE!!!
                stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
 //				stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -261,16 +265,34 @@ public class Ngrams {
 	 */
 	private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
 		for (Sentence s : corpus) {
+//            stats.updateUniGramOccurrences(s.getWords().size());
 			for (Word w : s.getWords()) {
 				List<String> taxonomy = s.getTaxonomy();
+
+////				List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
+				List<Word> ngramCandidate = new ArrayList<>();
+				ngramCandidate.add(w);
+//
+//				// if msd regex is set and this candidate doesn't pass it, skip this iteration
+//				if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
+//					continue;
+//				}
+
 				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());

 				// skip this iteration if:
 				// - word doesn't contain a proper version (missing lemma for example)
 				// - msd regex is given but this word's msd doesn't match it, skip this iteration
 				// - given substring length is larger than the word length
+
+//                boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
+//                boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
+//                String t3 = stats.getFilter().getMsd().get(0).pattern();
+//                ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
+//                boolean t5 = word.length() < stats.getFilter().getStringLength();
+
 				if (ValidationUtil.isEmpty(word)
-						|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
+						|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
 						|| word.length() < stats.getFilter().getStringLength()) {
 					continue;
 				}
@@ -330,6 +352,8 @@ public class Ngrams {
 		for (Sentence s : corpus) {
 			List<Word> sentence = s.getWords();

+//			stats.updateUniGramOccurrences(s.getWords().size());
+
 			if (sentence == null){
 				continue;
 			}