Added new ssj500k reading option. Fixed GOS taxonomy

2018-09-03 13:31:41 +02:00
parent 426a9ccc46
commit 1d9e9b7ed6
9 changed files with 280 additions and 40 deletions
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;

 import gui.ValidationUtil;

+import static alg.XML_processing.createWord;
+
 public class Ngrams {
 	public final static Logger logger = LogManager.getLogger(Ngrams.class);

@@ -138,16 +140,22 @@ public class Ngrams {
 	 * Checks whether an ngram candidate passes specified regex filter.
 	 */
 	private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
-		if (ngramCandidate.size() != regex.size()) {
-			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
-			return false;
-		}
+//		if (ngramCandidate.size() != regex.size()) {
+//			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
+//			return false;
+//		}

-		for (int i = 0; i < regex.size(); i++) {
+		int j = 0;
+		for (int i = 0; i < ngramCandidate.size(); i++) {
+		    String msd = ngramCandidate.get(i).getMsd(wordParts);
+		    if (msd.equals("*")){
+		        continue;
+            }
 			//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-			if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
+			if (!msd.matches(regex.get(j).pattern() + ".*")) {
 				return false;
 			}
+			j ++;
 		}

 		return true;
@@ -270,6 +278,7 @@ public class Ngrams {
 		ArrayList<Word> currentLoop;
 		int ngram = stats.getFilter().getNgramValue();
 		int skip = stats.getFilter().getSkipValue();
+		Word w = createWord("*", "*", "*", "*", stats.getFilter());

 		for (Sentence s : corpus) {
 			List<Word> sentence = s.getWords();
@@ -283,7 +292,8 @@ public class Ngrams {
 					if (ngram == 2 && j < sentence.size()) {
 						currentLoop = new ArrayList<>();
 //						currentLoop.add(sentence.get(i));
-						currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
+						currentLoop.add(sentence.get(i));
+						fillSkipgrams(currentLoop, i, j, w);
 						currentLoop.add(sentence.get(j));

 						validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -291,8 +301,10 @@ public class Ngrams {
 						for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
 							if (ngram == 3 && k < sentence.size()) {
 								currentLoop = new ArrayList<>();
-								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
+								currentLoop.add(sentence.get(i));
+								fillSkipgrams(currentLoop, i, j, w);
+								currentLoop.add(sentence.get(j));
+								fillSkipgrams(currentLoop, j, k, w);
 								currentLoop.add(sentence.get(k));

 								validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -300,9 +312,12 @@ public class Ngrams {
 								for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
 									if (ngram == 4 && l < sentence.size()) {
 										currentLoop = new ArrayList<>();
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
+										currentLoop.add(sentence.get(i));
+										fillSkipgrams(currentLoop, i, j, w);
+										currentLoop.add(sentence.get(j));
+										fillSkipgrams(currentLoop, j, k, w);
+										currentLoop.add(sentence.get(k));
+										fillSkipgrams(currentLoop, k, l, w);
 										currentLoop.add(sentence.get(l));

 										validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -310,10 +325,14 @@ public class Ngrams {
 										for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
 											if (ngram == 5 && m < sentence.size()) {
 												currentLoop = new ArrayList<>();
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
+												currentLoop.add(sentence.get(i));
+												fillSkipgrams(currentLoop, i, j, w);
+												currentLoop.add(sentence.get(j));
+												fillSkipgrams(currentLoop, j, k, w);
+												currentLoop.add(sentence.get(k));
+												fillSkipgrams(currentLoop, k, l, w);
+												currentLoop.add(sentence.get(l));
+												fillSkipgrams(currentLoop, l, m, w);
 												currentLoop.add(sentence.get(m));

 												validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -329,6 +348,12 @@ public class Ngrams {
 		}
 	}

+	private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){ // pads currentLoop with placeholder w for the positions skipped between i and j
+		for(int k = i + 1; k < j; k++){ // runs once per index strictly between i and j; not at all when j <= i + 1
+			currentLoop.add(w); // appends the same w reference each time (no copy is made)
+		}
+	}
+
 	private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
 		// count if no regex is set or if it is & candidate passes it
 		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {