Added some performance measures

2018-08-09 09:21:06 +02:00
parent 179f09c4bd
commit 9b5fa4616b
24 changed files with 734 additions and 379 deletions
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -3,9 +3,11 @@ package alg.ngram;

 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

+import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
 import data.*;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
@@ -28,6 +30,9 @@ public class Ngrams {
 	}

 	public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
+		// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
+		ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
+
 		for (Sentence s : corpus) {
 			// skip sentences shorter than specified ngram length
 			if (s.getWords().size() < stats.getFilter().getNgramValue()) {
@@ -46,29 +51,62 @@ public class Ngrams {
 				String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());

 				// if last letter is ',' erase it
-				key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
-//				String key = "aaaaaaaaaaaaaaaaaaaaaaa";

-				String lemma = "";
-				String wordType = "";
-				String msd = "";
-				for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
-					if(otherKey.toString().equals("lema")){
-//						lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-						lemma = wordToString(ngramCandidate, otherKey);
-					} else if(otherKey.toString().equals("besedna vrsta")){
-						wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
-					} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
-						msd = wordToString(ngramCandidate, otherKey);
-					}
+//				if (key.equals("")){
+//					String test = key;
+//				}
+
+//				key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
+
+				MultipleHMKeys multipleKeys;
+
+				// create the MultipleHMKeys variant that matches the number of other keys
+				switch (otherKeys.size()) {
+					case 0:
+						multipleKeys = new MultipleHMKeys1(key);
+						break;
+					case 1:
+						multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
+						break;
+					case 2:
+						multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
+								wordToString(ngramCandidate, otherKeys.get(1)));
+						break;
+					case 3:
+						multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
+								wordToString(ngramCandidate, otherKeys.get(1)),
+								wordToString(ngramCandidate, otherKeys.get(2)));
+						break;
+					case 4:
+						multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
+								wordToString(ngramCandidate, otherKeys.get(1)),
+								wordToString(ngramCandidate, otherKeys.get(2)),
+								wordToString(ngramCandidate, otherKeys.get(3)));
+						break;
+					default:
+						multipleKeys = null;
 				}


+//				String lemma = "";
+//				String wordType = "";
+//				String msd = "";
+//				for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
+//					if(otherKey.toString().equals("lema")){
+//						lemma = wordToString(ngramCandidate, otherKey);
+//					} else if(otherKey.toString().equals("besedna vrsta")){
+//						wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
+//					} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
+//						msd = wordToString(ngramCandidate, otherKey);
+//					}
+//				}
+//
+//				MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
+

-				MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);

 				// UPDATE TAXONOMY HERE!!!
-                stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
+                stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
 //				stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
 			}
 		}
@@ -102,26 +140,31 @@ public class Ngrams {
 						.stream()
 						.map(Word::getLemma)
 						.collect(Collectors.toList()));
-				break;
+				return StringUtils.join(candidate, " ");
 			case WORD:
 				candidate.addAll(ngramCandidate
 						.stream()
 						.map(Word::getWord)
 						.collect(Collectors.toList()));
-				break;
+				return StringUtils.join(candidate, " ");
 			case MORPHOSYNTACTIC_SPECS:
 			case MORPHOSYNTACTIC_PROPERTY:
 				candidate.addAll(ngramCandidate
 						.stream()
 						.map(Word::getMsd)
 						.collect(Collectors.toList()));
-				break;
+				return StringUtils.join(candidate, " ");
 			case WORD_TYPE:
 				candidate.addAll(ngramCandidate
 						.stream()
 						.map(w -> Character.toString(w.getMsd().charAt(0)))
 						.collect(Collectors.toList()));
-				break;
+//				candidate.addAll(ngramCandidate
+//						.stream()
+//						.map(w -> Character.toString(w.getMsd().charAt(0)))
+//						.collect(Collectors.toList()));
+//				.substring(0, 1)
+				return StringUtils.join(candidate, " ");
 		}

 		return StringUtils.join(candidate, " ");
@@ -136,7 +179,7 @@ public class Ngrams {
 	private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
 		for (Sentence s : corpus) {
 			for (Word w : s.getWords()) {
-				List<String> taxonomy = w.getTaxonomy();
+				List<String> taxonomy = s.getTaxonomy();
 				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());

 				// skip this iteration if:
@@ -152,7 +195,7 @@ public class Ngrams {
 				for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
 					// TODO: locila?

-					MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));
+					MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
 					stats.updateTaxonomyResults(multipleKeys, taxonomy);
 //					stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));

@@ -183,8 +226,7 @@ public class Ngrams {
 				String punctuation = ",";
 				return new Word(sentence.get(i).getWord() + punctuation,
 						sentence.get(i).getLemma() + punctuation,
-						sentence.get(i).getMsd() + punctuation,
-						sentence.get(i).getTaxonomy());
+						sentence.get(i).getMsd() + punctuation);
 			}
 		}
 		return sentence.get(i);
@@ -204,6 +246,10 @@ public class Ngrams {
 		for (Sentence s : corpus) {
 			List<Word> sentence = s.getWords();

+			if (sentence == null){
+				continue;
+			}
+
 			for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
 				for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
 					if (ngram == 2 && j < sentence.size()) {
@@ -260,7 +306,7 @@ public class Ngrams {
 		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
 		    String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
            key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
-			stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""),
+			stats.updateTaxonomyResults(new MultipleHMKeys1(key),
 										stats.getCorpus().getTaxonomy());
 		}
 	}