Project copied

2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions
--- a/src/main/java/alg/ngram/ForkJoin.java
+++ b/src/main/java/alg/ngram/ForkJoin.java
@@ -0,0 +1,62 @@
+package alg.ngram;
+
+import java.util.List;
+import java.util.concurrent.RecursiveAction;
+
+import data.Sentence;
+import data.StatisticsNew;
+
+/**
+ * Fork/join task that splits a corpus into index ranges and runs
+ * {@link Ngrams#calculateForAll} on every range smaller than {@link #ACCEPTABLE_SIZE}.
+ * All subtasks share the same corpus list and the same {@link StatisticsNew} instance
+ * (NOTE(review): presumably StatisticsNew tolerates updates from several threads - confirm).
+ */
+public class ForkJoin extends RecursiveAction {
+	private static final long serialVersionUID = 5074814035083362355L;
+
+	/** Ranges with fewer sentences than this are processed directly instead of being split. */
+	private static final int ACCEPTABLE_SIZE = 1000;
+
+	private final List<Sentence> corpus;
+	private final StatisticsNew stats;
+	/** First corpus index (inclusive) handled by this task. */
+	private final int start;
+	/** Last corpus index (exclusive) handled by this task. */
+	private final int end;
+
+	/** Constructor for subproblems covering the index range {@code [start, end)} of the corpus. */
+	private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
+		this.corpus = corpus;
+		this.start = start;
+		this.end = end;
+		this.stats = stats;
+	}
+
+	/** Default constructor for the initial problem: the task covers the whole corpus. */
+	public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
+		this(corpus, 0, corpus.size(), stats);
+	}
+
+	/** Runs the n-gram calculation on this task's slice of the corpus. */
+	private void computeDirectly() {
+		Ngrams.calculateForAll(corpus.subList(start, end), stats);
+	}
+
+	/**
+	 * Processes the range directly when it is smaller than {@link #ACCEPTABLE_SIZE},
+	 * otherwise halves it and runs both halves as subtasks.
+	 */
+	@Override
+	protected void compute() {
+		int subCorpusSize = end - start;
+		if (subCorpusSize < ACCEPTABLE_SIZE) {
+			computeDirectly();
+			return;
+		}
+
+		int mid = start + subCorpusSize / 2;
+		// invokeAll replaces fork/fork/join/join, which joined the subtasks in fork order
+		invokeAll(new ForkJoin(corpus, start, mid, stats), new ForkJoin(corpus, mid, end, stats));
+	}
+}
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -0,0 +1,204 @@
+package alg.ngram;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import data.CalculateFor;
+import data.Sentence;
+import data.StatisticsNew;
+import data.Word;
+import gui.ValidationUtil;
+
+/**
+ * Static helpers that extract word n-gram, letter n-gram and skipgram candidates from
+ * sentences and report every accepted candidate through {@code stats.updateResults(...)}.
+ * <p>
+ * The class keeps no state of its own apart from the logger; what is extracted is
+ * controlled by the filter obtained from {@code stats.getFilter()}.
+ */
+public class Ngrams {
+	public final static Logger logger = LogManager.getLogger(Ngrams.class);
+
+	/**
+	 * Picks the extraction mode from the filter: letter n-grams when the n-gram value is 0,
+	 * skipgrams when a positive skip value is set, plain word n-grams otherwise.
+	 */
+	public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
+		if (stats.getFilter().getNgramValue() == 0) { // letter ngram
+			generateNgramLetterCandidates(corpus, stats);
+		} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
+			generateSkipgramCandidates(corpus, stats);
+		} else {
+			generateNgramCandidates(corpus, stats);
+		}
+	}
+
+	/**
+	 * Counts every window of consecutive words whose length is the filter's n-gram value,
+	 * leaving out windows rejected by the msd regex filter when one is set.
+	 */
+	public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
+		for (Sentence s : corpus) {
+			// skip sentences shorter than specified ngram length
+			if (s.getWords().size() < stats.getFilter().getNgramValue()) {
+				continue;
+			}
+
+			for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
+				List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
+
+				// if msd regex is set and this candidate doesn't pass it, skip this iteration
+				if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
+					continue;
+				}
+
+				stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
+			}
+		}
+	}
+
+	/**
+	 * Checks whether an ngram candidate passes specified regex filter: the msd of the i-th
+	 * word has to match the i-th pattern in full. A size mismatch is logged and fails.
+	 */
+	private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
+		if (ngramCandidate.size() != regex.size()) {
+			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
+			return false;
+		}
+
+		for (int i = 0; i < regex.size(); i++) {
+			// match with the compiled Pattern; String.matches(pattern()) would recompile it per word
+			if (!regex.get(i).matcher(ngramCandidate.get(i).getMsd()).matches()) {
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	/**
+	 * Joins the representation selected by {@code calculateFor} (lemma, word, msd, or the
+	 * first msd character for WORD_TYPE) of every word in the candidate with single spaces.
+	 */
+	private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
+		ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
+
+		switch (calculateFor) {
+			case LEMMA:
+				candidate.addAll(ngramCandidate
+						.stream()
+						.map(Word::getLemma)
+						.collect(Collectors.toList()));
+				break;
+			case WORD:
+				candidate.addAll(ngramCandidate
+						.stream()
+						.map(Word::getWord)
+						.collect(Collectors.toList()));
+				break;
+			case MORPHOSYNTACTIC_SPECS:
+			case MORPHOSYNTACTIC_PROPERTY:
+				candidate.addAll(ngramCandidate
+						.stream()
+						.map(Word::getMsd)
+						.collect(Collectors.toList()));
+				break;
+			case WORD_TYPE:
+				candidate.addAll(ngramCandidate
+						.stream()
+						.map(w -> Character.toString(w.getMsd().charAt(0)))
+						.collect(Collectors.toList()));
+				break;
+		}
+
+		return StringUtils.join(candidate, " ");
+	}
+
+	/**
+	 * Generates letter n-gram candidates and updates results: every substring of the
+	 * filter's string length is taken from each word's {@code getForCf(...)} form.
+	 *
+	 * @param corpus sentences to process
+	 * @param stats  supplies the filter and receives the results
+	 */
+	private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
+		for (Sentence s : corpus) {
+			for (Word w : s.getWords()) {
+				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
+
+				// skip this iteration if:
+				// - word doesn't contain a proper version (missing lemma for example)
+				// - msd regex is given but this word's msd doesn't match its first pattern
+				// - given substring length is larger than the word length
+				if (ValidationUtil.isEmpty(word)
+						|| stats.getFilter().hasMsd() && !stats.getFilter().getMsd().get(0).matcher(w.getMsd()).matches()
+						|| word.length() < stats.getFilter().getStringLength()) {
+					continue;
+				}
+
+				for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
+					// TODO: punctuation?
+					stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
+				}
+			}
+		}
+	}
+
+	/**
+	 * Extracts skipgram candidates and counts those that pass the msd filter.
+	 * <p>
+	 * A candidate starts at word i; each further word lies 1 to skip + 1 positions after
+	 * the previous one. Start positions are limited to those where a contiguous ngram
+	 * still fits into the sentence. Nothing is produced for an n-gram value below 2.
+	 */
+	public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
+		int ngram = stats.getFilter().getNgramValue();
+		int skip = stats.getFilter().getSkipValue();
+
+		if (ngram < 2) {
+			return; // a skipgram needs at least two words
+		}
+
+		for (Sentence s : corpus) {
+			List<Word> sentence = s.getWords();
+
+			for (int i = 0; i <= sentence.size() - ngram; i++) {
+				ArrayList<Word> candidate = new ArrayList<>(ngram);
+				candidate.add(sentence.get(i));
+				extendSkipgram(sentence, candidate, i, ngram, skip, stats);
+			}
+		}
+	}
+
+	/**
+	 * Appends every allowed next word (at most skip words skipped after index last) to the
+	 * partial candidate; once it holds ngram words it is validated and counted.
+	 */
+	private static void extendSkipgram(List<Word> sentence, ArrayList<Word> partial, int last, int ngram, int skip, StatisticsNew stats) {
+		if (partial.size() == ngram) {
+			validateAndCountSkipgramCandidate(partial, stats);
+			return;
+		}
+
+		for (int next = last + 1; next <= last + 1 + skip && next < sentence.size(); next++) {
+			partial.add(sentence.get(next));
+			extendSkipgram(sentence, partial, next, ngram, skip, stats);
+			partial.remove(partial.size() - 1); // backtrack before trying the next position
+		}
+	}
+
+	/** Counts the candidate if no msd regex is set, or if one is set and the candidate passes it. */
+	private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
+		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
+			stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
+		}
+	}
+}