list/src/main/java/alg/ngram/Ngrams.java

package alg.ngram;


import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import gui.ValidationUtil;

/**
 * Static helpers that walk a corpus of sentences, build word n-gram, skip-gram or letter
 * n-gram candidates and record every accepted candidate in a {@link StatisticsNew} instance.
 */
public class Ngrams {
	public static final Logger logger = LogManager.getLogger(Ngrams.class);

	/**
	 * Picks the candidate generator that matches the filter stored in {@code stats}:
	 * letter n-grams when the ngram value is 0, skip-grams when a positive skip value is set,
	 * and plain word n-grams otherwise.
	 *
	 * @param corpus sentences to process
	 * @param stats  holds the filter that is read and receives the results
	 */
	public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
		if (stats.getFilter().getNgramValue() == 0) { // letter ngram
			generateNgramLetterCandidates(corpus, stats);
		} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
			generateSkipgramCandidates(corpus, stats);
		} else {
			generateNgramCandidates(corpus, stats);
		}
	}

	/**
	 * Slides a window of the configured ngram length over every sentence and reports each
	 * window that passes the optional msd filter through
	 * {@code stats.updateTaxonomyResults}, using the taxonomy of the window's first word.
	 *
	 * @param corpus sentences to process
	 * @param stats  holds the filter that is read and receives the results
	 */
	public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
		int ngram = stats.getFilter().getNgramValue();
		CalculateFor calculateFor = stats.getFilter().getCalculateFor();
		// compiled once per run instead of once per word of every candidate
		List<Pattern> msdPrefixes = compileMsdPrefixPatterns(stats);

		for (Sentence s : corpus) {
			// negative for sentences shorter than the ngram length, which are thereby skipped
			int lastStart = s.getWords().size() - ngram;

			for (int i = 0; i <= lastStart; i++) {
				List<Word> ngramCandidate = s.getSublist(i, i + ngram);

				// if msd regex is set and this candidate doesn't pass it, skip this iteration
				if (msdPrefixes != null && !passesRegex(ngramCandidate, msdPrefixes)) {
					continue;
				}

				// generate proper MultipleHMKeys depending on filter data
				String key = wordToString(ngramCandidate, calculateFor);
				String lemma = "";
				String wordType = "";
				String msd = "";
				// NOTE(review): the additional keys are recognised by their toString() text,
				// presumably the display labels of the CalculateFor constants - confirm and
				// consider comparing the enum constants directly.
				for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()) {
					String label = otherKey.toString();
					if (label.equals("lema")) {
						lemma = wordToString(ngramCandidate, otherKey);
					} else if (label.equals("besedna vrsta")) {
						// only the first character of the joined string is kept
						wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
					} else if (label.equals("oblikoskladenjska oznaka")) {
						msd = wordToString(ngramCandidate, otherKey);
					}
				}

				MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
				stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
			}
		}
	}

	/**
	 * Builds the patterns used for msd filtering of word n-grams and skip-grams.
	 * Each filter pattern is recompiled from its pattern text with {@code ".*"} appended,
	 * which is exactly the expression the per-word {@code String.matches} call used to
	 * compile on every invocation.
	 *
	 * @return the compiled patterns, or {@code null} when the filter has no msd restriction
	 */
	private static List<Pattern> compileMsdPrefixPatterns(StatisticsNew stats) {
		if (!stats.getFilter().hasMsd()) {
			return null;
		}

		List<Pattern> filterPatterns = stats.getFilter().getMsd();
		List<Pattern> prefixPatterns = new ArrayList<>(filterPatterns.size());
		for (Pattern p : filterPatterns) {
			prefixPatterns.add(Pattern.compile(p.pattern() + ".*"));
		}
		return prefixPatterns;
	}

	/**
	 * Checks whether an ngram candidate passes specified regex filter.
	 *
	 * @param ngramCandidate words of the candidate
	 * @param regex          one pattern per candidate word, as produced by
	 *                       {@link #compileMsdPrefixPatterns(StatisticsNew)}
	 * @return {@code true} when every word's msd is fully matched by the pattern at the same
	 *         position; {@code false} otherwise or when the sizes differ
	 */
	private static boolean passesRegex(List<Word> ngramCandidate, List<Pattern> regex) {
		if (ngramCandidate.size() != regex.size()) {
			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
			return false;
		}

		for (int i = 0; i < regex.size(); i++) {
			if (!regex.get(i).matcher(ngramCandidate.get(i).getMsd()).matches()) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Joins one attribute of every word in the candidate with single spaces.
	 *
	 * @param ngramCandidate words of the candidate
	 * @param calculateFor   selects the attribute: lemma, word, msd, or the first character
	 *                       of the msd for {@code WORD_TYPE}
	 * @return the joined string; empty for any other {@code calculateFor} value
	 */
	private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
		List<String> candidate;

		switch (calculateFor) {
			case LEMMA:
				candidate = ngramCandidate.stream()
						.map(Word::getLemma)
						.collect(Collectors.toList());
				break;
			case WORD:
				candidate = ngramCandidate.stream()
						.map(Word::getWord)
						.collect(Collectors.toList());
				break;
			case MORPHOSYNTACTIC_SPECS:
			case MORPHOSYNTACTIC_PROPERTY:
				candidate = ngramCandidate.stream()
						.map(Word::getMsd)
						.collect(Collectors.toList());
				break;
			case WORD_TYPE:
				candidate = ngramCandidate.stream()
						.map(w -> Character.toString(w.getMsd().charAt(0)))
						.collect(Collectors.toList());
				break;
			default:
				// unsupported values contribute nothing, so the joined result is empty
				candidate = new ArrayList<>();
				break;
		}

		return StringUtils.join(candidate, " ");
	}

	/**
	 * Generates letter n-gram candidates (all substrings of the configured string length)
	 * for every word and updates results.
	 *
	 * @param corpus sentences to process
	 * @param stats  holds the filter that is read and receives the results
	 */
	private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
		// Recompiled once from the pattern text so matching behaves exactly like the former
		// per-word String.matches(pattern()) call.
		// NOTE(review): this is a full match, while passesRegex matches "pattern.*" - confirm
		// the difference is intended.
		Pattern msdPattern = stats.getFilter().hasMsd()
				? Pattern.compile(stats.getFilter().getMsd().get(0).pattern())
				: null;

		for (Sentence s : corpus) {
			for (Word w : s.getWords()) {
				List<String> taxonomy = w.getTaxonomy();
				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());

				// word doesn't contain a proper version (missing lemma for example)
				if (ValidationUtil.isEmpty(word)) {
					continue;
				}
				// msd regex is given but this word's msd doesn't match it
				if (msdPattern != null && !msdPattern.matcher(w.getMsd()).matches()) {
					continue;
				}
				// given substring length is larger than the word length
				int stringLength = stats.getFilter().getStringLength();
				if (word.length() < stringLength) {
					continue;
				}

				for (int i = 0; i + stringLength <= word.length(); i++) {
					// TODO: punctuation?
					String letters = word.substring(i, i + stringLength);

					// NOTE(review): both result stores are updated here, unlike the word
					// n-gram path which only calls updateTaxonomyResults - confirm.
					stats.updateTaxonomyResults(new MultipleHMKeys(letters), taxonomy);
					stats.updateResults(letters);
				}
			}
		}
	}

	/**
	 * Generates skip-gram candidates and counts those that pass the optional msd filter.
	 * A candidate consists of {@code ngram} words in sentence order where at most
	 * {@code skip} words are left out between two consecutive picks. Candidates are produced
	 * in lexicographic order of their word indices. Nothing is produced for an ngram value
	 * below 2.
	 *
	 * @param corpus sentences to process
	 * @param stats  holds the filter that is read and receives the results
	 */
	public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
		int ngram = stats.getFilter().getNgramValue();
		int skip = stats.getFilter().getSkipValue();
		if (ngram < 2) {
			return;
		}

		List<Pattern> msdPrefixes = compileMsdPrefixPatterns(stats);

		for (Sentence s : corpus) {
			List<Word> sentence = s.getWords();
			List<Word> candidate = new ArrayList<>(ngram);

			for (int i = 0; i <= sentence.size() - ngram; i++) {
				candidate.add(sentence.get(i));
				extendSkipgram(sentence, candidate, i, ngram, skip, msdPrefixes, stats);
				candidate.remove(candidate.size() - 1);
			}
		}
	}

	/**
	 * Recursively appends the next word to a partial skip-gram until it holds {@code ngram}
	 * words, then hands it over for validation and counting.
	 *
	 * @param sentence    words of the current sentence
	 * @param candidate   words picked so far; restored to its incoming content on return
	 * @param lastIndex   sentence index of the most recently picked word
	 * @param ngram       number of words in a complete candidate
	 * @param skip        maximum number of words skipped between two consecutive picks
	 * @param msdPrefixes compiled msd filter, or {@code null} when no filter is set
	 * @param stats       receives the counted candidates
	 */
	private static void extendSkipgram(List<Word> sentence, List<Word> candidate, int lastIndex,
			int ngram, int skip, List<Pattern> msdPrefixes, StatisticsNew stats) {
		if (candidate.size() == ngram) {
			validateAndCountSkipgramCandidate(candidate, msdPrefixes, stats);
			return;
		}

		// The next pick may lie at most skip + 1 positions ahead and must leave enough
		// words behind it to complete the candidate; long arithmetic guards huge skip values.
		int roomBound = sentence.size() - ngram + candidate.size();
		int furthest = (int) Math.min((long) lastIndex + 1 + skip, roomBound);

		for (int next = lastIndex + 1; next <= furthest; next++) {
			candidate.add(sentence.get(next));
			extendSkipgram(sentence, candidate, next, ngram, skip, msdPrefixes, stats);
			candidate.remove(candidate.size() - 1);
		}
	}

	/**
	 * Counts the candidate if no msd regex is set, or if one is set and the candidate
	 * passes it.
	 */
	private static void validateAndCountSkipgramCandidate(List<Word> skipgramCandidate, List<Pattern> msdPrefixes,
			StatisticsNew stats) {
		if (msdPrefixes == null || passesRegex(skipgramCandidate, msdPrefixes)) {
			// NOTE(review): skip-grams are reported through updateResults only, not
			// updateTaxonomyResults like the plain n-gram path - confirm.
			stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
		}
	}
}
Project copied 6 years ago			`package alg.ngram;`


			`import java.util.ArrayList;`
			`import java.util.List;`
			`import java.util.regex.Pattern;`
			`import java.util.stream.Collectors;`

Added functional additional combinational filters for words 6 years ago			`import data.*;`
Project copied 6 years ago			`import org.apache.commons.lang3.StringUtils;`
			`import org.apache.logging.log4j.LogManager;`
			`import org.apache.logging.log4j.Logger;`

			`import gui.ValidationUtil;`

			`public class Ngrams {`
			`public final static Logger logger = LogManager.getLogger(Ngrams.class);`


			`public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {`
			`if (stats.getFilter().getNgramValue() == 0) { // letter ngram`
			`generateNgramLetterCandidates(corpus, stats);`
			`} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {`
			`generateSkipgramCandidates(corpus, stats);`
			`} else {`
			`generateNgramCandidates(corpus, stats);`
			`}`
			`}`

			`public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {`
			`for (Sentence s : corpus) {`
			`// skip sentences shorter than specified ngram length`
			`if (s.getWords().size() < stats.getFilter().getNgramValue()) {`
			`continue;`
			`}`

			`for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {`
			`List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());`

			`// if msd regex is set and this candidate doesn't pass it, skip this iteration`
			`if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {`
			`continue;`
			`}`

Added functional additional combinational filters for words 6 years ago			`// generate proper MultipleHMKeys depending on filter data`
			`String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());`
			`String lemma = "";`
			`String wordType = "";`
			`String msd = "";`
			`for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){`
			`if(otherKey.toString().equals("lema")){`
			`lemma = wordToString(ngramCandidate, otherKey);`
			`} else if(otherKey.toString().equals("besedna vrsta")){`
			`wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);`
			`} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){`
			`msd = wordToString(ngramCandidate, otherKey);`
			`}`
			`}`

			`MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);`

Added taxonomy presentation in results 6 years ago			`// UPDATE TAXONOMY HERE!!!`
Added functional additional combinational filters for words 6 years ago			`stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());`
			`// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));`
Project copied 6 years ago			`}`
			`}`
			`}`

			`/**`
			`* Checks whether an ngram candidate passes specified regex filter.`
			`*/`
			`private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {`
			`if (ngramCandidate.size() != regex.size()) {`
			`logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway`
			`return false;`
			`}`

			`for (int i = 0; i < regex.size(); i++) {`
Added taxonomy presentation in results 6 years ago			`//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {`
			`if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {`
Project copied 6 years ago			`return false;`
			`}`
			`}`

			`return true;`
			`}`

			`private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {`
			`ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());`

			`switch (calculateFor) {`
			`case LEMMA:`
			`candidate.addAll(ngramCandidate`
			`.stream()`
			`.map(Word::getLemma)`
			`.collect(Collectors.toList()));`
			`break;`
			`case WORD:`
			`candidate.addAll(ngramCandidate`
			`.stream()`
			`.map(Word::getWord)`
			`.collect(Collectors.toList()));`
			`break;`
			`case MORPHOSYNTACTIC_SPECS:`
			`case MORPHOSYNTACTIC_PROPERTY:`
			`candidate.addAll(ngramCandidate`
			`.stream()`
			`.map(Word::getMsd)`
			`.collect(Collectors.toList()));`
			`break;`
			`case WORD_TYPE:`
			`candidate.addAll(ngramCandidate`
			`.stream()`
			`.map(w -> Character.toString(w.getMsd().charAt(0)))`
			`.collect(Collectors.toList()));`
			`break;`
			`}`

			`return StringUtils.join(candidate, " ");`
			`}`

			`/**`
			`* Generates candidates and updates results`
			`*`
			`* @param corpus`
			`* @param stats`
			`*/`
			`private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {`
			`for (Sentence s : corpus) {`
			`for (Word w : s.getWords()) {`
Refactored results - moved to taxonomyResults 6 years ago			`List<String> taxonomy = w.getTaxonomy();`
Project copied 6 years ago			`String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());`

			`// skip this iteration if:`
			`// - word doesn't contain a proper version (missing lemma for example)`
			`// - msd regex is given but this word's msd doesn't match it, skip this iteration`
			`// - given substring length is larger than the word length`
			`if (ValidationUtil.isEmpty(word)`
			`|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())`
			`|| word.length() < stats.getFilter().getStringLength()) {`
			`continue;`
			`}`

			`for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {`
			`// TODO: locila?`
Added functional additional combinational filters for words 6 years ago
			`MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));`
			`stats.updateTaxonomyResults(multipleKeys, taxonomy);`
Refactored results - moved to taxonomyResults 6 years ago			`// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));`


Project copied 6 years ago			`stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));`
			`}`
			`}`
			`}`
			`}`


			`/**`
			`* Extracts skipgram candidates.`
			`*`
			`* @return List of candidates represented as a list<candidates(String)>`
			`*/`
			`public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {`
			`ArrayList<Word> currentLoop;`
			`int ngram = stats.getFilter().getNgramValue();`
			`int skip = stats.getFilter().getSkipValue();`

			`for (Sentence s : corpus) {`
			`List<Word> sentence = s.getWords();`

			`for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram`
			`for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram`
			`if (ngram == 2 && j < sentence.size()) {`
			`currentLoop = new ArrayList<>();`
			`currentLoop.add(sentence.get(i));`
			`currentLoop.add(sentence.get(j));`

			`validateAndCountSkipgramCandidate(currentLoop, stats);`
			`} else {`
			`for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram`
			`if (ngram == 3 && k < sentence.size()) {`
			`currentLoop = new ArrayList<>();`
			`currentLoop.add(sentence.get(i));`
			`currentLoop.add(sentence.get(j));`
			`currentLoop.add(sentence.get(k));`

			`validateAndCountSkipgramCandidate(currentLoop, stats);`
			`} else {`
			`for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram`
			`if (ngram == 4 && k < sentence.size()) {`
			`currentLoop = new ArrayList<>();`
			`currentLoop.add(sentence.get(i));`
			`currentLoop.add(sentence.get(j));`
			`currentLoop.add(sentence.get(k));`
			`currentLoop.add(sentence.get(l));`

			`validateAndCountSkipgramCandidate(currentLoop, stats);`
			`} else {`
			`for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram`
			`if (ngram == 5 && k < sentence.size()) {`
			`currentLoop = new ArrayList<>();`
			`currentLoop.add(sentence.get(i));`
			`currentLoop.add(sentence.get(j));`
			`currentLoop.add(sentence.get(k));`
			`currentLoop.add(sentence.get(l));`
			`currentLoop.add(sentence.get(m));`

			`validateAndCountSkipgramCandidate(currentLoop, stats);`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`

			`private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {`
			`// count if no regex is set or if it is & candidate passes it`
			`if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {`
			`stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));`
			`}`
			`}`
			`}`