You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
505 lines
22 KiB
505 lines
22 KiB
package alg.ngram;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Set;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
|
|
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
|
|
import data.*;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.logging.log4j.LogManager;
|
|
import org.apache.logging.log4j.Logger;
|
|
|
|
import gui.ValidationUtil;
|
|
|
|
import static alg.XML_processing.createWord;
|
|
|
|
public class Ngrams {
|
|
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
|
|
|
|
|
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
|
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
|
|
generateNgramLetterCandidates(corpus, stats);
|
|
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
|
|
generateSkipgramCandidates(corpus, stats);
|
|
} else {
|
|
generateNgramCandidates(corpus, stats);
|
|
}
|
|
}
|
|
|
|
/**
 * Generates contiguous word n-gram candidates for every sentence in the corpus,
 * applies the configured filters (msd regex, prefix/suffix constraints) and counts
 * the surviving candidates in the per-taxonomy results of {@code stats}.
 *
 * @param corpus sentences to process
 * @param stats  filter settings and result sink
 */
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
    // Additional key types requested besides the primary CalculateFor; their count
    // decides which MultipleHMKeys implementation is instantiated below.
    ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

    for (Sentence s : corpus) {
        // skip sentences shorter than the specified ngram length
        if (s.getWords().size() < stats.getFilter().getNgramValue()) {
            continue;
        }

        // slide a window of ngramValue words across the sentence
        for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
            List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());

            // if msd regex is set and this candidate doesn't pass it, skip this iteration
            if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
                continue;
            }

            // primary key: the candidate rendered as a string for the selected CalculateFor
            String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

            // the key must be long enough to carry both requested affix lengths
            if(stats.getFilter().getPrefixLength() != null && stats.getFilter().getSuffixLength() != null &&
                key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){
                continue;
            }

            // explicit prefix/suffix list filter (active only when at least one list is non-empty)
            if(stats.getFilter().getPrefixList() != null && stats.getFilter().getSuffixList() != null &&
                (stats.getFilter().getPrefixList().size() > 0 || stats.getFilter().getSuffixList().size() > 0)){

                String correctPrefix = "";
                // find the first prefix in PrefixList that the key starts with
                for(String pf : stats.getFilter().getPrefixList()){
                    if (pf.length() <= key.length() && pf.equals(key.substring(0, pf.length()))){
                        correctPrefix = pf;
                        break;
                    }
                }

                String correctSuffix = "";
                // find the first suffix in SuffixList that the key ends with
                for(String sf : stats.getFilter().getSuffixList()){
                    if (sf.length() <= key.length() && sf.equals(key.substring(key.length() - sf.length()))){
                        correctSuffix = sf;
                        break;
                    }
                }

                // Keep the candidate when: only a suffix is required and one was found,
                // only a prefix is required and one was found, or both were found and
                // together fit inside the key; otherwise drop it.
                // NOTE(review): the first two terms test list emptiness rather than the
                // found prefix/suffix being empty (earlier, now-removed variants used
                // correctPrefix/correctSuffix emptiness) — confirm this asymmetry is intended.
                if(!((stats.getFilter().getPrefixList().size() == 0 && !correctSuffix.equals("")) ||
                    (!correctPrefix.equals("") && stats.getFilter().getSuffixList().size() == 0) ||
                    (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){
                    continue;
                }

            }

            MultipleHMKeys multipleKeys;

            // create MultipleHMKeys for different amount of other keys
            switch (otherKeys.size()) {
                case 0:
                    multipleKeys = new MultipleHMKeys1(key);
                    break;
                case 1:
                    String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                    multipleKeys = new MultipleHMKeys2(key, k1_2);
                    break;
                case 2:
                    String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                    String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                    multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
                    break;
                case 3:
                    String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                    String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                    String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                    multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
                    break;
                case 4:
                    String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                    String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                    String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                    String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
                    multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5);
                    break;
                default:
                    // NOTE(review): more than 4 additional keys leaves multipleKeys null and
                    // it is passed to updateTaxonomyResults below — likely NPE; confirm the
                    // UI/caller caps the number of additional keys at 4.
                    multipleKeys = null;
            }

            stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
        }
    }
}
|
|
|
|
/**
|
|
* Checks whether an ngram candidate passes specified regex filter.
|
|
*/
|
|
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
|
|
// if (ngramCandidate.size() != regex.size()) {
|
|
// logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
|
// return false;
|
|
// }
|
|
|
|
int j = 0;
|
|
for (int i = 0; i < ngramCandidate.size(); i++) {
|
|
String msd = ngramCandidate.get(i).getMsd(wordParts);
|
|
if (msd.equals("*")){
|
|
continue;
|
|
}
|
|
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
|
if (!msd.matches(regex.get(j).pattern() + ".*")) {
|
|
return false;
|
|
}
|
|
j ++;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
|
|
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
|
|
|
|
switch (calculateFor) {
|
|
case LEMMA:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> w.getLemma(wordParts))
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case WORD:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> w.getWord(wordParts))
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case MORPHOSYNTACTIC_SPECS:
|
|
case MORPHOSYNTACTIC_PROPERTY:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> w.getMsd(wordParts))
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case WORD_TYPE:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
|
|
.collect(Collectors.toList()));
|
|
// candidate.addAll(ngramCandidate
|
|
// .stream()
|
|
// .map(w -> Character.toString(w.getMsd().charAt(0)))
|
|
// .collect(Collectors.toList()));
|
|
// .substring(0, 1)
|
|
return StringUtils.join(candidate, " ");
|
|
case NORMALIZED_WORD:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> w.getNormalizedWord(wordParts))
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
}
|
|
|
|
return StringUtils.join(candidate, " ");
|
|
}
|
|
|
|
/**
|
|
* Generates candidates and updates results
|
|
*
|
|
* @param corpus
|
|
* @param stats
|
|
*/
|
|
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
|
for (Sentence s : corpus) {
|
|
// stats.updateUniGramOccurrences(s.getWords().size());
|
|
for (Word w : s.getWords()) {
|
|
List<String> taxonomy = s.getTaxonomy();
|
|
|
|
//// List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
|
List<Word> ngramCandidate = new ArrayList<>();
|
|
ngramCandidate.add(w);
|
|
//
|
|
// // if msd regex is set and this candidate doesn't pass it, skip this iteration
|
|
// if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
|
|
// continue;
|
|
// }
|
|
|
|
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
|
|
|
|
// skip this iteration if:
|
|
// - word doesn't contain a proper version (missing lemma for example)
|
|
// - msd regex is given but this word's msd doesn't match it, skip this iteration
|
|
// - given substring length is larger than the word length
|
|
|
|
// boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
|
|
// boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
|
|
// String t3 = stats.getFilter().getMsd().get(0).pattern();
|
|
// ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
|
|
// boolean t5 = word.length() < stats.getFilter().getStringLength();
|
|
|
|
if (ValidationUtil.isEmpty(word)
|
|
|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
|
|
|| word.length() < stats.getFilter().getStringLength()) {
|
|
continue;
|
|
}
|
|
|
|
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
|
// TODO: locila?
|
|
|
|
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
|
|
stats.updateTaxonomyResults(multipleKeys, taxonomy);
|
|
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
|
|
|
|
|
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Checks skipped words and, if necessary, appends punctuation to the word at
 * index {@code i}.
 * <p>
 * The punctuation-merging logic below is currently disabled (commented out), so
 * this method is a plain passthrough returning {@code sentence.get(i)} unchanged.
 * The {@code j} and {@code stats} parameters are unused but kept for the disabled
 * logic and for call-site compatibility.
 *
 * @param sentence words of the current sentence
 * @param i        index of the word to (potentially) modify and return
 * @param j        index of the next selected skipgram member (unused while disabled)
 * @param stats    filter settings (unused while disabled)
 * @return the word at index {@code i}
 */
private static Word checkAndModifySkipgramPunctuation(List<Word> sentence, int i, int j, StatisticsNew stats){
    // if punctuation checkbox selected and the words at indexes i and j are not next to each other
    // if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){
    //     boolean middleWordsHavePunctuation = false;
    //     for (int n = i + 1; n < j; n++){
    //         if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){
    //             middleWordsHavePunctuation = true;
    //             break;
    //         }
    //     }
    //     if (middleWordsHavePunctuation){
    //         String punctuation = ",";
    //         return new Word(sentence.get(i).getWord() + punctuation,
    //                 sentence.get(i).getLemma() + punctuation,
    //                 sentence.get(i).getMsd() + punctuation);
    //     }
    // }
    return sentence.get(i);
}
|
|
|
|
/**
 * Extracts skipgram candidates — n-grams whose consecutive members may be up to
 * {@code skip} positions apart in the sentence — and counts the valid ones via
 * {@link #validateAndCountSkipgramCandidate}.
 * <p>
 * Supported n-gram sizes are 2–5, one nesting level per member; every skipped
 * position between two chosen members is filled with a placeholder "*" word so
 * the gaps stay visible in the rendered key.
 * NOTE(review): n-gram sizes outside 2–5 fall through all the inner branches and
 * produce no candidates — confirm the filter restricts ngramValue accordingly.
 *
 * @param corpus sentences to process
 * @param stats  filter settings (ngram size, skip distance) and result sink
 */
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
    ArrayList<Word> currentLoop;
    int ngram = stats.getFilter().getNgramValue();
    int skip = stats.getFilter().getSkipValue();
    // placeholder word inserted for every skipped position
    Word w = createWord("*", "*", "*", "*", stats.getFilter());

    for (Sentence s : corpus) {
        List<Word> sentence = s.getWords();

        if (sentence == null){
            continue;
        }

        // index i..m pick the 1st..5th member; each may run up to `skip`
        // positions past the previous one, bounded by the sentence length
        for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
            for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
                if (ngram == 2 && j < sentence.size()) {
                    currentLoop = new ArrayList<>();
                    currentLoop.add(sentence.get(i));
                    fillSkipgrams(currentLoop, i, j, w);
                    currentLoop.add(sentence.get(j));

                    validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
                } else {
                    for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
                        if (ngram == 3 && k < sentence.size()) {
                            currentLoop = new ArrayList<>();
                            currentLoop.add(sentence.get(i));
                            fillSkipgrams(currentLoop, i, j, w);
                            currentLoop.add(sentence.get(j));
                            fillSkipgrams(currentLoop, j, k, w);
                            currentLoop.add(sentence.get(k));

                            validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
                        } else {
                            for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
                                if (ngram == 4 && l < sentence.size()) {
                                    currentLoop = new ArrayList<>();
                                    currentLoop.add(sentence.get(i));
                                    fillSkipgrams(currentLoop, i, j, w);
                                    currentLoop.add(sentence.get(j));
                                    fillSkipgrams(currentLoop, j, k, w);
                                    currentLoop.add(sentence.get(k));
                                    fillSkipgrams(currentLoop, k, l, w);
                                    currentLoop.add(sentence.get(l));

                                    validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
                                } else {
                                    for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
                                        if (ngram == 5 && m < sentence.size()) {
                                            currentLoop = new ArrayList<>();
                                            currentLoop.add(sentence.get(i));
                                            fillSkipgrams(currentLoop, i, j, w);
                                            currentLoop.add(sentence.get(j));
                                            fillSkipgrams(currentLoop, j, k, w);
                                            currentLoop.add(sentence.get(k));
                                            fillSkipgrams(currentLoop, k, l, w);
                                            currentLoop.add(sentence.get(l));
                                            fillSkipgrams(currentLoop, l, m, w);
                                            currentLoop.add(sentence.get(m));

                                            validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
|
|
|
|
private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
|
|
for(int k = i + 1; k < j; k++){
|
|
currentLoop.add(w);
|
|
}
|
|
}
|
|
|
|
/**
 * Validates a skipgram candidate against the optional msd regex filter and, when
 * it passes, builds the appropriate {@code MultipleHMKeys} holder and records the
 * candidate in the per-taxonomy results.
 * <p>
 * NOTE(review): the key-building switch below duplicates the one in
 * {@code generateNgramCandidates}; consider extracting a shared helper.
 *
 * @param skipgramCandidate candidate words, including "*" placeholders for gaps
 * @param stats             filter settings and result sink
 * @param taxonomy          taxonomy labels of the source sentence
 */
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
    // count if no regex is set or if it is & candidate passes it
    if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {

        // additional key types requested besides the primary CalculateFor
        ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

        // primary key: candidate rendered as a string (placeholders included)
        String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

        MultipleHMKeys multipleKeys;

        // create MultipleHMKeys for different amount of other keys
        switch (otherKeys.size()) {
            case 0:
                multipleKeys = new MultipleHMKeys1(key);
                break;
            case 1:
                String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                multipleKeys = new MultipleHMKeys2(key, k1_2);
                break;
            case 2:
                String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
                break;
            case 3:
                String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
                break;
            case 4:
                String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
                multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5);
                break;
            default:
                // NOTE(review): more than 4 additional keys leaves multipleKeys null and
                // it is passed to updateTaxonomyResults below — likely NPE; confirm the
                // UI/caller caps the number of additional keys at 4.
                multipleKeys = null;

        }
        stats.updateTaxonomyResults(multipleKeys, taxonomy);

    }
}
|
|
}
|