package alg.ngram; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; import com.sun.xml.internal.bind.v2.runtime.reflect.Lister; import data.*; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import gui.ValidationUtil; import static alg.XML_processing.createWord; public class Ngrams { public final static Logger logger = LogManager.getLogger(Ngrams.class); public static void calculateForAll(List corpus, StatisticsNew stats) { if (stats.getFilter().getNgramValue() == 0) { // letter ngram generateNgramLetterCandidates(corpus, stats); } else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) { generateSkipgramCandidates(corpus, stats); } else { generateNgramCandidates(corpus, stats); } } public static void generateNgramCandidates(List corpus, StatisticsNew stats) { // preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys ArrayList otherKeys = stats.getFilter().getMultipleKeys(); for (Sentence s : corpus) { // stats.updateUniGramOccurrences(s.getWords().size()); // skip sentences shorter than specified ngram length if (s.getWords().size() < stats.getFilter().getNgramValue()) { continue; } for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) { List ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue()); // if msd regex is set and this candidate doesn't pass it, skip this iteration if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { continue; } // generate proper MultipleHMKeys depending on filter data String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts()); if(stats.getFilter().getPrefixLength() != null && stats.getFilter().getSuffixLength() != null && key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){ continue; } if(stats.getFilter().getPrefixList() != null && stats.getFilter().getSuffixList() != null && (stats.getFilter().getPrefixList().size() > 0 || stats.getFilter().getSuffixList().size() > 0)){ String correctPrefix = ""; // go over all prefixes in PrefixList and look for them in words for(String pf : stats.getFilter().getPrefixList()){ if (pf.length() <= key.length() && pf.equals(key.substring(0, pf.length()))){ correctPrefix = pf; break; } } String correctSuffix = ""; // go over all prefixes in SuffixList and look for them in words for(String sf : stats.getFilter().getSuffixList()){ if (sf.length() <= key.length() && sf.equals(key.substring(key.length() - sf.length()))){ correctSuffix = sf; break; } } // boolean a = (correctPrefix.equals("") && !correctSuffix.equals("")); // boolean b = (!correctPrefix.equals("") && correctSuffix.equals("")); // boolean c = (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()); // boolean d = !((correctPrefix.equals("") && !correctSuffix.equals("")) || // (!correctPrefix.equals("") && correctSuffix.equals("")) || // (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length())); if(!((stats.getFilter().getPrefixList().size() == 0 && !correctSuffix.equals("")) || (!correctPrefix.equals("") && stats.getFilter().getSuffixList().size() == 0) || (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){ continue; } // if(!((correctPrefix.equals("") && !correctSuffix.equals("")) || // (!correctPrefix.equals("") && correctSuffix.equals("")) || // (!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){ // continue; // } } // if last letter is ',' erase it // if (key.equals("")){ // String test = key; // } // if (stats.getFilter().getNotePunctuations()) // key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; MultipleHMKeys multipleKeys; // create MultipleHMKeys for different amount of other keys switch (otherKeys.size()) { case 0: multipleKeys = new MultipleHMKeys1(key); break; case 1: String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) // k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2; multipleKeys = new MultipleHMKeys2(key, k1_2); break; case 2: String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2; // k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3; // } multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3); break; case 3: String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2; // k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3; // k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4; // } multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4); break; case 4: String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2; // k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3; // k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4; // k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5; // } multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5); break; default: multipleKeys = null; } // String lemma = ""; // String wordType = ""; // String msd = ""; // for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){ // if(otherKey.toString().equals("lema")){ // lemma = wordToString(ngramCandidate, otherKey); // } else if(otherKey.toString().equals("besedna vrsta")){ // wordType = wordToString(ngramCandidate, otherKey).substring(0, 1); // } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){ // msd = wordToString(ngramCandidate, otherKey); // } // } // // MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd); // UPDATE TAXONOMY HERE!!! stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy()); // stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); } } } /** * Checks whether an ngram candidate passes specified regex filter. */ private static boolean passesRegex(List ngramCandidate, ArrayList regex, ArrayList wordParts) { // if (ngramCandidate.size() != regex.size()) { // logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway // return false; // } int j = 0; for (int i = 0; i < ngramCandidate.size(); i++) { String msd = ngramCandidate.get(i).getMsd(wordParts); if (msd.equals("*")){ continue; } //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) { if (!msd.matches(regex.get(j).pattern() + ".*")) { return false; } j ++; } return true; } private static String wordToString(List ngramCandidate, CalculateFor calculateFor, ArrayList wordParts) { ArrayList candidate = new ArrayList<>(ngramCandidate.size()); switch (calculateFor) { case LEMMA: candidate.addAll(ngramCandidate .stream() .map(w -> w.getLemma(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case WORD: candidate.addAll(ngramCandidate .stream() .map(w -> w.getWord(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case MORPHOSYNTACTIC_SPECS: case MORPHOSYNTACTIC_PROPERTY: candidate.addAll(ngramCandidate .stream() .map(w -> w.getMsd(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case WORD_TYPE: candidate.addAll(ngramCandidate .stream() .map(w -> Character.toString(w.getMsd(wordParts).charAt(0))) .collect(Collectors.toList())); // candidate.addAll(ngramCandidate // .stream() // .map(w -> Character.toString(w.getMsd().charAt(0))) // .collect(Collectors.toList())); // .substring(0, 1) return StringUtils.join(candidate, " "); case NORMALIZED_WORD: candidate.addAll(ngramCandidate .stream() .map(w -> w.getNormalizedWord(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); } return StringUtils.join(candidate, " "); } /** * Generates candidates and updates results * * @param corpus * @param stats */ private static void generateNgramLetterCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { // stats.updateUniGramOccurrences(s.getWords().size()); for (Word w : s.getWords()) { List taxonomy = s.getTaxonomy(); //// List ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue()); List ngramCandidate = new ArrayList<>(); ngramCandidate.add(w); // // // if msd regex is set and this candidate doesn't pass it, skip this iteration // if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { // continue; // } String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts()); // skip this iteration if: // - word doesn't contain a proper version (missing lemma for example) // - msd regex is given but this word's msd doesn't match it, skip this iteration // - given substring length is larger than the word length // boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()); // boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()); // String t3 = stats.getFilter().getMsd().get(0).pattern(); // ArrayList t4 = stats.getFilter().getWordParts(); // boolean t5 = word.length() < stats.getFilter().getStringLength(); if (ValidationUtil.isEmpty(word) || stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts()) || word.length() < stats.getFilter().getStringLength()) { continue; } for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) { // TODO: locila? MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength())); stats.updateTaxonomyResults(multipleKeys, taxonomy); // stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength())); } } } } /** * Checks skipped words and if necessary adds punctuations. * * @return List of candidates represented as a list */ private static Word checkAndModifySkipgramPunctuation(List sentence, int i, int j, StatisticsNew stats){ // if punctuation checkbox selected and there words at indexes i and j are not next to each other // if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){ // boolean middleWordsHavePunctuation = false; // for (int n = i + 1; n < j; n++){ // if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){ // middleWordsHavePunctuation = true; // break; // } // } // if (middleWordsHavePunctuation){ // // String punctuation = ","; // return new Word(sentence.get(i).getWord() + punctuation, // sentence.get(i).getLemma() + punctuation, // sentence.get(i).getMsd() + punctuation); // } // } return sentence.get(i); } /** * Extracts skipgram candidates. * * @return List of candidates represented as a list */ public static void generateSkipgramCandidates(List corpus, StatisticsNew stats) { ArrayList currentLoop; int ngram = stats.getFilter().getNgramValue(); int skip = stats.getFilter().getSkipValue(); Word w = createWord("*", "*", "*", "*", stats.getFilter()); for (Sentence s : corpus) { List sentence = s.getWords(); // stats.updateUniGramOccurrences(s.getWords().size()); if (sentence == null){ continue; } for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram if (ngram == 2 && j < sentence.size()) { currentLoop = new ArrayList<>(); // currentLoop.add(sentence.get(i)); currentLoop.add(sentence.get(i)); fillSkipgrams(currentLoop, i, j, w); currentLoop.add(sentence.get(j)); validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy()); } else { for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram if (ngram == 3 && k < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); fillSkipgrams(currentLoop, i, j, w); currentLoop.add(sentence.get(j)); fillSkipgrams(currentLoop, j, k, w); currentLoop.add(sentence.get(k)); validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy()); } else { for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram if (ngram == 4 && l < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); fillSkipgrams(currentLoop, i, j, w); currentLoop.add(sentence.get(j)); fillSkipgrams(currentLoop, j, k, w); currentLoop.add(sentence.get(k)); fillSkipgrams(currentLoop, k, l, w); currentLoop.add(sentence.get(l)); validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy()); } else { for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram if (ngram == 5 && m < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); fillSkipgrams(currentLoop, i, j, w); currentLoop.add(sentence.get(j)); fillSkipgrams(currentLoop, j, k, w); currentLoop.add(sentence.get(k)); fillSkipgrams(currentLoop, k, l, w); currentLoop.add(sentence.get(l)); fillSkipgrams(currentLoop, l, m, w); currentLoop.add(sentence.get(m)); validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy()); } } } } } } } } } } } private static void fillSkipgrams(ArrayList currentLoop, int i, int j, Word w){ for(int k = i + 1; k < j; k++){ currentLoop.add(w); } } private static void validateAndCountSkipgramCandidate(ArrayList skipgramCandidate, StatisticsNew stats, List taxonomy) { // count if no regex is set or if it is & candidate passes it if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { // String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); // key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; // stats.updateTaxonomyResults(new MultipleHMKeys1(key), // stats.getCorpus().getTaxonomy()); ArrayList otherKeys = stats.getFilter().getMultipleKeys(); String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts()); // if last letter is ',' erase it // if (key.equals("")){ // String test = key; // } // if (stats.getFilter().getNotePunctuations()) // key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; MultipleHMKeys multipleKeys; // create MultipleHMKeys for different amount of other keys switch (otherKeys.size()) { case 0: multipleKeys = new MultipleHMKeys1(key); break; case 1: String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) // k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2; multipleKeys = new MultipleHMKeys2(key, k1_2); break; case 2: String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2; // k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3; // } multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3); break; case 3: String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2; // k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3; // k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4; // } multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4); break; case 4: String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2; // k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3; // k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4; // k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5; // } multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5); break; default: multipleKeys = null; } stats.updateTaxonomyResults(multipleKeys, taxonomy); } } }