package alg.ngram; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import data.CalculateFor; import data.Sentence; import data.StatisticsNew; import data.Word; import gui.ValidationUtil; public class Ngrams { public final static Logger logger = LogManager.getLogger(Ngrams.class); public static void calculateForAll(List corpus, StatisticsNew stats) { if (stats.getFilter().getNgramValue() == 0) { // letter ngram generateNgramLetterCandidates(corpus, stats); } else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) { generateSkipgramCandidates(corpus, stats); } else { generateNgramCandidates(corpus, stats); } } public static void generateNgramCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { // skip sentences shorter than specified ngram length if (s.getWords().size() < stats.getFilter().getNgramValue()) { continue; } for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) { List ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue()); // if msd regex is set and this candidate doesn't pass it, skip this iteration if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) { continue; } // UPDATE TAXONOMY HERE!!! stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate); stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); } } } /** * Checks whether an ngram candidate passes specified regex filter. */ private static boolean passesRegex(List ngramCandidate, ArrayList regex) { if (ngramCandidate.size() != regex.size()) { logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway return false; } for (int i = 0; i < regex.size(); i++) { //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) { if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) { return false; } } return true; } private static String wordToString(List ngramCandidate, CalculateFor calculateFor) { ArrayList candidate = new ArrayList<>(ngramCandidate.size()); switch (calculateFor) { case LEMMA: candidate.addAll(ngramCandidate .stream() .map(Word::getLemma) .collect(Collectors.toList())); break; case WORD: candidate.addAll(ngramCandidate .stream() .map(Word::getWord) .collect(Collectors.toList())); break; case MORPHOSYNTACTIC_SPECS: case MORPHOSYNTACTIC_PROPERTY: candidate.addAll(ngramCandidate .stream() .map(Word::getMsd) .collect(Collectors.toList())); break; case WORD_TYPE: candidate.addAll(ngramCandidate .stream() .map(w -> Character.toString(w.getMsd().charAt(0))) .collect(Collectors.toList())); break; } return StringUtils.join(candidate, " "); } /** * Generates candidates and updates results * * @param corpus * @param stats */ private static void generateNgramLetterCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { for (Word w : s.getWords()) { String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv()); // skip this iteration if: // - word doesn't contain a proper version (missing lemma for example) // - msd regex is given but this word's msd doesn't match it, skip this iteration // - given substring length is larger than the word length if (ValidationUtil.isEmpty(word) || stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern()) || word.length() < stats.getFilter().getStringLength()) { continue; } for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) { // TODO: locila? stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength())); } } } } /** * Extracts skipgram candidates. * * @return List of candidates represented as a list */ public static void generateSkipgramCandidates(List corpus, StatisticsNew stats) { ArrayList currentLoop; int ngram = stats.getFilter().getNgramValue(); int skip = stats.getFilter().getSkipValue(); for (Sentence s : corpus) { List sentence = s.getWords(); for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram if (ngram == 2 && j < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); currentLoop.add(sentence.get(j)); validateAndCountSkipgramCandidate(currentLoop, stats); } else { for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram if (ngram == 3 && k < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); currentLoop.add(sentence.get(j)); currentLoop.add(sentence.get(k)); validateAndCountSkipgramCandidate(currentLoop, stats); } else { for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram if (ngram == 4 && k < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); currentLoop.add(sentence.get(j)); currentLoop.add(sentence.get(k)); currentLoop.add(sentence.get(l)); validateAndCountSkipgramCandidate(currentLoop, stats); } else { for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram if (ngram == 5 && k < sentence.size()) { currentLoop = new ArrayList<>(); currentLoop.add(sentence.get(i)); currentLoop.add(sentence.get(j)); currentLoop.add(sentence.get(k)); currentLoop.add(sentence.get(l)); currentLoop.add(sentence.get(m)); validateAndCountSkipgramCandidate(currentLoop, stats); } } } } } } } } } } } private static void validateAndCountSkipgramCandidate(ArrayList skipgramCandidate, StatisticsNew stats) { // count if no regex is set or if it is & candidate passes it if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) { stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor())); } } }