You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

208 lines
6.7 KiB

package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.CalculateFor;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
import gui.ValidationUtil;
public class Ngrams {
/** Class-wide log4j logger; declared public, so it is reachable from outside this class as well. */
public static final Logger logger = LogManager.getLogger(Ngrams.class);
/**
 * Picks the extraction strategy from the filter settings held by {@code stats} and runs it
 * over the corpus:
 * <ul>
 *   <li>n-gram value equal to 0: letter n-grams;</li>
 *   <li>a non-empty skip value greater than 0: skip-grams;</li>
 *   <li>anything else: plain word n-grams.</li>
 * </ul>
 *
 * @param corpus sentences to process
 * @param stats  provides the filter settings and is handed on to the selected generator
 */
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
    if (stats.getFilter().getNgramValue() == 0) {
        // an n-gram value of 0 selects letter n-grams
        generateNgramLetterCandidates(corpus, stats);
        return;
    }

    boolean skipRequested = !ValidationUtil.isEmpty(stats.getFilter().getSkipValue())
            && stats.getFilter().getSkipValue() > 0;
    if (skipRequested) {
        generateSkipgramCandidates(corpus, stats);
        return;
    }

    generateNgramCandidates(corpus, stats);
}
/**
 * Slides a window of the configured n-gram length over every sentence of the corpus. Each
 * window that passes the optional MSD filter is turned into a key string and reported through
 * {@code stats.updateTaxonomyResults} and {@code stats.updateResults}.
 *
 * @param corpus sentences to process
 * @param stats  supplies the filter settings and receives the results
 */
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
    // These settings are read once up front; they are assumed not to change while the corpus
    // is being processed.
    final int ngramLength = stats.getFilter().getNgramValue();
    final CalculateFor calculateFor = stats.getFilter().getCalculateFor();
    final boolean msdFilterSet = stats.getFilter().hasMsd();

    for (Sentence s : corpus) {
        final int wordCount = s.getWords().size();

        // skip sentences shorter than the specified n-gram length
        if (wordCount < ngramLength) {
            continue;
        }

        final int windowCount = wordCount - ngramLength + 1;
        for (int i = 0; i < windowCount; i++) {
            List<Word> ngramCandidate = s.getSublist(i, i + ngramLength);

            // if an MSD regex is set and this candidate doesn't pass it, skip this window
            if (msdFilterSet && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
                continue;
            }

            // build the key once and reuse it for both updates
            String key = wordToString(ngramCandidate, calculateFor);
            stats.updateTaxonomyResults(key, ngramCandidate);
            stats.updateResults(key);
        }
    }
}
/**
 * Checks whether an n-gram candidate passes the MSD regex filter, position by position.
 * <p>
 * For every position the text of the pattern is extended with a trailing {@code ".*"} and the
 * word's MSD has to match that extended expression as a whole.
 *
 * @param ngramCandidate words of the candidate
 * @param regex          one pattern per candidate position
 * @return {@code true} if both lists have the same size and every word matches its pattern;
 *         {@code false} otherwise (a size mismatch is additionally logged as an error)
 */
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
    if (ngramCandidate.size() != regex.size()) {
        logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
        return false;
    }

    int position = 0;
    while (position < regex.size()) {
        String msd = ngramCandidate.get(position).getMsd();
        String expression = regex.get(position).pattern() + ".*";
        if (!msd.matches(expression)) {
            return false;
        }
        position++;
    }
    return true;
}
/**
 * Renders a candidate as a single string: one token per word, joined with single spaces.
 * <p>
 * The token taken from each word depends on {@code calculateFor}: the lemma, the word form,
 * the MSD, or (for {@code WORD_TYPE}) the first character of the MSD. For any other value no
 * tokens are collected and the joined result of an empty list is returned.
 *
 * @param ngramCandidate words of the candidate
 * @param calculateFor   selects which property of each word is used
 * @return the space-separated tokens
 */
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
    List<String> tokens = new ArrayList<>(ngramCandidate.size());

    // pick how a single word is rendered; stays null for values not handled below
    java.util.function.Function<Word, String> render = null;
    switch (calculateFor) {
        case LEMMA:
            render = Word::getLemma;
            break;
        case WORD:
            render = Word::getWord;
            break;
        case MORPHOSYNTACTIC_SPECS:
        case MORPHOSYNTACTIC_PROPERTY:
            render = Word::getMsd;
            break;
        case WORD_TYPE:
            render = word -> String.valueOf(word.getMsd().charAt(0));
            break;
        default:
            break;
    }

    if (render != null) {
        for (Word current : ngramCandidate) {
            tokens.add(render.apply(current));
        }
    }
    return StringUtils.join(tokens, " ");
}
/**
 * Generates letter n-gram candidates (fixed-length substrings of single words) and reports
 * each of them through {@code stats.updateResults}.
 * <p>
 * For every word the string returned by {@code Word.getForCf} is used. A word is skipped when
 * that string is empty, when an MSD filter is set and the word's MSD does not match the first
 * filter pattern, or when the string is shorter than the configured substring length.
 *
 * @param corpus sentences to process
 * @param stats  supplies the filter settings and receives the results
 */
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
    // These settings are read once up front; they are assumed not to change while the corpus
    // is being processed.
    final CalculateFor calculateFor = stats.getFilter().getCalculateFor();
    final boolean cvv = stats.getFilter().isCvv();
    final int stringLength = stats.getFilter().getStringLength();
    final boolean msdFilterSet = stats.getFilter().hasMsd();

    for (Sentence s : corpus) {
        for (Word w : s.getWords()) {
            String word = w.getForCf(calculateFor, cvv);

            // skip this word if:
            // - it doesn't contain a proper version (missing lemma for example)
            // - an MSD regex is given but this word's MSD doesn't match it
            // - the given substring length is larger than the word length
            //
            // The precompiled pattern is used directly instead of recompiling its text for
            // every word. The whole MSD must match (no ".*" is appended here, unlike in
            // passesRegex).
            if (ValidationUtil.isEmpty(word)
                    || msdFilterSet && !stats.getFilter().getMsd().get(0).matcher(w.getMsd()).matches()
                    || word.length() < stringLength) {
                continue;
            }

            final int lastStart = word.length() - stringLength;
            for (int i = 0; i <= lastStart; i++) {
                // TODO: punctuation?
                stats.updateResults(word.substring(i, i + stringLength));
            }
        }
    }
}
/**
 * Extracts skip-gram candidates and hands each one to
 * {@code validateAndCountSkipgramCandidate}.
 * <p>
 * A candidate is an ordered tuple of words from one sentence whose length equals the configured
 * n-gram value; between two consecutive words of the tuple at most {@code skip} words of the
 * sentence are left out. Candidates are produced in ascending order of their word positions.
 * <p>
 * Nothing is produced for n-gram values below 2. Values above 5 are supported as well (the
 * previous hard-coded nesting stopped at 5, and its 4- and 5-gram branches used the wrong
 * index for the bounds check and for the start of the innermost loop).
 *
 * @param corpus sentences to process
 * @param stats  supplies the n-gram and skip values and receives the results
 */
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
    final int ngram = stats.getFilter().getNgramValue();
    final int skip = stats.getFilter().getSkipValue();

    if (ngram < 2) {
        return;
    }

    for (Sentence s : corpus) {
        List<Word> sentence = s.getWords();
        for (int first = 0; first <= sentence.size() - ngram; first++) {
            ArrayList<Word> prefix = new ArrayList<>(ngram);
            prefix.add(sentence.get(first));
            extendSkipgram(sentence, first, prefix, ngram, skip, stats);
        }
    }
}

/**
 * Recursively appends one more word to {@code prefix} until it holds {@code ngram} words, then
 * reports a copy of it as a candidate.
 *
 * @param sentence  words of the current sentence
 * @param lastIndex position in {@code sentence} of the word added to {@code prefix} last
 * @param prefix    words chosen so far; restored to its incoming content before returning
 * @param ngram     target candidate length
 * @param skip      maximum number of words that may be left out between two chosen words
 * @param stats     passed on to {@code validateAndCountSkipgramCandidate}
 */
private static void extendSkipgram(List<Word> sentence, int lastIndex, ArrayList<Word> prefix,
                                   int ngram, int skip, StatisticsNew stats) {
    if (prefix.size() == ngram) {
        // hand over a fresh list so the candidate is independent of further backtracking
        validateAndCountSkipgramCandidate(new ArrayList<>(prefix), stats);
        return;
    }

    // words still needed after the one chosen in this step; leaving room for them only prunes
    // branches that could never be completed inside the sentence
    final int stillNeeded = ngram - prefix.size() - 1;
    final int maxNext = Math.min(lastIndex + 1 + skip, sentence.size() - 1 - stillNeeded);

    for (int next = lastIndex + 1; next <= maxNext; next++) {
        prefix.add(sentence.get(next));
        extendSkipgram(sentence, next, prefix, ngram, skip, stats);
        prefix.remove(prefix.size() - 1);
    }
}
/**
 * Counts a skip-gram candidate unless an MSD filter is set and the candidate fails it.
 *
 * @param skipgramCandidate words of the candidate
 * @param stats             supplies the filter settings and receives the result
 */
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
    boolean rejectedByMsd = stats.getFilter().hasMsd()
            && !passesRegex(skipgramCandidate, stats.getFilter().getMsd());
    if (rejectedByMsd) {
        return;
    }

    String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
    stats.updateResults(key);
}
}