You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
321 lines
11 KiB
321 lines
11 KiB
package alg.ngram;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Set;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
|
|
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
|
|
import data.*;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.logging.log4j.LogManager;
|
|
import org.apache.logging.log4j.Logger;
|
|
|
|
import gui.ValidationUtil;
|
|
|
|
public class Ngrams {
|
|
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
|
|
|
|
|
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
|
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
|
|
generateNgramLetterCandidates(corpus, stats);
|
|
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
|
|
generateSkipgramCandidates(corpus, stats);
|
|
} else {
|
|
generateNgramCandidates(corpus, stats);
|
|
}
|
|
}
|
|
|
|
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
|
// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
|
|
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
|
|
|
|
for (Sentence s : corpus) {
|
|
// skip sentences shorter than specified ngram length
|
|
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
|
continue;
|
|
}
|
|
|
|
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
|
|
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
|
|
|
// if msd regex is set and this candidate doesn't pass it, skip this iteration
|
|
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
|
|
continue;
|
|
}
|
|
|
|
// generate proper MultipleHMKeys depending on filter data
|
|
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
|
|
|
|
// if last letter is ',' erase it
|
|
|
|
// if (key.equals("")){
|
|
// String test = key;
|
|
// }
|
|
|
|
if (stats.getFilter().getNotePunctuations())
|
|
key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
|
|
|
MultipleHMKeys multipleKeys;
|
|
|
|
// create MultipleHMKeys for different amount of other keys
|
|
switch (otherKeys.size()) {
|
|
case 0:
|
|
multipleKeys = new MultipleHMKeys1(key);
|
|
break;
|
|
case 1:
|
|
multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
|
|
break;
|
|
case 2:
|
|
multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
|
wordToString(ngramCandidate, otherKeys.get(1)));
|
|
break;
|
|
case 3:
|
|
multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
|
wordToString(ngramCandidate, otherKeys.get(1)),
|
|
wordToString(ngramCandidate, otherKeys.get(2)));
|
|
break;
|
|
case 4:
|
|
multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
|
wordToString(ngramCandidate, otherKeys.get(1)),
|
|
wordToString(ngramCandidate, otherKeys.get(2)),
|
|
wordToString(ngramCandidate, otherKeys.get(3)));
|
|
break;
|
|
default:
|
|
multipleKeys = null;
|
|
}
|
|
|
|
|
|
// String lemma = "";
|
|
// String wordType = "";
|
|
// String msd = "";
|
|
// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
|
|
// if(otherKey.toString().equals("lema")){
|
|
// lemma = wordToString(ngramCandidate, otherKey);
|
|
// } else if(otherKey.toString().equals("besedna vrsta")){
|
|
// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
|
|
// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
|
|
// msd = wordToString(ngramCandidate, otherKey);
|
|
// }
|
|
// }
|
|
//
|
|
// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
|
|
|
|
|
|
|
|
// UPDATE TAXONOMY HERE!!!
|
|
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
|
|
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks whether an ngram candidate passes specified regex filter.
|
|
*/
|
|
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
|
|
if (ngramCandidate.size() != regex.size()) {
|
|
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
|
return false;
|
|
}
|
|
|
|
for (int i = 0; i < regex.size(); i++) {
|
|
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
|
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
|
|
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
|
|
|
|
switch (calculateFor) {
|
|
case LEMMA:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(Word::getLemma)
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case WORD:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(Word::getWord)
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case MORPHOSYNTACTIC_SPECS:
|
|
case MORPHOSYNTACTIC_PROPERTY:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(Word::getMsd)
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
case WORD_TYPE:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(w -> Character.toString(w.getMsd().charAt(0)))
|
|
.collect(Collectors.toList()));
|
|
// candidate.addAll(ngramCandidate
|
|
// .stream()
|
|
// .map(w -> Character.toString(w.getMsd().charAt(0)))
|
|
// .collect(Collectors.toList()));
|
|
// .substring(0, 1)
|
|
return StringUtils.join(candidate, " ");
|
|
case NORMALIZED_WORD:
|
|
candidate.addAll(ngramCandidate
|
|
.stream()
|
|
.map(Word::getNormalizedWord)
|
|
.collect(Collectors.toList()));
|
|
return StringUtils.join(candidate, " ");
|
|
}
|
|
|
|
return StringUtils.join(candidate, " ");
|
|
}
|
|
|
|
/**
|
|
* Generates candidates and updates results
|
|
*
|
|
* @param corpus
|
|
* @param stats
|
|
*/
|
|
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
|
for (Sentence s : corpus) {
|
|
for (Word w : s.getWords()) {
|
|
List<String> taxonomy = s.getTaxonomy();
|
|
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
|
|
|
|
// skip this iteration if:
|
|
// - word doesn't contain a proper version (missing lemma for example)
|
|
// - msd regex is given but this word's msd doesn't match it, skip this iteration
|
|
// - given substring length is larger than the word length
|
|
if (ValidationUtil.isEmpty(word)
|
|
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|
|
|| word.length() < stats.getFilter().getStringLength()) {
|
|
continue;
|
|
}
|
|
|
|
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
|
// TODO: locila?
|
|
|
|
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
|
|
stats.updateTaxonomyResults(multipleKeys, taxonomy);
|
|
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
|
|
|
|
|
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
	/**
	 * Checks the words skipped between positions {@code i} and {@code j} and, when
	 * the punctuation option is enabled and at least one skipped word ends with a
	 * comma, returns a new {@link Word} equal to the word at {@code i} with a comma
	 * appended to its word, lemma and msd representations. Otherwise the word at
	 * {@code i} is returned unchanged.
	 *
	 * @return the (possibly comma-suffixed) word at index {@code i}
	 */
	private static Word checkAndModifySkipgramPunctuation(List<Word> sentence, int i, int j, StatisticsNew stats){
		// only act if the punctuation checkbox is selected, the words at i and j are
		// not adjacent, and the word at i doesn't already end with a comma
		// NOTE(review): assumes non-empty word strings — charAt(length - 1) would throw
		// on an empty string; confirm upstream guarantees
		if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){
			boolean middleWordsHavePunctuation = false;
			// scan the skipped (middle) words for a trailing comma
			for (int n = i + 1; n < j; n++){
				if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){
					middleWordsHavePunctuation = true;
					break;
				}
			}
			if (middleWordsHavePunctuation){

				String punctuation = ",";
				// mark the elided punctuation on all three representations of word i
				return new Word(sentence.get(i).getWord() + punctuation,
						sentence.get(i).getLemma() + punctuation,
						sentence.get(i).getMsd() + punctuation);
			}
		}
		return sentence.get(i);

	}
|
|
|
|
	/**
	 * Extracts skipgram candidates of the configured n-gram length (2 to 5),
	 * allowing up to {@code skip} intervening words between consecutive members,
	 * and counts every candidate via
	 * {@link #validateAndCountSkipgramCandidate(ArrayList, StatisticsNew)}.
	 *
	 * <p>The nesting depth of the position loops encodes the supported n-gram
	 * lengths: the level whose {@code ngram == N} guard matches emits the
	 * candidate; deeper levels are only entered for longer n-grams.
	 *
	 * @param corpus sentences to scan
	 * @param stats accumulator carrying the filter (ngram length, skip value) and results
	 */
	public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
		ArrayList<Word> currentLoop;
		int ngram = stats.getFilter().getNgramValue();
		int skip = stats.getFilter().getSkipValue();

		for (Sentence s : corpus) {
			List<Word> sentence = s.getWords();

			// defensive: a sentence without words contributes nothing
			if (sentence == null){
				continue;
			}

			for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
				for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
					if (ngram == 2 && j < sentence.size()) {
						currentLoop = new ArrayList<>();
						// currentLoop.add(sentence.get(i));
						// word i, with a comma appended if punctuation was skipped between i and j
						currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
						currentLoop.add(sentence.get(j));

						validateAndCountSkipgramCandidate(currentLoop, stats);
					} else {
						for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
							if (ngram == 3 && k < sentence.size()) {
								currentLoop = new ArrayList<>();
								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
								currentLoop.add(sentence.get(k));

								validateAndCountSkipgramCandidate(currentLoop, stats);
							} else {
								for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
									if (ngram == 4 && l < sentence.size()) {
										currentLoop = new ArrayList<>();
										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
										currentLoop.add(sentence.get(l));

										validateAndCountSkipgramCandidate(currentLoop, stats);
									} else {
										for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
											if (ngram == 5 && m < sentence.size()) {
												currentLoop = new ArrayList<>();
												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
												currentLoop.add(sentence.get(m));

												validateAndCountSkipgramCandidate(currentLoop, stats);
											}
										}
									}
								}
							}
						}
					}
				}
			}
		}
	}
|
|
|
|
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
|
// count if no regex is set or if it is & candidate passes it
|
|
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
|
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
|
|
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
|
stats.updateTaxonomyResults(new MultipleHMKeys1(key),
|
|
stats.getCorpus().getTaxonomy());
|
|
}
|
|
}
|
|
}
|