package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import gui.ValidationUtil;
import static alg.XML_processing.createWord;
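/**
 * Extraction and counting of n-gram candidates (word-, letter- and skipgram-based) from a corpus.
 */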
public class Ngrams {
public final static Logger logger = LogManager.getLogger(Ngrams.class);
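/**
 * Dispatches the corpus to the appropriate candidate generator: letter n-grams
 * when ngramValue is 0, skipgrams when a positive skip value is set, and plain
 * word n-grams otherwise.
 */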
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
generateNgramLetterCandidates(corpus, stats);
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
generateSkipgramCandidates(corpus, stats);
} else {
generateNgramCandidates(corpus, stats);
}
}
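/**
 * Slides a window of ngramValue words over each sentence and counts every
 * candidate that passes the msd, prefix/suffix and length filters.
 */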
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
// additional keys (besides the primary CalculateFor) that go into MultipleHMKeys
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
for (Sentence s : corpus) {
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
continue;
}
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
// if msd regex is set and this candidate doesn't pass it, skip this iteration
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
continue;
}
// generate proper MultipleHMKeys depending on filter data
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
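// skip candidates that are too short to contain both the required prefix and suffix lengths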
if(stats.getFilter().getPrefixLength() != null && stats.getFilter().getSuffixLength() != null &&
key.length() < stats.getFilter().getPrefixLength() + stats.getFilter().getSuffixLength()){
continue;
}
if(stats.getFilter().getPrefixList() != null && stats.getFilter().getSuffixList() != null &&
(stats.getFilter().getPrefixList().size() > 0 || stats.getFilter().getSuffixList().size() > 0)){
String correctPrefix = "";
// go over all prefixes in PrefixList and look for one at the start of the key
for(String pf : stats.getFilter().getPrefixList()){
if (key.startsWith(pf)){
correctPrefix = pf;
break;
}
}
String correctSuffix = "";
// go over all suffixes in SuffixList and look for one at the end of the key
for(String sf : stats.getFilter().getSuffixList()){
if (key.endsWith(sf)){
correctSuffix = sf;
break;
}
}
// keep the candidate only if:
// - no prefixes were given and a matching suffix was found, or
// - a matching prefix was found and no suffixes were given, or
// - both matched and their combined length fits into the key
if(!((stats.getFilter().getPrefixList().size() == 0 && !correctSuffix.equals("")) ||
(!correctPrefix.equals("") && stats.getFilter().getSuffixList().size() == 0) ||
(!correctPrefix.equals("") && !correctSuffix.equals("") && correctPrefix.length() + correctSuffix.length() <= key.length()))){
continue;
}
}
// create the appropriate MultipleHMKeys for the number of additional keys
MultipleHMKeys multipleKeys = createMultipleKeys(key, ngramCandidate, otherKeys, stats.getFilter().getWordParts());
// update the taxonomy counts for this candidate
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
}
}
}
/**
 * Checks whether an ngram candidate passes the specified MSD regex filter.
 * Words without an MSD ("*") are skipped and do not consume a regex slot.
 */
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
int j = 0;
for (Word w : ngramCandidate) {
String msd = w.getMsd(wordParts);
if (msd.equals("*")){
continue;
}
// a size mismatch between candidate and regex list should not occur, but
// guard against it instead of failing with an IndexOutOfBoundsException
if (j >= regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch");
return false;
}
// the regex only has to match the beginning of the msd
if (!msd.matches(regex.get(j).pattern() + ".*")) {
return false;
}
j++;
}
return true;
}
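/**
 * Joins the chosen representation (lemma, word, msd, ...) of every word in the
 * candidate into a single space-separated key.
 */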
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
switch (calculateFor) {
case LEMMA:
candidate.addAll(ngramCandidate
.stream()
.map(w -> w.getLemma(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case LOWERCASE_WORD:
candidate.addAll(ngramCandidate
.stream()
.map(w -> w.getWord(wordParts).toLowerCase())
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(w -> w.getWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(w -> w.getMsd(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD_TYPE:
// fall back to '/' when the msd is empty (e.g. "anon" entries in GOS)
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd(wordParts).length() > 0 ? w.getMsd(wordParts).charAt(0) : '/'))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case NORMALIZED_WORD:
candidate.addAll(ngramCandidate
.stream()
.map(w -> w.getNormalizedWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
}
return StringUtils.join(candidate, " ");
}
/**
 * Generates letter n-gram candidates (fixed-length substrings of each word)
 * and updates the results.
 *
 * @param corpus sentences to process
 * @param stats  statistics holder carrying the active filter
 */
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
List<Taxonomy> taxonomy = s.getTaxonomy();
List<Word> ngramCandidate = new ArrayList<>();
ngramCandidate.add(w);
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
// skip this word if:
// - it has no usable representation (e.g. a missing lemma)
// - an msd regex is set and this word's msd doesn't match it
// - the requested substring length exceeds the word length
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
if(stats.getFilter().getCalculateFor().equals(CalculateFor.LOWERCASE_WORD)){
word = word.toLowerCase();
}
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
// TODO: punctuation?
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
stats.updateTaxonomyResults(multipleKeys, taxonomy);
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
}
}
}
}
/**
 * Checks skipped words and, if necessary, appends punctuation to the word at
 * index i. The punctuation-merging logic is currently disabled, so the word
 * is returned unchanged.
 */
private static Word checkAndModifySkipgramPunctuation(List<Word> sentence, int i, int j, StatisticsNew stats){
return sentence.get(i);
}
/**
 * Extracts skipgram candidates (n-grams allowing up to skipValue skipped words
 * between members) and counts the valid ones.
 */
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
ArrayList<Word> currentLoop;
int ngram = stats.getFilter().getNgramValue();
int skip = stats.getFilter().getSkipValue();
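// placeholder word used by fillSkipgrams to mark skipped positions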
Word w = createWord("*", "*", "*", "*", stats.getFilter());
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
if (sentence == null){
continue;
}
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
if (ngram == 2 && j < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
} else {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));
fillSkipgrams(currentLoop, l, m, w);
currentLoop.add(sentence.get(m));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
}
}
}
}
}
}
}
}
}
}
}
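/**
 * Fills every skipped position between indexes i and j with the placeholder word.
 */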
private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
for(int k = i + 1; k < j; k++){
currentLoop.add(w);
}
}
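/**
 * Validates a skipgram candidate against the msd regex filter and, if it
 * passes, builds its MultipleHMKeys and updates the taxonomy results.
 */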
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<Taxonomy> taxonomy) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
// create the appropriate MultipleHMKeys for the number of additional keys
MultipleHMKeys multipleKeys = createMultipleKeys(key, skipgramCandidate, otherKeys, stats.getFilter().getWordParts());
stats.updateTaxonomyResults(multipleKeys, taxonomy);
}
}
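/**
 * Builds the MultipleHMKeys variant matching the number of additional keys:
 * the primary key plus up to four values derived from the same candidate.
 * Returns null when more than four additional keys are requested (unsupported).
 */
private static MultipleHMKeys createMultipleKeys(String key, List<Word> candidate, ArrayList<CalculateFor> otherKeys, ArrayList<CalculateFor> wordParts) {
switch (otherKeys.size()) {
case 0:
return new MultipleHMKeys1(key);
case 1:
return new MultipleHMKeys2(key,
wordToString(candidate, otherKeys.get(0), wordParts));
case 2:
return new MultipleHMKeys3(key,
wordToString(candidate, otherKeys.get(0), wordParts),
wordToString(candidate, otherKeys.get(1), wordParts));
case 3:
return new MultipleHMKeys4(key,
wordToString(candidate, otherKeys.get(0), wordParts),
wordToString(candidate, otherKeys.get(1), wordParts),
wordToString(candidate, otherKeys.get(2), wordParts));
case 4:
return new MultipleHMKeys5(key,
wordToString(candidate, otherKeys.get(0), wordParts),
wordToString(candidate, otherKeys.get(1), wordParts),
wordToString(candidate, otherKeys.get(2), wordParts),
wordToString(candidate, otherKeys.get(3), wordParts));
default:
return null;
}
}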
}