|
|
@ -44,6 +44,8 @@ public class Ngrams {
|
|
|
|
|
|
|
|
|
|
|
|
// generate proper MultipleHMKeys depending on filter data
|
|
|
|
// generate proper MultipleHMKeys depending on filter data
|
|
|
|
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
|
|
|
|
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// if last letter is ',' erase it
|
|
|
|
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
|
|
|
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
|
|
|
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
|
|
|
|
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
|
|
|
|
|
|
|
|
|
|
|
@ -161,6 +163,33 @@ public class Ngrams {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
|
|
* Checks skipped words and if necessary adds punctuations.
|
|
|
|
|
|
|
|
*
|
|
|
|
|
|
|
|
* @return List of candidates represented as a list<candidates(String)>
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
private static Word checkAndModifySkipgramPunctuation(List<Word> sentence, int i, int j, StatisticsNew stats){
|
|
|
|
|
|
|
|
// if punctuation checkbox selected and there words at indexes i and j are not next to each other
|
|
|
|
|
|
|
|
if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){
|
|
|
|
|
|
|
|
boolean middleWordsHavePunctuation = false;
|
|
|
|
|
|
|
|
for (int n = i + 1; n < j; n++){
|
|
|
|
|
|
|
|
if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){
|
|
|
|
|
|
|
|
middleWordsHavePunctuation = true;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (middleWordsHavePunctuation){
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String punctuation = ",";
|
|
|
|
|
|
|
|
return new Word(sentence.get(i).getWord() + punctuation,
|
|
|
|
|
|
|
|
sentence.get(i).getLemma() + punctuation,
|
|
|
|
|
|
|
|
sentence.get(i).getMsd() + punctuation,
|
|
|
|
|
|
|
|
sentence.get(i).getTaxonomy());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return sentence.get(i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* Extracts skipgram candidates.
|
|
|
|
* Extracts skipgram candidates.
|
|
|
@ -179,7 +208,8 @@ public class Ngrams {
|
|
|
|
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
|
|
|
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
|
|
|
if (ngram == 2 && j < sentence.size()) {
|
|
|
|
if (ngram == 2 && j < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
// currentLoop.add(sentence.get(i));
|
|
|
|
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
@ -187,29 +217,29 @@ public class Ngrams {
|
|
|
|
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
|
|
|
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
|
|
|
if (ngram == 3 && k < sentence.size()) {
|
|
|
|
if (ngram == 3 && k < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
|
|
|
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
|
|
|
if (ngram == 4 && k < sentence.size()) {
|
|
|
|
if (ngram == 4 && l < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram
|
|
|
|
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
|
|
|
|
if (ngram == 5 && k < sentence.size()) {
|
|
|
|
if (ngram == 5 && m < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
|
|
|
|
currentLoop.add(sentence.get(m));
|
|
|
|
currentLoop.add(sentence.get(m));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats);
|
|
|
@ -228,7 +258,9 @@ public class Ngrams {
|
|
|
|
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
|
|
|
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
|
|
|
// count if no regex is set or if it is & candidate passes it
|
|
|
|
// count if no regex is set or if it is & candidate passes it
|
|
|
|
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
|
|
|
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
|
|
|
stats.updateTaxonomyResults(new MultipleHMKeys(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), "", "", ""),
|
|
|
|
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
|
|
|
|
|
|
|
|
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
|
|
|
|
|
|
|
stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""),
|
|
|
|
stats.getCorpus().getTaxonomy());
|
|
|
|
stats.getCorpus().getTaxonomy());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|