|
|
@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;
|
|
|
|
|
|
|
|
|
|
|
|
import gui.ValidationUtil;
|
|
|
|
import gui.ValidationUtil;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import static alg.XML_processing.createWord;
|
|
|
|
|
|
|
|
|
|
|
|
public class Ngrams {
|
|
|
|
public class Ngrams {
|
|
|
|
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
|
|
|
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
|
|
|
|
|
|
|
|
|
|
@ -138,16 +140,22 @@ public class Ngrams {
|
|
|
|
* Checks whether an ngram candidate passes specified regex filter.
|
|
|
|
* Checks whether an ngram candidate passes specified regex filter.
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
|
|
|
|
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
|
|
|
|
if (ngramCandidate.size() != regex.size()) {
|
|
|
|
// if (ngramCandidate.size() != regex.size()) {
|
|
|
|
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
|
|
|
// logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
|
|
|
return false;
|
|
|
|
// return false;
|
|
|
|
}
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < regex.size(); i++) {
|
|
|
|
int j = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < ngramCandidate.size(); i++) {
|
|
|
|
|
|
|
|
String msd = ngramCandidate.get(i).getMsd(wordParts);
|
|
|
|
|
|
|
|
if (msd.equals("*")){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
|
|
|
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
|
|
|
if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
|
|
|
|
if (!msd.matches(regex.get(j).pattern() + ".*")) {
|
|
|
|
return false;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
j ++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
return true;
|
|
|
@ -270,6 +278,7 @@ public class Ngrams {
|
|
|
|
ArrayList<Word> currentLoop;
|
|
|
|
ArrayList<Word> currentLoop;
|
|
|
|
int ngram = stats.getFilter().getNgramValue();
|
|
|
|
int ngram = stats.getFilter().getNgramValue();
|
|
|
|
int skip = stats.getFilter().getSkipValue();
|
|
|
|
int skip = stats.getFilter().getSkipValue();
|
|
|
|
|
|
|
|
Word w = createWord("*", "*", "*", "*", stats.getFilter());
|
|
|
|
|
|
|
|
|
|
|
|
for (Sentence s : corpus) {
|
|
|
|
for (Sentence s : corpus) {
|
|
|
|
List<Word> sentence = s.getWords();
|
|
|
|
List<Word> sentence = s.getWords();
|
|
|
@ -283,7 +292,8 @@ public class Ngrams {
|
|
|
|
if (ngram == 2 && j < sentence.size()) {
|
|
|
|
if (ngram == 2 && j < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
// currentLoop.add(sentence.get(i));
|
|
|
|
// currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, i, j, w);
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
@ -291,8 +301,10 @@ public class Ngrams {
|
|
|
|
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
|
|
|
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
|
|
|
if (ngram == 3 && k < sentence.size()) {
|
|
|
|
if (ngram == 3 && k < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
fillSkipgrams(currentLoop, i, j, w);
|
|
|
|
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, j, k, w);
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
@ -300,9 +312,12 @@ public class Ngrams {
|
|
|
|
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
|
|
|
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
|
|
|
if (ngram == 4 && l < sentence.size()) {
|
|
|
|
if (ngram == 4 && l < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
fillSkipgrams(currentLoop, i, j, w);
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, j, k, w);
|
|
|
|
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, k, l, w);
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
@ -310,10 +325,14 @@ public class Ngrams {
|
|
|
|
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
|
|
|
|
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
|
|
|
|
if (ngram == 5 && m < sentence.size()) {
|
|
|
|
if (ngram == 5 && m < sentence.size()) {
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop = new ArrayList<>();
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
|
|
|
|
currentLoop.add(sentence.get(i));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
|
|
|
|
fillSkipgrams(currentLoop, i, j, w);
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
|
|
|
|
currentLoop.add(sentence.get(j));
|
|
|
|
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
|
|
|
|
fillSkipgrams(currentLoop, j, k, w);
|
|
|
|
|
|
|
|
currentLoop.add(sentence.get(k));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, k, l, w);
|
|
|
|
|
|
|
|
currentLoop.add(sentence.get(l));
|
|
|
|
|
|
|
|
fillSkipgrams(currentLoop, l, m, w);
|
|
|
|
currentLoop.add(sentence.get(m));
|
|
|
|
currentLoop.add(sentence.get(m));
|
|
|
|
|
|
|
|
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
|
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
|
|
|
@ -329,6 +348,12 @@ public class Ngrams {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
|
|
|
|
|
|
|
|
for(int k = i + 1; k < j; k++){
|
|
|
|
|
|
|
|
currentLoop.add(w);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
|
|
|
|
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
|
|
|
|
// count if no regex is set or if it is & candidate passes it
|
|
|
|
// count if no regex is set or if it is & candidate passes it
|
|
|
|
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
|
|
|
|
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
|
|
|
|