diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 90bc913..a5bec3f 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -542,16 +542,36 @@ public class XML_processing { sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong)); inWord = false; } -// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { -//// String punctuation = characters.getData(); -// String punctuation = ","; -// sentence.get(sentence.size()-1).setWord(sentence.get(sentence.size()-1).getWord() + punctuation); -// sentence.get(sentence.size()-1).setLemma(sentence.get(sentence.size()-1).getLemma() + punctuation); -// sentence.get(sentence.size()-1).setMsd(sentence.get(sentence.size()-1).getMsd() + punctuation); -// inPunctuation = false; -// } + if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { +// String punctuation = characters.getData(); + String punctuation = ","; + + sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation); + sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation); + sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation); + inPunctuation = false; + } break; +// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { +// String actualPunctuation = characters.getData(); +// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("...")) +// break; +// String punctuation = ","; +// int skip_number = 0; +// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){ +// skip_number = stats.getFilter().getSkipValue(); +// } +// for(int i = 1; i < 
skip_number + 2; i ++){ +// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) { +// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation); +// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation); +// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation); +// } +// } +// inPunctuation = false; +// } + case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index b2c069b..6f6f218 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -44,6 +44,8 @@ public class Ngrams { // generate proper MultipleHMKeys depending on filter data String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor()); + + // if last letter is ',' erase it key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; // String key = "aaaaaaaaaaaaaaaaaaaaaaa"; @@ -161,6 +163,33 @@ public class Ngrams { } } + /** + * Checks skipped words and if necessary adds punctuations. 
+ * + * @return the word at index i, with ',' appended to its word, lemma and msd when a skipped word between i and j ends with ','; otherwise the unmodified word at index i + */ + private static Word checkAndModifySkipgramPunctuation(List sentence, int i, int j, StatisticsNew stats){ + // if punctuation checkbox selected, the words at indexes i and j are not next to each other, and the word at i does not already end with ',' + if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){ + boolean middleWordsHavePunctuation = false; + for (int n = i + 1; n < j; n++){ + if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){ + middleWordsHavePunctuation = true; + break; + } + } + if (middleWordsHavePunctuation){ + + String punctuation = ","; + return new Word(sentence.get(i).getWord() + punctuation, + sentence.get(i).getLemma() + punctuation, + sentence.get(i).getMsd() + punctuation, + sentence.get(i).getTaxonomy()); + } + } + return sentence.get(i); + + } /** * Extracts skipgram candidates. @@ -179,7 +208,8 @@ public class Ngrams { for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram if (ngram == 2 && j < sentence.size()) { currentLoop = new ArrayList<>(); - currentLoop.add(sentence.get(i)); +// currentLoop.add(sentence.get(i)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats)); currentLoop.add(sentence.get(j)); validateAndCountSkipgramCandidate(currentLoop, stats); @@ -187,29 +217,29 @@ public class Ngrams { for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram if (ngram == 3 && k < sentence.size()) { currentLoop = new ArrayList<>(); - currentLoop.add(sentence.get(i)); - currentLoop.add(sentence.get(j)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats)); currentLoop.add(sentence.get(k)); validateAndCountSkipgramCandidate(currentLoop, stats); } else { for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram - if (ngram == 4 && k < sentence.size()) { + if (ngram == 4 && l < 
sentence.size()) { currentLoop = new ArrayList<>(); - currentLoop.add(sentence.get(i)); - currentLoop.add(sentence.get(j)); - currentLoop.add(sentence.get(k)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats)); currentLoop.add(sentence.get(l)); validateAndCountSkipgramCandidate(currentLoop, stats); } else { - for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram - if (ngram == 5 && k < sentence.size()) { + for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram + if (ngram == 5 && m < sentence.size()) { currentLoop = new ArrayList<>(); - currentLoop.add(sentence.get(i)); - currentLoop.add(sentence.get(j)); - currentLoop.add(sentence.get(k)); - currentLoop.add(sentence.get(l)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats)); + currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats)); currentLoop.add(sentence.get(m)); validateAndCountSkipgramCandidate(currentLoop, stats); @@ -228,7 +258,9 @@ public class Ngrams { private static void validateAndCountSkipgramCandidate(ArrayList skipgramCandidate, StatisticsNew stats) { // count if no regex is set or if it is & candidate passes it if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) { - stats.updateTaxonomyResults(new MultipleHMKeys(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), "", "", ""), + String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); + key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; + stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""), stats.getCorpus().getTaxonomy()); } }