Project copied
This commit is contained in:
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
@@ -0,0 +1,62 @@
|
||||
package alg.ngram;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = 5074814035083362355L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private StatisticsNew stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
Ngrams.calculateForAll(subCorpus, stats);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
204
src/main/java/alg/ngram/Ngrams.java
Normal file
204
src/main/java/alg/ngram/Ngrams.java
Normal file
@@ -0,0 +1,204 @@
|
||||
package alg.ngram;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import data.CalculateFor;
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Ngrams {
|
||||
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
||||
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
||||
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
|
||||
generateNgramLetterCandidates(corpus, stats);
|
||||
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
|
||||
generateSkipgramCandidates(corpus, stats);
|
||||
} else {
|
||||
generateNgramCandidates(corpus, stats);
|
||||
}
|
||||
}
|
||||
|
||||
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
// skip sentences shorter than specified ngram length
|
||||
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
|
||||
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
||||
|
||||
// if msd regex is set and this candidate doesn't pass it, skip this iteration
|
||||
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether an ngram candidate passes specified regex filter.
|
||||
*/
|
||||
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
|
||||
if (ngramCandidate.size() != regex.size()) {
|
||||
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < regex.size(); i++) {
|
||||
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
|
||||
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
|
||||
|
||||
switch (calculateFor) {
|
||||
case LEMMA:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case MORPHOSYNTACTIC_SPECS:
|
||||
case MORPHOSYNTACTIC_PROPERTY:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getMsd)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD_TYPE:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(w -> Character.toString(w.getMsd().charAt(0)))
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
}
|
||||
|
||||
return StringUtils.join(candidate, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates candidates and updates results
|
||||
*
|
||||
* @param corpus
|
||||
* @param stats
|
||||
*/
|
||||
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word w : s.getWords()) {
|
||||
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
|
||||
|
||||
// skip this iteration if:
|
||||
// - word doesn't contain a proper version (missing lemma for example)
|
||||
// - msd regex is given but this word's msd doesn't match it, skip this iteration
|
||||
// - given substring length is larger than the word length
|
||||
if (ValidationUtil.isEmpty(word)
|
||||
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|
||||
|| word.length() < stats.getFilter().getStringLength()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
||||
// TODO: locila?
|
||||
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extracts skipgram candidates.
|
||||
*
|
||||
* @return List of candidates represented as a list<candidates(String)>
|
||||
*/
|
||||
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ArrayList<Word> currentLoop;
|
||||
int ngram = stats.getFilter().getNgramValue();
|
||||
int skip = stats.getFilter().getSkipValue();
|
||||
|
||||
for (Sentence s : corpus) {
|
||||
List<Word> sentence = s.getWords();
|
||||
|
||||
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
|
||||
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
||||
if (ngram == 2 && j < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
||||
if (ngram == 3 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
||||
if (ngram == 4 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram
|
||||
if (ngram == 5 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
currentLoop.add(sentence.get(m));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
||||
// count if no regex is set or if it is & candidate passes it
|
||||
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
||||
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user