Added some optimizations and new taxonomy names

This commit is contained in:
2018-08-31 07:57:58 +02:00
parent 1c00f1a283
commit 426a9ccc46
21 changed files with 1345 additions and 1182 deletions

View File

@@ -262,7 +262,7 @@ public class XML_processing {
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
stavek.add(new Word(c3Content, c3Content, "/"));
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
@@ -297,7 +297,7 @@ public class XML_processing {
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
in_word = false;
}
break;
@@ -537,12 +537,12 @@ public class XML_processing {
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd));
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(new Word(punctuation, punctuation, "/"));
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
// String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
String word = "";
Characters characters = event.asCharacters();
sentence.add(new Word(characters.getData(), "", ""));
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
// if algorithm is in normalized part find orthodox word and add other info to it
} else {
Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma);
currentWord.setMsd(msd);
currentWord.setNormalizedWord(characters.getData());
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
currentWord.setMsd(msd, stats.getFilter().getWordParts());
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
wordIndex += 1;
// when a word is separated from one to many we have to create these duplicates
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
"", "", "", stats.getFilter()));
}
} //else {
// System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
}
}
@@ -912,4 +913,38 @@ public class XML_processing {
return atts;
}
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);
if (f.getWordParts().contains(CalculateFor.LEMMA))
wString.add(lemma);
if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
wString.add(msd);
if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
wString.add(normalizedWord);
// find appropriate strings and put them in word
Word w;
switch (f.getWordParts().size()) {
case 1:
w = new Word1(wString.get(0));
break;
case 2:
w = new Word2(wString.get(0), wString.get(1));
break;
case 3:
w = new Word3(wString.get(0), wString.get(1), wString.get(2));
break;
case 4:
w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
break;
default:
w = null;
}
return w;
}
}

View File

@@ -1,67 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = -1260951004477299634L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private Statistics stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, Statistics stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
if (stats.isTaxonomySet()) {
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
} else {
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
}
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue)-> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}
//package alg.inflectedJOS;
//
//import java.util.List;
//import java.util.concurrent.RecursiveAction;
//
//import data.Sentence;
//import data.Statistics;
//
//public class ForkJoin extends RecursiveAction {
// private static final long serialVersionUID = -1260951004477299634L;
//
// private static final int ACCEPTABLE_SIZE = 1000;
// private List<Sentence> corpus;
// private Statistics stats;
// private int start;
// private int end;
//
//
// /**
// * Constructor for subproblems.
// */
// private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
// this.corpus = corpus;
// this.start = start;
// this.end = end;
// this.stats = stats;
// }
//
// /**
// * Default constructor for the initial problem
// */
// public ForkJoin(List<Sentence> corpus, Statistics stats) {
// this.corpus = corpus;
// this.start = 0;
// this.end = corpus.size();
// this.stats = stats;
// }
//
// private void computeDirectly() {
// List<Sentence> subCorpus = corpus.subList(start, end);
//
// if (stats.isTaxonomySet()) {
// InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
// } else {
// InflectedJOSCount.calculateForAll(subCorpus, stats, null);
// }
// }
//
// @Override
// protected void compute() {
// int subCorpusSize = end - start;
//
// if (subCorpusSize < ACCEPTABLE_SIZE) {
// computeDirectly();
// } else {
// int mid = start + subCorpusSize / 2;
// ForkJoin left = new ForkJoin(corpus, start, mid, stats);
// ForkJoin right = new ForkJoin(corpus, mid, end, stats);
//
// // fork (push to queue)-> compute -> join
// left.fork();
// right.fork();
// left.join();
// right.join();
// }
// }
//}

View File

@@ -1,170 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
// static {
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
// indices = new HashMap<>();
// for (int i = 5; i <= 8; i++) {
// indices.put(i, calculateCombinations(i));
// }
// }
//
// private static List<Integer> calculateCombinations(int i) {
// int arr[] = {1, 2, 3, 4, 5};
// int r = 3;
// int n = arr.length;
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
//
// return printCombination(arr, n, r);
// }
//
// /* arr[] ---> Input Array
// data[] ---> Temporary array to store current combination
// start & end ---> Staring and Ending indexes in arr[]
// index ---> Current index in data[]
// r ---> Size of a combination to be printed */
// static void combinationUtil(int arr[], int data[], int start,
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
// // Current combination is ready to be printed, print it
// ArrayList<Integer> tmpResult = new ArrayList<>();
//
// if (index == r) {
// ArrayList<Integer> tmpResult = new ArrayList<>();
// for (int j = 0; j < r; j++)
// System.out.print(data[j] + " ");
// System.out.println("");
// return;
// }
//
// // replace index with all possible elements. The condition
// // "end-i+1 >= r-index" makes sure that including one element
// // at index will make a combination with remaining elements
// // at remaining positions
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
// data[index] = arr[i];
// combinationUtil(arr, data, i + 1, end, index + 1, r);
// }
// }
//
// // The main function that prints all combinations of size r
// // in arr[] of size n. This function mainly uses combinationUtil()
// static void printCombination(int arr[], int n, int r) {
// // A temporary array to store all combination one by one
// int data[] = new int[r];
//
// // Print all combination using temprary array 'data[]'
// combinationUtil(arr, data, 0, n - 1, 0, r);
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
// for (Sentence s : corpus) {
// // disregard if wrong taxonomy
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
// continue;
// }
//
// calculateCommon(s, stats.result);
//
// for (Word word : s.getWords()) {
// // skip if current word is not inflected
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// for (Word word : s.getWords()) {
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
for (Sentence s : corpus) {
// disregard if wrong taxonomy
// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
// continue;
//package alg.inflectedJOS;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//
//import org.apache.commons.lang3.StringUtils;
//
//import alg.Common;
//import data.Sentence;
//import data.Statistics;
//import data.StatisticsNew;
//import data.Word;
//
//public class InflectedJOSCount {
//
// public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
//
// // static {
// // // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
// // indices = new HashMap<>();
// // for (int i = 5; i <= 8; i++) {
// // indices.put(i, calculateCombinations(i));
// // }
// // }
// //
// // private static List<Integer> calculateCombinations(int i) {
// // int arr[] = {1, 2, 3, 4, 5};
// // int r = 3;
// // int n = arr.length;
// // ArrayList<ArrayList<Integer>> result = new ArrayList<>();
// //
// // return printCombination(arr, n, r);
// // }
// //
// // /* arr[] ---> Input Array
// // data[] ---> Temporary array to store current combination
// // start & end ---> Staring and Ending indexes in arr[]
// // index ---> Current index in data[]
// // r ---> Size of a combination to be printed */
// // static void combinationUtil(int arr[], int data[], int start,
// // int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
// // // Current combination is ready to be printed, print it
// // ArrayList<Integer> tmpResult = new ArrayList<>();
// //
// // if (index == r) {
// // ArrayList<Integer> tmpResult = new ArrayList<>();
// // for (int j = 0; j < r; j++)
// // System.out.print(data[j] + " ");
// // System.out.println("");
// // return;
// // }
// //
// // // replace index with all possible elements. The condition
// // // "end-i+1 >= r-index" makes sure that including one element
// // // at index will make a combination with remaining elements
// // // at remaining positions
// // for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
// // data[index] = arr[i];
// // combinationUtil(arr, data, i + 1, end, index + 1, r);
// // }
// // }
// //
// // // The main function that prints all combinations of size r
// // // in arr[] of size n. This function mainly uses combinationUtil()
// // static void printCombination(int arr[], int n, int r) {
// // // A temporary array to store all combination one by one
// // int data[] = new int[r];
// //
// // // Print all combination using temprary array 'data[]'
// // combinationUtil(arr, data, 0, n - 1, 0, r);
// // }
//
// // public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
// // for (Sentence s : corpus) {
// // // disregard if wrong taxonomy
// // if (!(s.getTaxonomy().startsWith(taxonomy))) {
// // continue;
// // }
// //
// // calculateCommon(s, stats.result);
// //
// // for (Word word : s.getWords()) {
// // // skip if current word is not inflected
// // if (!(word.getMsd().length() > 0)) {
// // continue;
// // }
// //
// // String msd = word.getMsd();
// //
// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
// //
// // for (int i = 1; i < msd.length(); i++) {
// // entry.setCharAt(i, msd.charAt(i));
// // Common.updateMap(stats.result, entry.toString());
// // entry.setCharAt(i, '-');
// // }
// // }
// // }
// // }
//
// // public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
// // for (Sentence s : corpus) {
// // for (Word word : s.getWords()) {
// // if (!(word.getMsd().length() > 0)) {
// // continue;
// // }
// //
// // String msd = word.getMsd();
// //
// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
// //
// // for (int i = 1; i < msd.length(); i++) {
// // entry.setCharAt(i, msd.charAt(i));
// // Common.updateMap(stats.result, entry.toString());
// // entry.setCharAt(i, '-');
// // }
// // }
// // }
// // }
//
// static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
// for (Sentence s : corpus) {
// // disregard if wrong taxonomy
//// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
//// continue;
//// }
//
// for (Word word : s.getWords()) {
// // skip if current word is not inflected
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
for (Word word : s.getWords()) {
// skip if current word is not inflected
if (!(word.getMsd().length() > 0)) {
continue;
}
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
Common.updateMap(stats.result, entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
// skip if current word is not inflected
// // TODO: if has defined msd and is of correct type (create a set)
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
stats.updateResults(entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
}
// }
// }
//
// public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
// for (Sentence s : corpus) {
//
// for (Word word : s.getWords()) {
// // skip if current word is not inflected
// // // TODO: if has defined msd and is of correct type (create a set)
// // if (!(word.getMsd().length() > 0)) {
// // continue;
// // }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// stats.updateResults(entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
//}

View File

@@ -43,12 +43,12 @@ public class Ngrams {
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
// if msd regex is set and this candidate doesn't pass it, skip this iteration
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
continue;
}
// generate proper MultipleHMKeys depending on filter data
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
// if last letter is ',' erase it
@@ -67,14 +67,14 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -82,9 +82,9 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -93,10 +93,10 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -137,7 +137,7 @@ public class Ngrams {
/**
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
@@ -145,7 +145,7 @@ public class Ngrams {
for (int i = 0; i < regex.size(); i++) {
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
return false;
}
}
@@ -153,33 +153,33 @@ public class Ngrams {
return true;
}
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
switch (calculateFor) {
case LEMMA:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getLemma)
.map(w -> w.getLemma(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.map(w -> w.getWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.map(w -> w.getMsd(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
.collect(Collectors.toList()));
// candidate.addAll(ngramCandidate
// .stream()
@@ -190,7 +190,7 @@ public class Ngrams {
case NORMALIZED_WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getNormalizedWord)
.map(w -> w.getNormalizedWord(wordParts))
.collect(Collectors.toList()));
return StringUtils.join(candidate, " ");
}
@@ -208,14 +208,14 @@ public class Ngrams {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
List<String> taxonomy = s.getTaxonomy();
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
// skip this iteration if:
// - word doesn't contain a proper version (missing lemma for example)
// - msd regex is given but this word's msd doesn't match it, skip this iteration
// - given substring length is larger than the word length
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
@@ -331,7 +331,7 @@ public class Ngrams {
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
// String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
// key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// stats.updateTaxonomyResults(new MultipleHMKeys1(key),
@@ -340,7 +340,7 @@ public class Ngrams {
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
// if last letter is ',' erase it
@@ -359,14 +359,14 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -374,9 +374,9 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -385,10 +385,10 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;

View File

@@ -10,84 +10,84 @@ import data.Sentence;
import data.Statistics;
import data.Word;
class WordCount {
private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
if (word.length() > stats.getSubstringLength()) {
for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
String substring = word.substring(i, i + stats.getSubstringLength());
Common.updateMap(stats.result, substring);
}
}
}
}
}
private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
for (Word word : s.getWords()) {
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
//class WordCount {
// private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// List<String> sentence = new ArrayList<>(s.getWords().size());
//
// if (stats.getCf() == CalculateFor.LEMMA) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getLemma)
// .collect(Collectors.toList()));
// } else if (stats.getCf() == CalculateFor.WORD) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getWord)
// .collect(Collectors.toList()));
// }
//
// for (String word : sentence) {
// Common.updateMap(stats.result, word);
// }
// }
// }
//
// private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// List<String> sentence = new ArrayList<>(s.getWords().size());
//
// if (stats.getCf() == CalculateFor.LEMMA) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getCVVLemma)
// .collect(Collectors.toList()));
// } else if (stats.getCf() == CalculateFor.WORD) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getCVVWord)
// .collect(Collectors.toList()));
// }
//
// for (String word : sentence) {
// if (word.length() > stats.getSubstringLength()) {
// for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
// String substring = word.substring(i, i + stats.getSubstringLength());
// Common.updateMap(stats.result, substring);
// }
// }
// }
// }
// }
//
// private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// List<String> sentence = new ArrayList<>(s.getWords().size());
// List<Word> filteredWords = new ArrayList<>();
//
// for (Word word : s.getWords()) {
// if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
// filteredWords.add(word);
// }
// }
//
// if (stats.getCf() == CalculateFor.LEMMA) {
// sentence.addAll(filteredWords
// .stream()
// .map(Word::getLemma)
// .collect(Collectors.toList()));
// } else if (stats.getCf() == CalculateFor.WORD) {
// sentence.addAll(filteredWords
// .stream()
// .map(Word::getWord)
// .collect(Collectors.toList()));
// }
//
// for (String word : sentence) {
// Common.updateMap(stats.result, word);
// }
// }
// }
// private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
@@ -164,4 +164,4 @@ class WordCount {
// }
// }
// }
}
//}

View File

@@ -34,8 +34,8 @@ public class WordLevel {
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
calculateForSuffixes(word.getWord(), stats);
calculateForPrefixes(word.getWord(), stats);
calculateForSuffixes(word.getWord(stats.getFilter().getWordParts()), stats);
calculateForPrefixes(word.getWord(stats.getFilter().getWordParts()), stats);
}
}
}