Added some optimizations and new taxonomy names

master
Luka 6 years ago
parent 1c00f1a283
commit 426a9ccc46

@@ -262,7 +262,7 @@ public class XML_processing {
 if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
         stavek.size() > 0){
-    stavek.add(new Word(c3Content, c3Content, "/"));
+    stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
 }
@@ -297,7 +297,7 @@ public class XML_processing {
 // "word" node value
 if (in_word) {
-    stavek.add(new Word(characters.getData(), lemma, msd));
+    stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
     in_word = false;
 }
 break;
@@ -537,12 +537,12 @@ public class XML_processing {
 // "word" node value
 if (inWord) {
     String word = characters.getData();
-    sentence.add(new Word(word, lemma, msd));
+    sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
     inWord = false;
 }
 if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
     String punctuation = characters.getData();
-    sentence.add(new Word(punctuation, punctuation, "/"));
+    sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
     inPunctuation = false;
     // String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
 // GOSCorpusHM.put(GOSCorpusHMKey, sentence);
 String word = "";
 Characters characters = event.asCharacters();
-sentence.add(new Word(characters.getData(), "", ""));
+sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
 // if algorithm is in normalized part find orthodox word and add other info to it
 } else {
 Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
 // System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
 if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
     Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
-    currentWord.setLemma(lemma);
-    currentWord.setMsd(msd);
-    currentWord.setNormalizedWord(characters.getData());
+    currentWord.setLemma(lemma, stats.getFilter().getWordParts());
+    currentWord.setMsd(msd, stats.getFilter().getWordParts());
+    currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
     wordIndex += 1;
     // when a word is separated from one to many we have to create these duplicates
     if (inSeparatedWord){
-        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
+        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
+                "", "", "", stats.getFilter()));
     }
 } //else {
 // System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {
 // if we're calculating values for letters, omit words that are shorter than string length
 if (filter.getNgramValue() == 0) {
-    sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
-            || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
+    sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
+            || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
 }
@@ -912,4 +913,38 @@ public class XML_processing {
     return atts;
 }

+private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
+    List<String> wString = new ArrayList<>();
+    if (f.getWordParts().contains(CalculateFor.WORD))
+        wString.add(word);
+    if (f.getWordParts().contains(CalculateFor.LEMMA))
+        wString.add(lemma);
+    if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
+        wString.add(msd);
+    if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
+        wString.add(normalizedWord);
+
+    // find appropriate strings and put them in word
+    Word w;
+    switch (f.getWordParts().size()) {
+        case 1:
+            w = new Word1(wString.get(0));
+            break;
+        case 2:
+            w = new Word2(wString.get(0), wString.get(1));
+            break;
+        case 3:
+            w = new Word3(wString.get(0), wString.get(1), wString.get(2));
+            break;
+        case 4:
+            w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
+            break;
+        default:
+            w = null;
+    }
+    return w;
+}
 }
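The calls above replace direct new Word(...) construction with the createWord factory added at the end of this file: it keeps only the string parts the active filter requests and picks the smallest backing class accordingly. A minimal usage sketch; Word2, CalculateFor and Filter are types from this commit, while the sample strings and call sequence are invented for illustration:

    Filter f = new Filter();               // word parts start out empty
    f.setCalculateFor(CalculateFor.WORD);  // word parts: [WORD]
    f.setHasMsd(true);                     // word parts: [WORD, MORPHOSYNTACTIC_SPECS]

    // only the two requested parts are stored, so the factory returns a Word2
    Word w = createWord("hiše", "hiša", "Sozem", "", f);
    w.getWord(f.getWordParts());   // "hiše"  (slot 1)
    w.getMsd(f.getWordParts());    // "Sozem" (slot 2)
    w.getLemma(f.getWordParts());  // null, because LEMMA was not requested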

@@ -1,67 +1,67 @@
+//package alg.inflectedJOS;
+//
+//import java.util.List;
+//import java.util.concurrent.RecursiveAction;
+//
+//import data.Sentence;
+//import data.Statistics;
+//
+//public class ForkJoin extends RecursiveAction {
+//    private static final long serialVersionUID = -1260951004477299634L;
+//
+//    private static final int ACCEPTABLE_SIZE = 1000;
+//    private List<Sentence> corpus;
+//    private Statistics stats;
+//    private int start;
+//    private int end;
+//
+//
+//    /**
+//     * Constructor for subproblems.
+//     */
+//    private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
+//        this.corpus = corpus;
+//        this.start = start;
+//        this.end = end;
+//        this.stats = stats;
+//    }
+//
+//    /**
+//     * Default constructor for the initial problem
+//     */
+//    public ForkJoin(List<Sentence> corpus, Statistics stats) {
+//        this.corpus = corpus;
+//        this.start = 0;
+//        this.end = corpus.size();
+//        this.stats = stats;
+//    }
+//
+//    private void computeDirectly() {
+//        List<Sentence> subCorpus = corpus.subList(start, end);
+//
+//        if (stats.isTaxonomySet()) {
+//            InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
+//        } else {
+//            InflectedJOSCount.calculateForAll(subCorpus, stats, null);
+//        }
+//    }
+//
+//    @Override
+//    protected void compute() {
+//        int subCorpusSize = end - start;
+//
+//        if (subCorpusSize < ACCEPTABLE_SIZE) {
+//            computeDirectly();
+//        } else {
+//            int mid = start + subCorpusSize / 2;
+//            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
+//            ForkJoin right = new ForkJoin(corpus, mid, end, stats);
+//
+//            // fork (push to queue)-> compute -> join
+//            left.fork();
+//            right.fork();
+//            left.join();
+//            right.join();
+//        }
+//    }
+//}
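The class disabled above is a standard divide-and-conquer RecursiveAction: it splits the corpus in half until a chunk is smaller than ACCEPTABLE_SIZE and only then counts directly. Invoking it looked roughly like this sketch (the pool setup is assumed, not shown in the commit):

    java.util.concurrent.ForkJoinPool pool = new java.util.concurrent.ForkJoinPool();
    pool.invoke(new ForkJoin(corpus, stats)); // blocks until all forked subtasks have joined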

@@ -1,170 +1,170 @@
+//package alg.inflectedJOS;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//
+//import org.apache.commons.lang3.StringUtils;
+//
+//import alg.Common;
+//import data.Sentence;
+//import data.Statistics;
+//import data.StatisticsNew;
+//import data.Word;
+//
+//public class InflectedJOSCount {
+//
+//    public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
+//
+//    // static {
+//    //     // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
+//    //     indices = new HashMap<>();
+//    //     for (int i = 5; i <= 8; i++) {
+//    //         indices.put(i, calculateCombinations(i));
+//    //     }
+//    // }
+//    //
+//    // private static List<Integer> calculateCombinations(int i) {
+//    //     int arr[] = {1, 2, 3, 4, 5};
+//    //     int r = 3;
+//    //     int n = arr.length;
+//    //     ArrayList<ArrayList<Integer>> result = new ArrayList<>();
+//    //
+//    //     return printCombination(arr, n, r);
+//    // }
+//    //
+//    // /* arr[] ---> Input Array
+//    // data[] ---> Temporary array to store current combination
+//    // start & end ---> Staring and Ending indexes in arr[]
+//    // index ---> Current index in data[]
+//    // r ---> Size of a combination to be printed */
+//    // static void combinationUtil(int arr[], int data[], int start,
+//    //         int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
+//    //     // Current combination is ready to be printed, print it
+//    //     ArrayList<Integer> tmpResult = new ArrayList<>();
+//    //
+//    //     if (index == r) {
+//    //         ArrayList<Integer> tmpResult = new ArrayList<>();
+//    //         for (int j = 0; j < r; j++)
+//    //             System.out.print(data[j] + " ");
+//    //         System.out.println("");
+//    //         return;
+//    //     }
+//    //
+//    //     // replace index with all possible elements. The condition
+//    //     // "end-i+1 >= r-index" makes sure that including one element
+//    //     // at index will make a combination with remaining elements
+//    //     // at remaining positions
+//    //     for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
+//    //         data[index] = arr[i];
+//    //         combinationUtil(arr, data, i + 1, end, index + 1, r);
+//    //     }
+//    // }
+//    //
+//    // // The main function that prints all combinations of size r
+//    // // in arr[] of size n. This function mainly uses combinationUtil()
+//    // static void printCombination(int arr[], int n, int r) {
+//    //     // A temporary array to store all combination one by one
+//    //     int data[] = new int[r];
+//    //
+//    //     // Print all combination using temprary array 'data[]'
+//    //     combinationUtil(arr, data, 0, n - 1, 0, r);
+//    // }
+//
+//    // public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//    //     for (Sentence s : corpus) {
+//    //         // disregard if wrong taxonomy
+//    //         if (!(s.getTaxonomy().startsWith(taxonomy))) {
+//    //             continue;
+//    //         }
+//    //
+//    //         calculateCommon(s, stats.result);
+//    //
+//    //         for (Word word : s.getWords()) {
+//    //             // skip if current word is not inflected
+//    //             if (!(word.getMsd().length() > 0)) {
+//    //                 continue;
+//    //             }
+//    //
+//    //             String msd = word.getMsd();
+//    //
+//    //             StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//    //
+//    //             for (int i = 1; i < msd.length(); i++) {
+//    //                 entry.setCharAt(i, msd.charAt(i));
+//    //                 Common.updateMap(stats.result, entry.toString());
+//    //                 entry.setCharAt(i, '-');
+//    //             }
+//    //         }
+//    //     }
+//    // }
+//
+//    // public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
+//    //     for (Sentence s : corpus) {
+//    //         for (Word word : s.getWords()) {
+//    //             if (!(word.getMsd().length() > 0)) {
+//    //                 continue;
+//    //             }
+//    //
+//    //             String msd = word.getMsd();
+//    //
+//    //             StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//    //
+//    //             for (int i = 1; i < msd.length(); i++) {
+//    //                 entry.setCharAt(i, msd.charAt(i));
+//    //                 Common.updateMap(stats.result, entry.toString());
+//    //                 entry.setCharAt(i, '-');
+//    //             }
+//    //         }
+//    //     }
+//    // }
+//
+//    static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//        for (Sentence s : corpus) {
+//            // disregard if wrong taxonomy
+////            if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
+////                continue;
+////            }
+//
+//            for (Word word : s.getWords()) {
+//                // skip if current word is not inflected
+//                if (!(word.getMsd().length() > 0)) {
+//                    continue;
+//                }
+//
+//                String msd = word.getMsd();
+//
+//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//                for (int i = 1; i < msd.length(); i++) {
+//                    entry.setCharAt(i, msd.charAt(i));
+//                    Common.updateMap(stats.result, entry.toString());
+//                    entry.setCharAt(i, '-');
+//                }
+//            }
+//        }
+//    }
+//
+//    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
+//        for (Sentence s : corpus) {
+//
+//            for (Word word : s.getWords()) {
+//                // skip if current word is not inflected
+//                // // TODO: if has defined msd and is of correct type (create a set)
+//                // if (!(word.getMsd().length() > 0)) {
+//                //     continue;
+//                // }
+//
+//                String msd = word.getMsd();
+//
+//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//                for (int i = 1; i < msd.length(); i++) {
+//                    entry.setCharAt(i, msd.charAt(i));
+//                    stats.updateResults(entry.toString());
+//                    entry.setCharAt(i, '-');
+//                }
+//            }
+//        }
+//    }
+//}
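Both calculateForAll overloads count partially masked MSD tags: the first character (the word class) is always kept, every other position is padded with '-', and each attribute position is then revealed one at a time so every class/attribute pair is tallied separately. A self-contained sketch of that loop, using StringUtils from Apache Commons Lang; "Sozem" is an invented sample tag:

    String msd = "Sozem";
    StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', msd.length() - 1));
    for (int i = 1; i < msd.length(); i++) {
        entry.setCharAt(i, msd.charAt(i));
        System.out.println(entry); // prints So---, S-z--, S--e-, S---m
        entry.setCharAt(i, '-');
    }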

@@ -43,12 +43,12 @@ public class Ngrams {
 List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
 // if msd regex is set and this candidate doesn't pass it, skip this iteration
-if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
+if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
     continue;
 }
 // generate proper MultipleHMKeys depending on filter data
-String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
+String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
 // if last letter is ',' erase it
@@ -67,14 +67,14 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys1(key);
     break;
 case 1:
-    String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
+    String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations())
     //     k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
     multipleKeys = new MultipleHMKeys2(key, k1_2);
     break;
 case 2:
-    String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
-    String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
+    String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
     //     k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -82,9 +82,9 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
     break;
 case 3:
-    String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
-    String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
-    String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
+    String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+    String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
     //     k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -93,10 +93,10 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
     break;
 case 4:
-    String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
-    String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
-    String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
-    String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
+    String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+    String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+    String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
     //     k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -137,7 +137,7 @@ public class Ngrams {
 /**
  * Checks whether an ngram candidate passes specified regex filter.
  */
-private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
+private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
     if (ngramCandidate.size() != regex.size()) {
         logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
         return false;
@@ -145,7 +145,7 @@ public class Ngrams {
     for (int i = 0; i < regex.size(); i++) {
         //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-        if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
+        if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
             return false;
         }
     }
@@ -153,33 +153,33 @@ public class Ngrams {
     return true;
 }

-private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
+private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
     ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
     switch (calculateFor) {
         case LEMMA:
             candidate.addAll(ngramCandidate
                     .stream()
-                    .map(Word::getLemma)
+                    .map(w -> w.getLemma(wordParts))
                     .collect(Collectors.toList()));
             return StringUtils.join(candidate, " ");
         case WORD:
             candidate.addAll(ngramCandidate
                     .stream()
-                    .map(Word::getWord)
+                    .map(w -> w.getWord(wordParts))
                     .collect(Collectors.toList()));
             return StringUtils.join(candidate, " ");
         case MORPHOSYNTACTIC_SPECS:
         case MORPHOSYNTACTIC_PROPERTY:
             candidate.addAll(ngramCandidate
                     .stream()
-                    .map(Word::getMsd)
+                    .map(w -> w.getMsd(wordParts))
                     .collect(Collectors.toList()));
             return StringUtils.join(candidate, " ");
         case WORD_TYPE:
             candidate.addAll(ngramCandidate
                     .stream()
-                    .map(w -> Character.toString(w.getMsd().charAt(0)))
+                    .map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
                     .collect(Collectors.toList()));
             // candidate.addAll(ngramCandidate
             //         .stream()
@@ -190,7 +190,7 @@ public class Ngrams {
         case NORMALIZED_WORD:
             candidate.addAll(ngramCandidate
                     .stream()
-                    .map(Word::getNormalizedWord)
+                    .map(w -> w.getNormalizedWord(wordParts))
                     .collect(Collectors.toList()));
             return StringUtils.join(candidate, " ");
     }
@@ -208,14 +208,14 @@ public class Ngrams {
 for (Sentence s : corpus) {
     for (Word w : s.getWords()) {
         List<String> taxonomy = s.getTaxonomy();
-        String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
+        String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
         // skip this iteration if:
         // - word doesn't contain a proper version (missing lemma for example)
         // - msd regex is given but this word's msd doesn't match it, skip this iteration
         // - given substring length is larger than the word length
         if (ValidationUtil.isEmpty(word)
-                || stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
+                || stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
                 || word.length() < stats.getFilter().getStringLength()) {
             continue;
         }
@@ -331,7 +331,7 @@ public class Ngrams {
 private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
     // count if no regex is set or if it is & candidate passes it
-    if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
+    if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
         // String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
         // key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
         // stats.updateTaxonomyResults(new MultipleHMKeys1(key),
@@ -340,7 +340,7 @@ public class Ngrams {
 ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
-String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
+String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());
 // if last letter is ',' erase it
@@ -359,14 +359,14 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys1(key);
     break;
 case 1:
-    String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
+    String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations())
     //     k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
     multipleKeys = new MultipleHMKeys2(key, k1_2);
     break;
 case 2:
-    String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-    String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
+    String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
     //     k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -374,9 +374,9 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
     break;
 case 3:
-    String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-    String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-    String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
+    String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+    String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
     //     k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -385,10 +385,10 @@ public class Ngrams {
     multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
     break;
 case 4:
-    String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-    String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-    String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
-    String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
+    String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+    String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+    String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+    String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
     // if (stats.getFilter().getNotePunctuations()) {
     //     k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
     //     k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
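Note that passesRegex appends ".*" to every user-supplied pattern, so an MSD filter acts as a prefix match on the tag rather than a full match. A two-line sketch (both the pattern and the tag are invented):

    Pattern p = Pattern.compile("So");
    boolean ok = "Sozem".matches(p.pattern() + ".*"); // true: the pattern only needs to match a prefix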

@@ -10,84 +10,84 @@ import data.Sentence;
 import data.Statistics;
 import data.Word;

+//class WordCount {
+//    private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                Common.updateMap(stats.result, word);
+//            }
+//        }
+//    }
+//
+//    private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getCVVLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getCVVWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                if (word.length() > stats.getSubstringLength()) {
+//                    for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
+//                        String substring = word.substring(i, i + stats.getSubstringLength());
+//                        Common.updateMap(stats.result, substring);
+//                    }
+//                }
+//            }
+//        }
+//    }
+//
+//    private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//            List<Word> filteredWords = new ArrayList<>();
+//
+//            for (Word word : s.getWords()) {
+//                if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
+//                    filteredWords.add(word);
+//                }
+//            }
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(filteredWords
+//                        .stream()
+//                        .map(Word::getLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(filteredWords
+//                        .stream()
+//                        .map(Word::getWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                Common.updateMap(stats.result, word);
+//            }
+//        }
+//    }

     // private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
     //     for (Sentence s : corpus) {
@@ -164,4 +164,4 @@ class WordCount {
     //         }
     //     }
     // }
-}
+//}
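The disabled calculateVCC above counted fixed-length substrings of consonant/vowel (CVV) skeletons. A self-contained sketch of its inner loop; the skeleton string and length are invented:

    String word = "CVCCV"; // CVV skeleton of some word
    int len = 3;           // stats.getSubstringLength()
    for (int i = 0; i <= word.length() - len; i++) {
        System.out.println(word.substring(i, i + len)); // prints CVC, VCC, CCV
    }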

@@ -34,8 +34,8 @@ public class WordLevel {
 public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
     for (Sentence s : corpus) {
         for (Word word : s.getWords()) {
-            calculateForSuffixes(word.getWord(), stats);
-            calculateForPrefixes(word.getWord(), stats);
+            calculateForSuffixes(word.getWord(stats.getFilter().getWordParts()), stats);
+            calculateForPrefixes(word.getWord(stats.getFilter().getWordParts()), stats);
         }
     }
 }

@@ -8,6 +8,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
+import javafx.collections.FXCollections;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -15,6 +16,7 @@ import org.apache.logging.log4j.Logger;
 import data.Enums.solar.SolarFilters;
 import gui.ValidationUtil;
 import javafx.collections.ObservableList;
+import org.controlsfx.control.CheckComboBox;

 public class Corpus {
     public final static Logger logger = LogManager.getLogger(Corpus.class);
@@ -82,6 +84,11 @@ public class Corpus {
 public ObservableList<String> getTaxonomy() {
     return taxonomy;
 }
+//
+//    public ObservableList<String> getFormattedTaxonomy() {
+//        ArrayList<String> al = Tax.getTaxonomyFormatted(new ArrayList<>(taxonomy), corpusType);
+//        return FXCollections.observableArrayList(al);
+//    }

 public void setTaxonomy(ObservableList<String> taxonomy) {
     this.taxonomy = taxonomy;

@@ -2,10 +2,7 @@ package data;
 import static data.Filter.filterName.*;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
+import java.util.*;
 import java.util.regex.Pattern;
 import gui.ValidationUtil;
@@ -17,6 +14,7 @@ public class Filter {
 public enum filterName {
     ANALYSIS_LEVEL,
     CALCULATE_FOR,
+    WORD_PARTS,
     NGRAM_VALUE,
     SKIP_VALUE,
     IS_CVV,
@@ -36,6 +34,7 @@ public class Filter {
 public Filter() {
     filter = new HashMap<>();
     filter.put(WRITE_MSD_AT_THE_END, false);
+    filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
 }

 public Filter(AnalysisLevel al, CalculateFor cf) {
@@ -43,6 +42,10 @@ public class Filter {
     filter.put(ANALYSIS_LEVEL, al);
     filter.put(CALCULATE_FOR, cf);
+    filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
+    addWordPart(cf);
+
     filter.put(WRITE_MSD_AT_THE_END, false);
 }
@@ -56,6 +59,8 @@ public class Filter {
 public void setCalculateFor(CalculateFor cf) {
     filter.put(CALCULATE_FOR, cf);
+    filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
+    addWordPart(cf);
 }

 public CalculateFor getCalculateFor() {
@@ -137,6 +142,8 @@ public class Filter {
 public void setHasMsd(boolean hasMsd) {
     filter.put(HAS_MSD, hasMsd);
+    if (hasMsd)
+        addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
 }

 public boolean hasMsd() {
@@ -170,7 +177,9 @@ public class Filter {
     ArrayList<CalculateFor> newKeys = new ArrayList<>();
     if (keys != null) {
         for (String key : keys) {
-            newKeys.add(CalculateFor.factory(key));
+            CalculateFor cf = CalculateFor.factory(key);
+            newKeys.add(cf);
+            addWordPart(cf);
         }
     }
@@ -185,6 +194,14 @@ public class Filter {
     }
 }

+public ArrayList<CalculateFor> getWordParts() {
+    if (filter.containsKey(WORD_PARTS) && filter.get(WORD_PARTS) != null) {
+        return (ArrayList<CalculateFor>) filter.get(WORD_PARTS);
+    } else {
+        return new ArrayList<>();
+    }
+}

 public void setNotePunctuations(boolean notePunctuations) {
     filter.put(NOTE_PUNCTUATIONS, notePunctuations);
 }
@@ -209,4 +226,32 @@ public class Filter {
 public Integer getMinimalTaxonomy() {
     return (Integer) filter.get(MINIMAL_TAXONOMY);
 }

+private void addWordPart(CalculateFor wp){
+    ArrayList<CalculateFor> oldWp = ((ArrayList<CalculateFor>) filter.get(WORD_PARTS));
+    switch (wp) {
+        case WORD:
+        case DIST_WORDS:
+            if (!oldWp.contains(CalculateFor.WORD))
+                oldWp.add(CalculateFor.WORD);
+            break;
+        case LEMMA:
+        case DIST_LEMMAS:
+            if (!oldWp.contains(CalculateFor.LEMMA))
+                oldWp.add(CalculateFor.LEMMA);
+            break;
+        case MORPHOSYNTACTIC_PROPERTY:
+        case MORPHOSYNTACTIC_SPECS:
+        case WORD_TYPE:
+            if (!oldWp.contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
+                oldWp.add(CalculateFor.MORPHOSYNTACTIC_SPECS);
+            break;
+        case NORMALIZED_WORD:
+            if (!oldWp.contains(CalculateFor.NORMALIZED_WORD))
+                oldWp.add(CalculateFor.NORMALIZED_WORD);
+            break;
+    }
+}
 }
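addWordPart maps every CalculateFor value onto one of the four stored parts (word, lemma, MSD, normalized word) and deduplicates, so derived keys such as WORD_TYPE reuse the MSD slot. A short sketch using only methods from this diff:

    Filter f = new Filter();
    f.setCalculateFor(CalculateFor.WORD_TYPE); // word type is read off the MSD, so
                                               // MORPHOSYNTACTIC_SPECS is recorded
    f.setHasMsd(true);                         // already present, nothing is added
    f.getWordParts();                          // [MORPHOSYNTACTIC_SPECS]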

@@ -16,67 +16,67 @@ public class Tax {
 // GIGAFIDA ----------------------------
 GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
+GIGAFIDA_TAXONOMY.put("SSJ.T", "SSJ.T - tisk");
+GIGAFIDA_TAXONOMY.put("SSJ.T.K", "SSJ.T.K - tisk-knjižno");
+GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "SSJ.T.K.L - tisk-knjižno-leposlovno");
+GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "SSJ.T.K.S - tisk-knjižno-strokovno");
+GIGAFIDA_TAXONOMY.put("SSJ.T.P", "SSJ.T.P - tisk-periodično");
+GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "SSJ.T.P.C - tisk-periodično-časopis");
+GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "SSJ.T.P.R - tisk-periodično-revija");
+GIGAFIDA_TAXONOMY.put("SSJ.T.D", "SSJ.T.D - tisk-drugo");
+GIGAFIDA_TAXONOMY.put("SSJ.I", "SSJ.I - internet");
+GIGAFIDA_TAXONOMY.put("Ft.P", "Ft.P - prenosnik");
+GIGAFIDA_TAXONOMY.put("Ft.P.G", "Ft.P.G - prenosnik-govorni");
+GIGAFIDA_TAXONOMY.put("Ft.P.E", "Ft.P.E - prenosnik-elektronski");
+GIGAFIDA_TAXONOMY.put("Ft.P.P", "Ft.P.P - prenosnik-pisni");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "Ft.P.P.O - prenosnik-pisni-objavljeno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "Ft.P.P.N - prenosnik-pisni-neobjavljeno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno");
+GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno");
+GIGAFIDA_TAXONOMY.put("Ft.Z", "Ft.Z - zvrst");
+GIGAFIDA_TAXONOMY.put("Ft.Z.U", "Ft.Z.U - zvrst-umetnostna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "Ft.Z.U.P - zvrst-umetnostna-pesniška");
+GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "Ft.Z.U.R - zvrst-umetnostna-prozna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "Ft.Z.U.D - zvrst-umetnostna-dramska");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N", "Ft.Z.N - zvrst-neumetnostna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "Ft.Z.N.S - zvrst-neumetnostna-strokovna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "Ft.Z.N.N - zvrst-neumetnostna-nestrokovna");
+GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "Ft.Z.N.P - zvrst-neumetnostna-pravna");
+GIGAFIDA_TAXONOMY.put("Ft.L", "Ft.L - zvrst-lektorirano");
+GIGAFIDA_TAXONOMY.put("Ft.L.D", "Ft.L.D - zvrst-lektorirano-da");
+GIGAFIDA_TAXONOMY.put("Ft.L.N", "Ft.L.N - zvrst-lektorirano-ne");

 // GOS ----------------------------------
 GOS_TAXONOMY = new LinkedHashMap<>();
+GOS_TAXONOMY.put("gos.T", "gos.T - diskurz");
+GOS_TAXONOMY.put("gos.T.J", "gos.T.J - diskurz-javni");
+GOS_TAXONOMY.put("gos.T.J.I", "gos.T.J.I - diskurz-javni-informativno-izobraževalni");
+GOS_TAXONOMY.put("gos.T.J.R", "gos.T.J.R - diskurz-javni-razvedrilni");
+GOS_TAXONOMY.put("gos.T.N", "gos.T.N - diskurz-nejavni");
+GOS_TAXONOMY.put("gos.T.N.N", "gos.T.N.N - diskurz-nejavni-nezasebni");
+GOS_TAXONOMY.put("gos.T.N.Z", "gos.T.N.Z - diskurz-nejavni-zasebni");
+GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
+GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
+GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");
 }
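The renamed entries prepend each taxonomy code to its human-readable name, so a single lookup now yields both at once, e.g.:

    GIGAFIDA_TAXONOMY.get("SSJ.T"); // was "tisk", now "SSJ.T - tisk"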
 /**
@@ -147,6 +147,33 @@ public class Tax {
     return result;
 }

+//    public static ArrayList<String> getTaxonomyFormatted(ArrayList<String> taxonomyNames, CorpusType corpusType) {
+//        ArrayList<String> result = new ArrayList<>();
+//
+//        if (ValidationUtil.isEmpty(taxonomyNames)) {
+//            return result;
+//        }
+//
+//        LinkedHashMap<String, String> tax = new LinkedHashMap<>();
+//
+//        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
+//            tax = GIGAFIDA_TAXONOMY;
+//        } else if (corpusType == CorpusType.GOS) {
+//            tax = GOS_TAXONOMY;
+//        }
+//
+//        // for easier lookup
+//        Map<String, String> taxInversed = tax.entrySet()
+//                .stream()
+//                .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
+//
+//        for (String taxonomyName : taxonomyNames) {
+//            result.add(taxInversed.get(taxonomyName) + " - " + taxonomyName);
+//        }
+//
+//        return result;
+//    }

 /**
  * Returns a list of proper names for codes
  *

@@ -1,110 +1,94 @@
 package data;

 import java.io.Serializable;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;
-
-import org.apache.commons.lang3.StringUtils;
-
-import data.Enums.Msd;
-import gui.ValidationUtil;
-
-public class Word implements Serializable {
-    public static final char PAD_CHARACTER = '-';
-
-    private String word;
-    private String lemma;
-    private String msd;
-    private String normalizedWord;
-    private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
-
-    /**
-     * Possible values:
-     * <p>
-     * <ul>
-     * <li>S = samostalnik</li>
-     * <li>G = glagol</li>
-     * <li>P = pridevnik</li>
-     * <li>R = prislov</li>
-     * <li>Z = zaimek</li>
-     * <li>K = števnik</li>
-     * <li>D = predlog</li>
-     * <li>V = veznik</li>
-     * <li>L = členek</li>
-     * <li>M = medmet</li>
-     * <li>O = okrajšava</li>
-     * <li>N = neuvrščeno</li>
-     * </ul>
-     */
-    //private char besedna_vrsta;
-    public Word(String word, String lemma, String msd) {
-        this.lemma = lemma;
-        this.msd = msd; //normalizeMsd(msd);
-        this.normalizedWord = "";
-
-        // veliko zacetnico ohranimo samo za lastna imena
-        if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
-                && this.msd.length() >= 2
-                && this.msd.charAt(1) == 'l')) {
-            this.word = word.toLowerCase();
-        } else {
-            this.word = word;
-        }
-    }
-
-    public Word(String word, String lemma, String msd, String normalizedWord) {
-        this.lemma = lemma;
-        // this.msd = normalizeMsd(msd);
-        this.msd = msd;
-        this.normalizedWord = normalizedWord;
-
-        // veliko zacetnico ohranimo samo za lastna imena
-        if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
-                && this.msd.length() >= 2
-                && this.msd.charAt(1) == 'l')) {
-            this.word = word.toLowerCase();
-        } else {
-            this.word = word;
-        }
-    }
-
-    public Word() {
-    }
-
-    // /**
-    //  * Appends a number of '-' to msds which are not properly sized.
-    //  * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
-    //  *
-    //  * @param msdInput
-    //  *
-    //  * @return
-    //  */
-    // private String normalizeMsd(String msdInput) {
-    //     if (ValidationUtil.isEmpty(msdInput)) {
-    //         return "";
-    //     } else {
-    //         return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
-    //     }
-    // }
-
-    public Word(String word) {
-        this.word = word;
-    }
-
-    public String getWord() {
-        return word;
-    }
-
-    public String getCVVWord() {
-        return covertToCvv(word);
-    }
-
-    public String getCVVLemma() {
-        return covertToCvv(lemma);
-    }
-
-    private String covertToCvv(String s) {
+import java.util.Objects;
+
+/*
+Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously.
+*/
+public interface Word {
+    String getW1();
+    default String getW2(){ return null; }
+    default String getW3(){ return null; }
+    default String getW4(){ return null; }
+
+    default String get(ArrayList<CalculateFor> wordParts, CalculateFor cf){
+        if (wordParts.size() > 0 && wordParts.get(0).equals(cf))
+            return getW1();
+        if (wordParts.size() > 1 && wordParts.get(1).equals(cf))
+            return getW2();
+        if (wordParts.size() > 2 && wordParts.get(2).equals(cf))
+            return getW3();
+        if (wordParts.size() > 3 && wordParts.get(3).equals(cf))
+            return getW4();
+        return null;
+    }
+
+    default String getWord(ArrayList<CalculateFor> wordParts){
+        return get(wordParts, CalculateFor.WORD);
+    }
+
+    default String getLemma(ArrayList<CalculateFor> wordParts){
+        return get(wordParts, CalculateFor.LEMMA);
+    }
+
+    default String getMsd(ArrayList<CalculateFor> wordParts){
+        return get(wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS);
+    }
+
+    default String getNormalizedWord(ArrayList<CalculateFor> wordParts){
+        return get(wordParts, CalculateFor.NORMALIZED_WORD);
+    }
+
+    void setW1(String w);
+    default void setW2(String w){}
+    default void setW3(String w){}
+    default void setW4(String w){}
+
+    default void set(String w, ArrayList<CalculateFor> wordParts, CalculateFor cf){
+        switch(wordParts.indexOf(cf)){
+            case 0:
+                setW1(w);
+                break;
+            case 1:
+                setW2(w);
+                break;
+            case 2:
+                setW3(w);
+                break;
+            case 3:
+                setW4(w);
+                break;
+        }
+    }
+
+    default void setLemma(String w, ArrayList<CalculateFor> wordParts){
+        set(w, wordParts, CalculateFor.LEMMA);
+    }
+
+    default void setMsd(String w, ArrayList<CalculateFor> wordParts){
+        set(w, wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS);
+    }
+
+    default void setNormalizedWord(String w, ArrayList<CalculateFor> wordParts){
+        set(w, wordParts, CalculateFor.NORMALIZED_WORD);
+    }
+
+    default String getCVVWord(ArrayList<CalculateFor> cf) {
+        return covertToCvv(getWord(cf));
+    }
+
+    default String getCVVLemma(ArrayList<CalculateFor> cf) {
+        return covertToCvv(getLemma(cf));
+    }
+
+    default String covertToCvv(String s) {
+        final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
     char[] StringCA = s.toCharArray();

     for (int i = 0; i < StringCA.length; i++) {
@@ -114,59 +98,13 @@ public class Word implements Serializable {
     return new String(StringCA);
 }

-    public void setWord(String word) {
-        this.word = word;
-    }
-
-    public String getLemma() {
-        return lemma;
-    }
-
-    public void setLemma(String lemma) {
-        this.lemma = lemma;
-    }
-
-    public String getMsd() {
-        return msd;
-    }
-
-    public void setMsd(String msd) {
-        this.msd = msd;
-    }
-
-    public String getNormalizedWord() {
-        return normalizedWord;
-    }
-
-    public void setNormalizedWord(String normalizedWord) {
-        this.normalizedWord = normalizedWord;
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("beseda:\t")
-                .append(getWord())
-                .append("\n")
-                .append("lema:\t")
-                .append(getLemma())
-                .append("\n")
-                .append("msd:\t")
-                .append(getMsd())
-                .append("normalized word:\t")
-                .append(getNormalizedWord())
-                .append("\n");
-
-        return sb.toString();
-    }
-
-    public String getForCf(CalculateFor calculateFor, boolean cvv) {
+    default String getForCf(CalculateFor calculateFor, boolean cvv, ArrayList<CalculateFor> cf) {
         String returnValue = "";

         if (cvv) {
-            returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
+            returnValue = calculateFor == CalculateFor.WORD ? getCVVWord(cf) : getCVVLemma(cf);
         } else {
-            returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma();
+            returnValue = calculateFor == CalculateFor.WORD ? getWord(cf) : getLemma(cf);
} }
return returnValue; return returnValue;

@ -0,0 +1,17 @@
package data;
import java.io.Serializable;
public class Word1 implements Serializable, Word {
private String w1;
public Word1(String w1) {
this.w1 = w1;
}
public String getW1() {
return w1;
}
public void setW1(String w){w1 = w;}
}

@ -0,0 +1,22 @@
package data;
import java.io.Serializable;
public class Word2 implements Serializable, Word {
private String w1, w2;
public Word2(String w1, String w2) {
this.w1 = w1;
this.w2 = w2;
}
public String getW1() {
return w1;
}
public String getW2() {
return w2;
}
public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
}

@ -0,0 +1,27 @@
package data;
import java.io.Serializable;
public class Word3 implements Serializable, Word {
private String w1, w2, w3;
public Word3(String w1, String w2, String w3) {
this.w1 = w1;
this.w2 = w2;
this.w3 = w3;
}
public String getW1() {
return w1;
}
public String getW2() {
return w2;
}
public String getW3() {
return w3;
}
public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
public void setW3(String w){w3 = w;}
}

@ -0,0 +1,32 @@
package data;
import java.io.Serializable;
public class Word4 implements Serializable, Word {
private String w1, w2, w3, w4;
public Word4(String w1, String w2, String w3, String w4) {
this.w1 = w1;
this.w2 = w2;
this.w3 = w3;
this.w4 = w4;
}
public String getW1() {
return w1;
}
public String getW2() {
return w2;
}
public String getW3() {
return w3;
}
public String getW4() {
return w4;
}
public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
public void setW3(String w){w3 = w;}
public void setW4(String w){w4 = w;}
}
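Taken together, Word1 through Word4 back the new interface's slot scheme: only the word parts selected in the filter are stored, so a run that tracks two parts allocates two Strings per token instead of four. A minimal usage sketch, assuming the repo's data classes (Word, Word2, CalculateFor) are importable from the data package; the sample values are illustrative:

import java.util.ArrayList;

import data.*; // assumption: Word, Word2 and CalculateFor live in the data package

public class WordPartsDemo {
    public static void main(String[] args) {
        // the filter's word parts define which slot holds which part
        ArrayList<CalculateFor> wordParts = new ArrayList<>();
        wordParts.add(CalculateFor.WORD);   // slot w1
        wordParts.add(CalculateFor.LEMMA);  // slot w2

        // two tracked parts -> Word2 is enough
        Word w = new Word2("junaka", "junak");

        System.out.println(w.getWord(wordParts));  // junaka
        System.out.println(w.getLemma(wordParts)); // junak
        System.out.println(w.getMsd(wordParts));   // null (msd is not tracked)
    }
}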

@ -10,6 +10,7 @@ import java.util.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import javafx.application.HostServices; import javafx.application.HostServices;
import javafx.collections.transformation.SortedList;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
@ -380,87 +381,87 @@ public class StringAnalysisTabNew2 {
* iscvv: false * iscvv: false
* string length: 1 * string length: 1
*/ */
public void populateFields() { // public void populateFields() {
// corpus changed if: current one is null (this is first run of the app) // // corpus changed if: current one is null (this is first run of the app)
// or if currentCorpus != gui's corpus // // or if currentCorpus != gui's corpus
boolean corpusChanged = currentCorpusType == null // boolean corpusChanged = currentCorpusType == null
|| currentCorpusType != corpus.getCorpusType(); // || currentCorpusType != corpus.getCorpusType();
//
// keep ngram value if set // // keep ngram value if set
if (ngramValue == null) { // if (ngramValue == null) {
ngramValueCB.getSelectionModel().select("1"); // ngramValueCB.getSelectionModel().select("1");
ngramValue = 1; // ngramValue = 1;
} // }
//
// TODO: check for GOS, GIGAFIDA, SOLAR... // // TODO: check for GOS, GIGAFIDA, SOLAR...
// refresh and: // // refresh and:
// TODO if current value != null && is in new calculateFor ? keep : otherwise reset // // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
if (calculateFor == null) { // if (calculateFor == null) {
calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0)); // calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0)); // calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
} // }
//
if (!filter.hasMsd()) { // if (!filter.hasMsd()) {
// if current corpus doesn't have msd data, disable this field // // if current corpus doesn't have msd data, disable this field
msd = new ArrayList<>(); // msd = new ArrayList<>();
msdTF.setText(""); // msdTF.setText("");
msdTF.setDisable(true); // msdTF.setDisable(true);
logger.info("no msd data"); // logger.info("no msd data");
} else { // } else {
if (ValidationUtil.isEmpty(msd) // if (ValidationUtil.isEmpty(msd)
|| (!ValidationUtil.isEmpty(msd) && corpusChanged)) { // || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
// msd has not been set previously // // msd has not been set previously
// or msd has been set but the corpus changed -> reset // // or msd has been set but the corpus changed -> reset
msd = new ArrayList<>(); // msd = new ArrayList<>();
msdTF.setText(""); // msdTF.setText("");
msdTF.setDisable(false); // msdTF.setDisable(false);
logger.info("msd reset"); // logger.info("msd reset");
} else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) { // } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
// if msd has been set, but corpus type remained the same, we can keep any set msd value // // if msd has been set, but corpus type remained the same, we can keep any set msd value
msdTF.setText(StringUtils.join(msdStrings, " ")); // msdTF.setText(StringUtils.join(msdStrings, " "));
msdTF.setDisable(false); // msdTF.setDisable(false);
logger.info("msd kept"); // logger.info("msd kept");
} // }
} // }
//
// TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection) // // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)
//
// keep skip value // // keep skip value
if (skipValue == null) { // if (skipValue == null) {
skipValueCB.getSelectionModel().select("0"); // skipValueCB.getSelectionModel().select("0");
skipValue = 0; // skipValue = 0;
} // }
//
// keep calculateCvv // // keep calculateCvv
calculatecvvCB.setSelected(calculateCvv); // calculatecvvCB.setSelected(calculateCvv);
//
// keep string length if set // // keep string length if set
if (stringLength != null) { // if (stringLength != null) {
stringLengthTF.setText(String.valueOf(stringLength)); // stringLengthTF.setText(String.valueOf(stringLength));
} else { // } else {
stringLengthTF.setText("1"); // stringLengthTF.setText("1");
stringLength = 1; // stringLength = 1;
} // }
//
// TODO: trigger on rescan // // TODO: trigger on rescan
if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { // if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
// user changed corpus (by type) or by selection & triggered a rescan of headers // // user changed corpus (by type) or by selection & triggered a rescan of headers
// see if we read taxonomy from headers, otherwise use default values for given corpus // // see if we read taxonomy from headers, otherwise use default values for given corpus
ObservableList<String> tax = corpus.getTaxonomy(); // ObservableList<String> tax = corpus.getTaxonomy();
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
//
currentCorpusType = corpus.getCorpusType(); // currentCorpusType = corpus.getCorpusType();
// setTaxonomyIsDirty(false); // // setTaxonomyIsDirty(false);
} else { // } else {
//
} // }
//
// see if we read taxonomy from headers, otherwise use default values for given corpus // // see if we read taxonomy from headers, otherwise use default values for given corpus
ObservableList<String> tax = corpus.getTaxonomy(); // ObservableList<String> tax = corpus.getTaxonomy();
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
taxonomyCCB.getItems().addAll(taxonomyCCBValues); // taxonomyCCB.getItems().addAll(taxonomyCCBValues);
//
} // }
/** /**
* Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc., * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,

@ -125,9 +125,11 @@ public class Export {
// for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) { // for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
for (CalculateFor otherKey : filter.getMultipleKeys()) { for (CalculateFor otherKey : filter.getMultipleKeys()) {
FILE_HEADER_AL.add(otherKey.toHeaderString()); if (num_taxonomy_frequencies.get(otherKey) > 0) {
if (otherKey.equals(CalculateFor.LEMMA)) FILE_HEADER_AL.add(otherKey.toHeaderString());
FILE_HEADER_AL.add("Lema male črke"); if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
}
} }
// if(otherKey.equals(CalculateFor.LEMMA)){ // if(otherKey.equals(CalculateFor.LEMMA)){
@ -164,7 +166,7 @@ public class Export {
// } // }
FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)"); FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");
for (String key : taxonomyResults.keySet()) { for (String key : taxonomyResults.keySet()) {
if(!key.equals("Total")) { if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
FILE_HEADER_AL.add("Delež [" + key + "]"); FILE_HEADER_AL.add("Delež [" + key + "]");
FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
@ -257,7 +259,7 @@ public class Export {
dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies)); dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies));
for (String key : taxonomyResults.keySet()){ for (String key : taxonomyResults.keySet()){
if(!key.equals("Total")) { if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
dataEntry.add(frequency.toString()); dataEntry.add(frequency.toString());
dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key)));
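The guard added in both Export hunks is the same idea: a taxonomy branch with zero hits contributes neither header nor data columns, so the two loops stay aligned. A minimal standalone sketch of that filtering (map contents are illustrative):

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;

public class ExportColumnsDemo {
    public static void main(String[] args) {
        Map<String, Long> num_taxonomy_frequencies = new LinkedHashMap<>();
        num_taxonomy_frequencies.put("Total", 10L);
        num_taxonomy_frequencies.put("gos.S.T - situacija-televizija", 7L);
        num_taxonomy_frequencies.put("gos.S.R - situacija-radio", 0L); // empty branch

        ArrayList<String> FILE_HEADER_AL = new ArrayList<>();
        for (String key : num_taxonomy_frequencies.keySet()) {
            if (!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
                FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
                FILE_HEADER_AL.add("Delež [" + key + "]");
                FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
            }
        }
        // only the televizija columns are emitted; the empty radio branch is skipped
        System.out.println(FILE_HEADER_AL);
    }
}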

@ -13,6 +13,7 @@
<?import javafx.scene.layout.Pane?> <?import javafx.scene.layout.Pane?>
<?import org.controlsfx.control.CheckComboBox?> <?import org.controlsfx.control.CheckComboBox?>
<?import javafx.scene.control.Separator?>
<AnchorPane fx:id="stringAnalysisTabPaneNew2" prefHeight="600.0" prefWidth="800.0" xmlns="http://javafx.com/javafx/8.0.121" xmlns:fx="http://javafx.com/fxml/1" fx:controller="gui.StringAnalysisTabNew2"> <AnchorPane fx:id="stringAnalysisTabPaneNew2" prefHeight="600.0" prefWidth="800.0" xmlns="http://javafx.com/javafx/8.0.121" xmlns:fx="http://javafx.com/fxml/1" fx:controller="gui.StringAnalysisTabNew2">
<Pane> <Pane>
@ -80,7 +81,16 @@
<Label layoutX="10.0" layoutY="320.0" prefHeight="25.0" text="Oznaka MSD" /> <Label layoutX="10.0" layoutY="320.0" prefHeight="25.0" text="Oznaka MSD" />
<TextField fx:id="msdTF" layoutX="185.0" layoutY="320.0" prefWidth="180.0" /> <TextField fx:id="msdTF" layoutX="185.0" layoutY="320.0" prefWidth="180.0" />
<Label layoutX="10.0" layoutY="360.0" prefHeight="25.0" text="Taksonomija" /> <Label layoutX="10.0" layoutY="360.0" prefHeight="25.0" text="Taksonomija" />
<CheckComboBox fx:id="taxonomyCCB" layoutX="185.0" layoutY="360.0" prefHeight="25.0" prefWidth="180.0" /> <CheckComboBox fx:id="taxonomyCCB" layoutX="185.0" layoutY="360.0" prefHeight="25.0" prefWidth="180.0" >
<items>
<FXCollections fx:factory="observableArrayList">
<String fx:value="2" />
<String fx:value="3" />
<String fx:value="4" />
<String fx:value="5" />
</FXCollections>
</items>
</CheckComboBox>
<Label layoutX="10.0" layoutY="400.0" prefHeight="25.0" text="Min. št. pojavitev" /> <Label layoutX="10.0" layoutY="400.0" prefHeight="25.0" text="Min. št. pojavitev" />
<TextField fx:id="minimalOccurrencesTF" layoutX="185.0" layoutY="400.0" prefWidth="180.0" /> <TextField fx:id="minimalOccurrencesTF" layoutX="185.0" layoutY="400.0" prefWidth="180.0" />

@ -1,87 +1,87 @@
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
//
import data.Sentence; //import data.Sentence;
import data.Word; //import data.Word;
//
public class Common { //public class Common {
//
public static List<Sentence> corpus; // public static List<Sentence> corpus;
public static List<Sentence> minCorpus; // public static List<Sentence> minCorpus;
public static List<Sentence> midCorpus; // public static List<Sentence> midCorpus;
public static List<Sentence> midCorpusSkip; // public static List<Sentence> midCorpusSkip;
public static List<Sentence> josTest; // public static List<Sentence> josTest;
//
static { // static {
Sentence testSentence; // Sentence testSentence;
//
// full sentence // // full sentence
ArrayList<String> taxonomy = new ArrayList<>(); // ArrayList<String> taxonomy = new ArrayList<>();
taxonomy.add("#Ft.Z.N.N"); // taxonomy.add("#Ft.Z.N.N");
List<Word> words = new ArrayList<>(); // List<Word> words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd")); // words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n")); // words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei")); // words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm")); // words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem")); // words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("nekaj", "nekaj", "Rsn")); // words.add(new Word("nekaj", "nekaj", "Rsn"));
words.add(new Word("o", "o", "Dm")); // words.add(new Word("o", "o", "Dm"));
words.add(new Word("čemer", "kar", "Zz-sem")); // words.add(new Word("čemer", "kar", "Zz-sem"));
words.add(new Word("se", "se", "Zp------k")); // words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("mu", "on", "Zotmed--k")); // words.add(new Word("mu", "on", "Zotmed--k"));
words.add(new Word("ne", "ne", "L")); // words.add(new Word("ne", "ne", "L"));
words.add(new Word("sanja", "sanjati", "Ggnste")); // words.add(new Word("sanja", "sanjati", "Ggnste"));
words.add(new Word("a", "a", "Vp")); // words.add(new Word("a", "a", "Vp"));
words.add(new Word("se", "se", "Zp------k")); // words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("onemu", "oni", "Zk-sed")); // words.add(new Word("onemu", "oni", "Zk-sed"));
words.add(new Word("zdi", "zdeti", "Ggnste")); // words.add(new Word("zdi", "zdeti", "Ggnste"));
words.add(new Word("ključno", "ključen", "Ppnsei")); // words.add(new Word("ključno", "ključen", "Ppnsei"));
words.add(new Word("pri", "pri", "Dm")); // words.add(new Word("pri", "pri", "Dm"));
words.add(new Word("operaciji", "operacija", "Sozem")); // words.add(new Word("operaciji", "operacija", "Sozem"));
words.add(new Word("666", "666", "Kag")); // words.add(new Word("666", "666", "Kag"));
//
testSentence = new Sentence(words, taxonomy); // testSentence = new Sentence(words, taxonomy);
corpus = new ArrayList<>(); // corpus = new ArrayList<>();
corpus.add(testSentence); // corpus.add(testSentence);
//
// three word sentence // // three word sentence
testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy); // testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy);
minCorpus = new ArrayList<>(); // minCorpus = new ArrayList<>();
minCorpus.add(testSentence); // minCorpus.add(testSentence);
//
// five word sentence // // five word sentence
words = new ArrayList<>(); // words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd")); // words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n")); // words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei")); // words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n")); // words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem")); // words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy); // testSentence = new Sentence(words, taxonomy);
//
midCorpus = new ArrayList<>(); // midCorpus = new ArrayList<>();
midCorpus.add(testSentence); // midCorpus.add(testSentence);
//
// five word sentence - for skipgrams // // five word sentence - for skipgrams
words = new ArrayList<>(); // words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd")); // words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n")); // words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei")); // words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm")); // words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem")); // words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy); // testSentence = new Sentence(words, taxonomy);
//
midCorpusSkip = new ArrayList<>(); // midCorpusSkip = new ArrayList<>();
midCorpusSkip.add(testSentence); // midCorpusSkip.add(testSentence);
//
// JOS test // // JOS test
words = new ArrayList<>(); // words = new ArrayList<>();
words.add(new Word("junak", "junak", "Somei")); // words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n")); // words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem")); // words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy); // testSentence = new Sentence(words, taxonomy);
//
josTest = new ArrayList<>(); // josTest = new ArrayList<>();
josTest.add(testSentence); // josTest.add(testSentence);
} // }
//
} //}

@ -1,362 +1,362 @@
import static org.junit.Assert.*; //import static org.junit.Assert.*;
//
import java.util.*; //import java.util.*;
import java.util.concurrent.atomic.AtomicLong; //import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern; //import java.util.regex.Pattern;
import java.util.stream.Collectors; //import java.util.stream.Collectors;
//
import javafx.collections.FXCollections; //import javafx.collections.FXCollections;
import org.junit.Test; //import org.junit.Test;
//
import alg.ngram.Ngrams; //import alg.ngram.Ngrams;
import data.*; //import data.*;
//
@SuppressWarnings({"Duplicates", "unused"}) //@SuppressWarnings({"Duplicates", "unused"})
public class NgramTests { //public class NgramTests {
//
@Test // @Test
public void letterNgramsTest() { // public void letterNgramsTest() {
Map<String, AtomicLong> result = null; // Map<String, AtomicLong> result = null;
//
Filter filter = new Filter(); // Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL); // filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setStringLength(4); // filter.setStringLength(4);
filter.setNgramValue(0); // letters // filter.setNgramValue(0); // letters
filter.setCalculateFor(CalculateFor.WORD); // filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax= new ArrayList<>(); // ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C"); // tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax); // filter.setTaxonomy(tax);
//
//
Corpus testCorpus = new Corpus(); // Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA); // testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>()); // testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>(); // ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C"); // taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// tests: // // tests:
// - no regex // // - no regex
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); // StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.minCorpus, stats); // Ngrams.calculateForAll(Common.minCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
// tests: // // tests:
// - algorithm skips words that are shorter than set length value // // - algorithm skips words that are shorter than set length value
assertEquals(2, result.size()); // assertEquals(2, result.size());
assertTrue(result.containsKey("juna")); // assertTrue(result.containsKey("juna"));
assertEquals(1, result.get("juna").longValue()); // assertEquals(1, result.get("juna").longValue());
assertTrue(result.containsKey("unak")); // assertTrue(result.containsKey("unak"));
assertEquals(1, result.get("unak").longValue()); // assertEquals(1, result.get("unak").longValue());
//
// tests: // // tests:
// - map update (count) works ok // // - map update (count) works ok
filter.setStringLength(3); // filter.setStringLength(3);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
assertEquals(2, result.get("ima").longValue()); // assertEquals(2, result.get("ima").longValue());
//
// tests: // // tests:
// - pre-check for the following regex test - this one should include word "ima", next one shouldn't // // - pre-check for the following regex test - this one should include word "ima", next one shouldn't
filter.setStringLength(3); // filter.setStringLength(3);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
assertTrue(result.containsKey("ima")); // assertTrue(result.containsKey("ima"));
//
// tests: // // tests:
// - regex: S.* // vsi samostalniki // // - regex: S.* // vsi samostalniki
ArrayList<Pattern> msdRegex = new ArrayList<>(); // ArrayList<Pattern> msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.*")); // msdRegex.add(Pattern.compile("S.*"));
filter.setMsd(msdRegex); // filter.setMsd(msdRegex);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
assertFalse(result.containsKey("ima")); // assertFalse(result.containsKey("ima"));
//
// tests: // // tests:
// - more precise regex // // - more precise regex
msdRegex = new ArrayList<>(); // msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak" // msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
filter.setMsd(msdRegex); // filter.setMsd(msdRegex);
filter.setStringLength(5); // filter.setStringLength(5);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
assertFalse(result.containsKey("junak")); // assertFalse(result.containsKey("junak"));
assertEquals(3, result.size()); // assertEquals(3, result.size());
//
// tests: // // tests:
// - trickier regex // // - trickier regex
msdRegex = new ArrayList<>(); // msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker" // msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
filter.setMsd(msdRegex); // filter.setMsd(msdRegex);
filter.setStringLength(3); // filter.setStringLength(3);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult(); // result = stats.getResult();
//
assertEquals(1, result.size()); // assertEquals(1, result.size());
assertTrue(result.containsKey("ker")); // assertTrue(result.containsKey("ker"));
assertEquals(1, result.get("ker").longValue()); // assertEquals(1, result.get("ker").longValue());
} // }
//
@Test // @Test
public void wordsNgramsTest() { // public void wordsNgramsTest() {
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult; // Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
//
Filter filter = new Filter(); // Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL); // filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setNgramValue(3); // filter.setNgramValue(3);
ArrayList<String> tax= new ArrayList<>(); // ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C"); // tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax); // filter.setTaxonomy(tax);
ArrayList<String> mKeys = new ArrayList<>(); // ArrayList<String> mKeys = new ArrayList<>();
//mKeys.add("lema"); // //mKeys.add("lema");
filter.setMultipleKeys(mKeys); // filter.setMultipleKeys(mKeys);
//
Corpus testCorpus = new Corpus(); // Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA); // testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>()); // testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>(); // ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C"); // taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// tests: // // tests:
// - normal ngrams - word // // - normal ngrams - word
// midCorpus contains 5 words which should make for 3 3-grams // // midCorpus contains 5 words which should make for 3 3-grams
filter.setCalculateFor(CalculateFor.WORD); // filter.setCalculateFor(CalculateFor.WORD);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); // StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
assertEquals(3, taxonomyResult.get("Total").size()); // assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
//
// tests: // // tests:
// - normal ngrams - lemmas // // - normal ngrams - lemmas
filter.setCalculateFor(CalculateFor.LEMMA); // filter.setCalculateFor(CalculateFor.LEMMA);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
assertEquals(3, taxonomyResult.get("Total").size()); // assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));
//
// tests: // // tests:
// - normal ngrams - msd // // - normal ngrams - msd
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY); // filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
assertEquals(3, taxonomyResult.get("Total").size()); // assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));
//
// tests: // // tests:
// - ngrams - word - regex filter // // - ngrams - word - regex filter
filter.setCalculateFor(CalculateFor.WORD); // filter.setCalculateFor(CalculateFor.WORD);
ArrayList<Pattern> msdRegex = new ArrayList<>(); // ArrayList<Pattern> msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.*")); // msdRegex.add(Pattern.compile("S.*"));
msdRegex.add(Pattern.compile("G.*")); // msdRegex.add(Pattern.compile("G.*"));
msdRegex.add(Pattern.compile(".*")); // msdRegex.add(Pattern.compile(".*"));
filter.setMsd(msdRegex); // filter.setMsd(msdRegex);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
assertEquals(1, taxonomyResult.get("Total").size()); // assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
//
// tests: // // tests:
// - ngrams - word - regex filter // // - ngrams - word - regex filter
filter.setCalculateFor(CalculateFor.WORD); // filter.setCalculateFor(CalculateFor.WORD);
filter.setNgramValue(2); // filter.setNgramValue(2);
msdRegex = new ArrayList<>(); // msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("G.*")); // msdRegex.add(Pattern.compile("G.*"));
msdRegex.add(Pattern.compile("Some.*")); // msdRegex.add(Pattern.compile("Some.*"));
filter.setMsd(msdRegex); // filter.setMsd(msdRegex);
//
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats); // Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
assertEquals(1, taxonomyResult.get("Total").size()); // assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak"))); // assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
} // }
//
//
// @Test // // @Test
// public void ngramsTest() { // // public void ngramsTest() {
// // minimal compliance test // // // minimal compliance test
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS); // // Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
// // //
// Map<String, AtomicLong> results = recalculate(minCorpus, stats); // // Map<String, AtomicLong> results = recalculate(minCorpus, stats);
// // //
// // 1-gram minCorpusa should equal minCorpus' size // // // 1-gram minCorpusa should equal minCorpus' size
// assertEquals(minCorpus.get(0).getWords().size(), results.size()); // // assertEquals(minCorpus.get(0).getWords().size(), results.size());
// // //
// // each resulting word should have a frequency of 1 // // // each resulting word should have a frequency of 1
// List<Word> words = minCorpus.get(0).getWords(); // // List<Word> words = minCorpus.get(0).getWords();
// for (int i = 0; i < results.size(); i++) { // // for (int i = 0; i < results.size(); i++) {
// Word w = words.get(i); // // Word w = words.get(i);
// AtomicLong frequency = results.get(w.getMsd()); // // AtomicLong frequency = results.get(w.getMsd());
// assertEquals(1, frequency.intValue()); // // assertEquals(1, frequency.intValue());
// } // // }
// // //
// // repeat for 2grams // // // repeat for 2grams
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
// results = recalculate(minCorpus, stats); // // results = recalculate(minCorpus, stats);
// // //
// // 2-gram of a 3 item corpus should equal 2 (first two words and second two words) // // // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
// assertEquals(2, results.size()); // // assertEquals(2, results.size());
// // //
// // add a filter // // // add a filter
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// // //
// List<String> morphosyntacticFilter = new ArrayList<>(); // // List<String> morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Sozem"); // // morphosyntacticFilter.add("Sozem");
// stats.setMorphosyntacticFilter(morphosyntacticFilter); // // stats.setMorphosyntacticFilter(morphosyntacticFilter);
// // //
// results = recalculate(minCorpus, stats); // // results = recalculate(minCorpus, stats);
// // //
// // since min corpus doesn't contain Sozem, results should be empty // // // since min corpus doesn't contain Sozem, results should be empty
// assertEquals(0, results.size()); // // assertEquals(0, results.size());
// // //
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>(); // // morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Somei"); // // morphosyntacticFilter.add("Somei");
// stats.setMorphosyntacticFilter(morphosyntacticFilter); // // stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats); // // results = recalculate(minCorpus, stats);
// // //
// // since we have 1 Somei, 1 result // // // since we have 1 Somei, 1 result
// assertEquals(1, results.size()); // // assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue()); // // assertEquals(1, results.get("Somei").intValue());
// // //
// // actual filter with wildcards // // // actual filter with wildcards
// // 1gram // // // 1gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>(); // // morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("So***"); // // morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter); // // stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats); // // results = recalculate(minCorpus, stats);
// // //
// assertEquals(1, results.size()); // // assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue()); // // assertEquals(1, results.get("Somei").intValue());
// // //
// // 2gram // // // 2gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>(); // // morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n"); // // morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***"); // // morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter); // // stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats); // // results = recalculate(minCorpus, stats);
// // //
// assertEquals(1, results.size()); // // assertEquals(1, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue()); // // assertEquals(1, results.get("Ggnste-n Somei").intValue());
// // //
// // 2gram midCorpus // // // 2gram midCorpus
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY); // // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>(); // // morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n"); // // morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***"); // // morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter); // // stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(midCorpus, stats); // // results = recalculate(midCorpus, stats);
// // //
// assertEquals(2, results.size()); // // assertEquals(2, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue()); // // assertEquals(1, results.get("Ggnste-n Somei").intValue());
// assertEquals(1, results.get("Ggnste-n Sozem").intValue()); // // assertEquals(1, results.get("Ggnste-n Sozem").intValue());
// } // // }
//
private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) { // private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
// calculateForAll(corpus, stats); // // calculateForAll(corpus, stats);
return stats.getResult(); // return stats.getResult();
} // }
//
@Test // @Test
public void skipgramsTest() { // public void skipgramsTest() {
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult; // Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
//
Filter filter = new Filter(); // Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL); // filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setCalculateFor(CalculateFor.WORD); // filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax= new ArrayList<>(); // ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C"); // tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax); // filter.setTaxonomy(tax);
//
Corpus testCorpus = new Corpus(); // Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA); // testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>()); // testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>(); // ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis"); // taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// tests: // // tests:
// - bigrams // // - bigrams
filter.setNgramValue(2); // filter.setNgramValue(2);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); // StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats); // Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti")); // Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); // Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList())); // Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
assertEquals(bigrams, bigramsActual); // assertEquals(bigrams, bigramsActual);
//
// test: // // test:
// - two skip bigrams // // - two skip bigrams
filter.setNgramValue(2); // filter.setNgramValue(2);
filter.setSkipValue(2); // filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats); // Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
//
Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti")); // Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); // Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList())); // Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
assertEquals(twoSkipBigrams, twoSkipBigramsActual); // assertEquals(twoSkipBigrams, twoSkipBigramsActual);
//
// tests: // // tests:
// - trigrams // // - trigrams
filter.setNgramValue(3); // filter.setNgramValue(3);
filter.setSkipValue(null); // filter.setSkipValue(null);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats); // Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti")); // Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); // Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList())); // Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
assertEquals(trigrams, trigramsActual); // assertEquals(trigrams, trigramsActual);
//
// tests: // // tests:
// - two skip trigrams // // - two skip trigrams
filter.setNgramValue(3); // filter.setNgramValue(3);
filter.setSkipValue(2); // filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false); // stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats); // Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult(); // taxonomyResult = stats.getTaxonomyResult();
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti")); // HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); // Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList())); // Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
assertEquals(twoSkipTrigrams, twoSkipTrigramsActual); // assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
} // }
} //}
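The expected sets in skipgramsTest encode the usual k-skip-n-gram definition: for bigrams, every ordered pair of words at most skipValue positions apart counts. A minimal re-derivation of the two-skip-bigram set from the test sentence (a standalone illustration, not the repo's Ngrams code):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SkipgramDemo {
    public static void main(String[] args) {
        List<String> sentence = Arrays.asList("ker", "ima", "junak", "v", "posesti");
        int skipValue = 2;

        List<String> skipBigrams = new ArrayList<>();
        for (int i = 0; i < sentence.size() - 1; i++) {
            // pair the word at i with each word up to skipValue positions further on
            for (int j = i + 1; j <= i + 1 + skipValue && j < sentence.size(); j++) {
                skipBigrams.add(sentence.get(i) + " " + sentence.get(j));
            }
        }
        System.out.println(skipBigrams);
        // [ker ima, ker junak, ker v, ima junak, ima v, ima posesti,
        //  junak v, junak posesti, v posesti] - the set asserted in the test
    }
}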

@ -1,55 +1,55 @@
import java.io.UnsupportedEncodingException; //import java.io.UnsupportedEncodingException;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.Map; //import java.util.Map;
import java.util.concurrent.atomic.AtomicLong; //import java.util.concurrent.atomic.AtomicLong;
//
import javafx.collections.FXCollections; //import javafx.collections.FXCollections;
import org.junit.Test; //import org.junit.Test;
//
import alg.inflectedJOS.WordFormation; //import alg.inflectedJOS.WordFormation;
import alg.ngram.Ngrams; //import alg.ngram.Ngrams;
import data.*; //import data.*;
//
public class WordFormationTest { //public class WordFormationTest {
//
@Test // @Test
public void calculationTest() throws UnsupportedEncodingException { // public void calculationTest() throws UnsupportedEncodingException {
Map<String, AtomicLong> result = null; // Map<String, AtomicLong> result = null;
//
Filter filter = new Filter(); // Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL); // filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setNgramValue(1); // filter.setNgramValue(1);
//
Corpus testCorpus = new Corpus(); // Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA); // testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>()); // testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>(); // ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis"); // taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// tests: // // tests:
// - normal ngrams - word // // - normal ngrams - word
// midCorpus contains 5 words which should make for 3 3-grams // // midCorpus contains 5 words which should make for 3 3-grams
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY); // filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); // StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.josTest, stats); // Ngrams.calculateForAll(Common.josTest, stats);
result = stats.getResult(); // result = stats.getResult();
WordFormation.calculateStatistics(stats); // WordFormation.calculateStatistics(stats);
Object[][] resultArr = stats.getResultCustom(); // Object[][] resultArr = stats.getResultCustom();
String debug = ""; // String debug = "";
//
} // }
//
@Test // @Test
public void testAnything() { // public void testAnything() {
String a = "Somei"; // String a = "Somei";
String b = "SomeiD"; // String b = "SomeiD";
//
String c = a.substring(0, 5); // String c = a.substring(0, 5);
String d = b.substring(0, 5); // String d = b.substring(0, 5);
//
String debug = ""; // String debug = "";
//
} // }
//
} //}

@ -1,39 +1,39 @@
import static org.junit.Assert.*; //import static org.junit.Assert.*;
//
import org.junit.Test; //import org.junit.Test;
//
import data.Word; //import data.Word;
//
public class WordTest { //public class WordTest {
@Test // @Test
public void paddingTest() { // public void paddingTest() {
Word w1 = new Word("w1", "l1", "Somei"); // Word w1 = new Word("w1", "l1", "Somei");
Word w2 = new Word("w2", "l2", "Sometd"); // Word w2 = new Word("w2", "l2", "Sometd");
//
// w1's msd should get padded // // w1's msd should get padded
String msd1 = w1.getMsd(); // String msd1 = w1.getMsd();
String msd2 = w2.getMsd(); // String msd2 = w2.getMsd();
assertEquals(msd1.length(), msd2.length()); // assertEquals(msd1.length(), msd2.length());
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1)); // assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
//
w1 = new Word("w1", "l1", "Gp-g"); // w1 = new Word("w1", "l1", "Gp-g");
w2 = new Word("w2", "l2", "Gp-g---d"); // w2 = new Word("w2", "l2", "Gp-g---d");
//
// w1's msd should get padded // // w1's msd should get padded
msd1 = w1.getMsd(); // msd1 = w1.getMsd();
msd2 = w2.getMsd(); // msd2 = w2.getMsd();
assertEquals(msd1.length(), msd2.length()); // assertEquals(msd1.length(), msd2.length());
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1)); // assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
assertEquals(Word.PAD_CHARACTER, msd2.charAt(2)); // assertEquals(Word.PAD_CHARACTER, msd2.charAt(2));
//
} // }
//
@Test // @Test
public void cvvTest() { // public void cvvTest() {
String siAlphabet = "abcčdefghijklmnoprsštuvzž"; // String siAlphabet = "abcčdefghijklmnoprsštuvzž";
String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC"; // String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC";
//
Word w1 = new Word(siAlphabet, "l1", null); // Word w1 = new Word(siAlphabet, "l1", null);
assertEquals(siAlphabetCvv, w1.getCVVWord()); // assertEquals(siAlphabetCvv, w1.getCVVWord());
} // }
} //}
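cvvTest pins down the consonant/vowel mapping: each vowel becomes 'V', everything else 'C'. A standalone re-implementation for illustration only (in the repo this logic lives on Word as covertToCvv):

import java.util.Arrays;
import java.util.HashSet;

public class CvvDemo {
    static String covertToCvv(String s) {
        HashSet<Character> vowels = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
        char[] cs = s.toCharArray();
        for (int i = 0; i < cs.length; i++) {
            // vowels map to 'V', all other characters (incl. č, š, ž) to 'C'
            cs[i] = vowels.contains(cs[i]) ? 'V' : 'C';
        }
        return new String(cs);
    }

    public static void main(String[] args) {
        System.out.println(covertToCvv("junak")); // CVCVC
    }
}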
