Added some optimizations and new taxonomy names
parent 1c00f1a283
commit 426a9ccc46
@@ -262,7 +262,7 @@ public class XML_processing {
             if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
                     stavek.size() > 0){
-                stavek.add(new Word(c3Content, c3Content, "/"));
+                stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
             }
@@ -297,7 +297,7 @@ public class XML_processing {
                 // "word" node value
                 if (in_word) {
-                    stavek.add(new Word(characters.getData(), lemma, msd));
+                    stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
                     in_word = false;
                 }
                 break;
@@ -537,12 +537,12 @@ public class XML_processing {
                 // "word" node value
                 if (inWord) {
                     String word = characters.getData();
-                    sentence.add(new Word(word, lemma, msd));
+                    sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
                     inWord = false;
                 }
                 if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
                     String punctuation = characters.getData();
-                    sentence.add(new Word(punctuation, punctuation, "/"));
+                    sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
                     inPunctuation = false;

                     // String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
                     // GOSCorpusHM.put(GOSCorpusHMKey, sentence);
                     String word = "";
                     Characters characters = event.asCharacters();
-                    sentence.add(new Word(characters.getData(), "", ""));
+                    sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
                 // if algorithm is in normalized part find orthodox word and add other info to it
                 } else {
                     Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
                     // System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
                     if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
                         Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
-                        currentWord.setLemma(lemma);
-                        currentWord.setMsd(msd);
-                        currentWord.setNormalizedWord(characters.getData());
+                        currentWord.setLemma(lemma, stats.getFilter().getWordParts());
+                        currentWord.setMsd(msd, stats.getFilter().getWordParts());
+                        currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());

                         wordIndex += 1;

                         // when a word is separated from one to many we have to create these duplicates
                         if (inSeparatedWord){
-                            GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
+                            GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
+                                    "", "", "", stats.getFilter()));
                         }
                     } //else {
                     //    System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {

        // if we're calculating values for letters, omit words that are shorter than string length
        if (filter.getNgramValue() == 0) {
-            sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
-                    || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
+            sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
+                    || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
        }
    }
@@ -912,4 +913,38 @@ public class XML_processing {

        return atts;
    }
+
+    private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
+        List<String> wString = new ArrayList<>();
+        if (f.getWordParts().contains(CalculateFor.WORD))
+            wString.add(word);
+        if (f.getWordParts().contains(CalculateFor.LEMMA))
+            wString.add(lemma);
+        if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
+            wString.add(msd);
+        if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
+            wString.add(normalizedWord);
+
+        // find appropriate strings and put them in word
+        Word w;
+
+        switch (f.getWordParts().size()) {
+            case 1:
+                w = new Word1(wString.get(0));
+                break;
+            case 2:
+                w = new Word2(wString.get(0), wString.get(1));
+                break;
+            case 3:
+                w = new Word3(wString.get(0), wString.get(1), wString.get(2));
+                break;
+            case 4:
+                w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
+                break;
+            default:
+                w = null;
+        }
+        return w;
+    }
 }
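Note on the new factory: createWord stores only the word parts selected in the Filter, in the order they appear in getWordParts(), so every accessor later needs that same list to resolve the right slot. A minimal usage sketch under that assumption (the two-part configuration and the sample values are made up for illustration):

    // Hypothetical configuration: getWordParts() == [WORD, LEMMA]
    ArrayList<CalculateFor> parts = new ArrayList<>(
            Arrays.asList(CalculateFor.WORD, CalculateFor.LEMMA));

    Word w = new Word2("hiše", "hiša");   // slot 0 holds the word, slot 1 the lemma

    w.getWord(parts);    // WORD is at index 0 in parts -> getW1() -> "hiše"
    w.getLemma(parts);   // LEMMA is at index 1 -> getW2() -> "hiša"
    w.getMsd(parts);     // MSD was not selected -> null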
@@ -1,67 +1,67 @@
-package alg.inflectedJOS;
-
-import java.util.List;
-import java.util.concurrent.RecursiveAction;
-
-import data.Sentence;
-import data.Statistics;
-
-public class ForkJoin extends RecursiveAction {
-    private static final long serialVersionUID = -1260951004477299634L;
-
-    private static final int ACCEPTABLE_SIZE = 1000;
-    private List<Sentence> corpus;
-    private Statistics stats;
-    private int start;
-    private int end;
-
-
-    /**
-     * Constructor for subproblems.
-     */
-    private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
-        this.corpus = corpus;
-        this.start = start;
-        this.end = end;
-        this.stats = stats;
-    }
-
-    /**
-     * Default constructor for the initial problem
-     */
-    public ForkJoin(List<Sentence> corpus, Statistics stats) {
-        this.corpus = corpus;
-        this.start = 0;
-        this.end = corpus.size();
-        this.stats = stats;
-    }
-
-    private void computeDirectly() {
-        List<Sentence> subCorpus = corpus.subList(start, end);
-
-        if (stats.isTaxonomySet()) {
-            InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
-        } else {
-            InflectedJOSCount.calculateForAll(subCorpus, stats, null);
-        }
-    }
-
-    @Override
-    protected void compute() {
-        int subCorpusSize = end - start;
-
-        if (subCorpusSize < ACCEPTABLE_SIZE) {
-            computeDirectly();
-        } else {
-            int mid = start + subCorpusSize / 2;
-            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
-            ForkJoin right = new ForkJoin(corpus, mid, end, stats);
-
-            // fork (push to queue)-> compute -> join
-            left.fork();
-            right.fork();
-            left.join();
-            right.join();
-        }
-    }
-}
+//package alg.inflectedJOS;
+//
+//import java.util.List;
+//import java.util.concurrent.RecursiveAction;
+//
+//import data.Sentence;
+//import data.Statistics;
+//
+//public class ForkJoin extends RecursiveAction {
+//    private static final long serialVersionUID = -1260951004477299634L;
+//
+//    private static final int ACCEPTABLE_SIZE = 1000;
+//    private List<Sentence> corpus;
+//    private Statistics stats;
+//    private int start;
+//    private int end;
+//
+//
+//    /**
+//     * Constructor for subproblems.
+//     */
+//    private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
+//        this.corpus = corpus;
+//        this.start = start;
+//        this.end = end;
+//        this.stats = stats;
+//    }
+//
+//    /**
+//     * Default constructor for the initial problem
+//     */
+//    public ForkJoin(List<Sentence> corpus, Statistics stats) {
+//        this.corpus = corpus;
+//        this.start = 0;
+//        this.end = corpus.size();
+//        this.stats = stats;
+//    }
+//
+//    private void computeDirectly() {
+//        List<Sentence> subCorpus = corpus.subList(start, end);
+//
+//        if (stats.isTaxonomySet()) {
+//            InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
+//        } else {
+//            InflectedJOSCount.calculateForAll(subCorpus, stats, null);
+//        }
+//    }
+//
+//    @Override
+//    protected void compute() {
+//        int subCorpusSize = end - start;
+//
+//        if (subCorpusSize < ACCEPTABLE_SIZE) {
+//            computeDirectly();
+//        } else {
+//            int mid = start + subCorpusSize / 2;
+//            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
+//            ForkJoin right = new ForkJoin(corpus, mid, end, stats);
+//
+//            // fork (push to queue)-> compute -> join
+//            left.fork();
+//            right.fork();
+//            left.join();
+//            right.join();
+//        }
+//    }
+//}
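Aside on the fork/join idiom in the class being commented out above: forking both halves and then joining both leaves the current worker waiting instead of working. The conventional RecursiveAction pattern computes one half inline; a sketch against the same fields (not part of this commit):

    @Override
    protected void compute() {
        int subCorpusSize = end - start;

        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
        } else {
            int mid = start + subCorpusSize / 2;
            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
            ForkJoin right = new ForkJoin(corpus, mid, end, stats);

            left.fork();      // queue the left half for another worker
            right.compute();  // process the right half on this thread
            left.join();      // then wait for the left half
        }
    }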
@@ -1,170 +1,170 @@
-package alg.inflectedJOS;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.commons.lang3.StringUtils;
-
-import alg.Common;
-import data.Sentence;
-import data.Statistics;
-import data.StatisticsNew;
-import data.Word;
-
-public class InflectedJOSCount {
-
-    public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
-
-//    static {
-//        // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
-//        indices = new HashMap<>();
-//        for (int i = 5; i <= 8; i++) {
-//            indices.put(i, calculateCombinations(i));
-//        }
-//    }
-//
-//    private static List<Integer> calculateCombinations(int i) {
-//        int arr[] = {1, 2, 3, 4, 5};
-//        int r = 3;
-//        int n = arr.length;
-//        ArrayList<ArrayList<Integer>> result = new ArrayList<>();
-//
-//        return printCombination(arr, n, r);
-//    }
-//
-//    /* arr[] ---> Input Array
-//       data[] ---> Temporary array to store current combination
-//       start & end ---> Staring and Ending indexes in arr[]
-//       index ---> Current index in data[]
-//       r ---> Size of a combination to be printed */
-//    static void combinationUtil(int arr[], int data[], int start,
-//                                int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
-//        // Current combination is ready to be printed, print it
-//        ArrayList<Integer> tmpResult = new ArrayList<>();
-//
-//        if (index == r) {
-//            ArrayList<Integer> tmpResult = new ArrayList<>();
-//            for (int j = 0; j < r; j++)
-//                System.out.print(data[j] + " ");
-//            System.out.println("");
-//            return;
-//        }
-//
-//        // replace index with all possible elements. The condition
-//        // "end-i+1 >= r-index" makes sure that including one element
-//        // at index will make a combination with remaining elements
-//        // at remaining positions
-//        for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
-//            data[index] = arr[i];
-//            combinationUtil(arr, data, i + 1, end, index + 1, r);
-//        }
-//    }
-//
-//    // The main function that prints all combinations of size r
-//    // in arr[] of size n. This function mainly uses combinationUtil()
-//    static void printCombination(int arr[], int n, int r) {
-//        // A temporary array to store all combination one by one
-//        int data[] = new int[r];
-//
-//        // Print all combination using temprary array 'data[]'
-//        combinationUtil(arr, data, 0, n - 1, 0, r);
-//    }
-
-//    public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
-//        for (Sentence s : corpus) {
-//            // disregard if wrong taxonomy
-//            if (!(s.getTaxonomy().startsWith(taxonomy))) {
-//                continue;
-//            }
-//
-//            calculateCommon(s, stats.result);
-//
-//            for (Word word : s.getWords()) {
-//                // skip if current word is not inflected
-//                if (!(word.getMsd().length() > 0)) {
-//                    continue;
-//                }
-//
-//                String msd = word.getMsd();
-//
-//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-//
-//                for (int i = 1; i < msd.length(); i++) {
-//                    entry.setCharAt(i, msd.charAt(i));
-//                    Common.updateMap(stats.result, entry.toString());
-//                    entry.setCharAt(i, '-');
-//                }
-//            }
-//        }
-//    }
-
-//    public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
-//        for (Sentence s : corpus) {
-//            for (Word word : s.getWords()) {
-//                if (!(word.getMsd().length() > 0)) {
-//                    continue;
-//                }
-//
-//                String msd = word.getMsd();
-//
-//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-//
-//                for (int i = 1; i < msd.length(); i++) {
-//                    entry.setCharAt(i, msd.charAt(i));
-//                    Common.updateMap(stats.result, entry.toString());
-//                    entry.setCharAt(i, '-');
-//                }
-//            }
-//        }
-//    }
-
-    static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
-        for (Sentence s : corpus) {
-            // disregard if wrong taxonomy
-//            if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
-//                continue;
+//package alg.inflectedJOS;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//
+//import org.apache.commons.lang3.StringUtils;
+//
+//import alg.Common;
+//import data.Sentence;
+//import data.Statistics;
+//import data.StatisticsNew;
+//import data.Word;
+//
+//public class InflectedJOSCount {
+//
+//    public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
+//
+//    // static {
+//    //     // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
+//    //     indices = new HashMap<>();
+//    //     for (int i = 5; i <= 8; i++) {
+//    //         indices.put(i, calculateCombinations(i));
+//    //     }
+//    // }
+//    //
+//    // private static List<Integer> calculateCombinations(int i) {
+//    //     int arr[] = {1, 2, 3, 4, 5};
+//    //     int r = 3;
+//    //     int n = arr.length;
+//    //     ArrayList<ArrayList<Integer>> result = new ArrayList<>();
+//    //
+//    //     return printCombination(arr, n, r);
+//    // }
+//    //
+//    // /* arr[] ---> Input Array
+//    //    data[] ---> Temporary array to store current combination
+//    //    start & end ---> Staring and Ending indexes in arr[]
+//    //    index ---> Current index in data[]
+//    //    r ---> Size of a combination to be printed */
+//    // static void combinationUtil(int arr[], int data[], int start,
+//    //                             int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
+//    //     // Current combination is ready to be printed, print it
+//    //     ArrayList<Integer> tmpResult = new ArrayList<>();
+//    //
+//    //     if (index == r) {
+//    //         ArrayList<Integer> tmpResult = new ArrayList<>();
+//    //         for (int j = 0; j < r; j++)
+//    //             System.out.print(data[j] + " ");
+//    //         System.out.println("");
+//    //         return;
+//    //     }
+//    //
+//    //     // replace index with all possible elements. The condition
+//    //     // "end-i+1 >= r-index" makes sure that including one element
+//    //     // at index will make a combination with remaining elements
+//    //     // at remaining positions
+//    //     for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
+//    //         data[index] = arr[i];
+//    //         combinationUtil(arr, data, i + 1, end, index + 1, r);
+//    //     }
+//    // }
+//    //
+//    // // The main function that prints all combinations of size r
+//    // // in arr[] of size n. This function mainly uses combinationUtil()
+//    // static void printCombination(int arr[], int n, int r) {
+//    //     // A temporary array to store all combination one by one
+//    //     int data[] = new int[r];
+//    //
+//    //     // Print all combination using temprary array 'data[]'
+//    //     combinationUtil(arr, data, 0, n - 1, 0, r);
+//    // }
+//
+//    // public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//    //     for (Sentence s : corpus) {
+//    //         // disregard if wrong taxonomy
+//    //         if (!(s.getTaxonomy().startsWith(taxonomy))) {
+//    //             continue;
+//    //         }
+//    //
+//    //         calculateCommon(s, stats.result);
+//    //
+//    //         for (Word word : s.getWords()) {
+//    //             // skip if current word is not inflected
+//    //             if (!(word.getMsd().length() > 0)) {
+//    //                 continue;
+//    //             }
+//    //
+//    //             String msd = word.getMsd();
+//    //
+//    //             StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//    //
+//    //             for (int i = 1; i < msd.length(); i++) {
+//    //                 entry.setCharAt(i, msd.charAt(i));
+//    //                 Common.updateMap(stats.result, entry.toString());
+//    //                 entry.setCharAt(i, '-');
+//    //             }
+//    //         }
+//    //     }
+//    // }
+//
+//    // public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
+//    //     for (Sentence s : corpus) {
+//    //         for (Word word : s.getWords()) {
+//    //             if (!(word.getMsd().length() > 0)) {
+//    //                 continue;
+//    //             }
+//    //
+//    //             String msd = word.getMsd();
+//    //
+//    //             StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//    //
+//    //             for (int i = 1; i < msd.length(); i++) {
+//    //                 entry.setCharAt(i, msd.charAt(i));
+//    //                 Common.updateMap(stats.result, entry.toString());
+//    //                 entry.setCharAt(i, '-');
+//    //             }
+//    //         }
+//    //     }
+//    // }
+//
+//    static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//        for (Sentence s : corpus) {
+//            // disregard if wrong taxonomy
+////            if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
+////                continue;
+////            }
+//
+//            for (Word word : s.getWords()) {
+//                // skip if current word is not inflected
+//                if (!(word.getMsd().length() > 0)) {
+//                    continue;
+//                }
+//
+//                String msd = word.getMsd();
+//
+//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//                for (int i = 1; i < msd.length(); i++) {
+//                    entry.setCharAt(i, msd.charAt(i));
+//                    Common.updateMap(stats.result, entry.toString());
+//                    entry.setCharAt(i, '-');
+//                }
+//            }
-
-            for (Word word : s.getWords()) {
-                // skip if current word is not inflected
-                if (!(word.getMsd().length() > 0)) {
-                    continue;
-                }
-
-                String msd = word.getMsd();
-
-                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-
-                for (int i = 1; i < msd.length(); i++) {
-                    entry.setCharAt(i, msd.charAt(i));
-                    Common.updateMap(stats.result, entry.toString());
-                    entry.setCharAt(i, '-');
-                }
-            }
-        }
-    }
-
-    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
-        for (Sentence s : corpus) {
-
-            for (Word word : s.getWords()) {
-                // skip if current word is not inflected
-                // // TODO: if has defined msd and is of correct type (create a set)
-                // if (!(word.getMsd().length() > 0)) {
-                //     continue;
-                // }
-
-                String msd = word.getMsd();
-
-                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-
-                for (int i = 1; i < msd.length(); i++) {
-                    entry.setCharAt(i, msd.charAt(i));
-                    stats.updateResults(entry.toString());
-                    entry.setCharAt(i, '-');
-                }
-            }
-        }
-    }
-}
+//        }
+//    }
+//
+//    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
+//        for (Sentence s : corpus) {
+//
+//            for (Word word : s.getWords()) {
+//                // skip if current word is not inflected
+//                // // TODO: if has defined msd and is of correct type (create a set)
+//                // if (!(word.getMsd().length() > 0)) {
+//                //     continue;
+//                // }
+//
+//                String msd = word.getMsd();
+//
+//                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//                for (int i = 1; i < msd.length(); i++) {
+//                    entry.setCharAt(i, msd.charAt(i));
+//                    stats.updateResults(entry.toString());
+//                    entry.setCharAt(i, '-');
+//                }
+//            }
+//        }
+//    }
+//}
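For readers new to the MSD loop in the class retired above: for each inflected word it emits one counting key per attribute position, keeping the category character and masking everything else with '-'. A worked example lifted directly from that loop (the tag "Sometd" is illustrative):

    // For msd = "Sometd" the loop counts, in order:
    //   "So----", "S-m---", "S--e--", "S---t-", "S----d"
    String msd = "Sometd";
    StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', msd.length() - 1));
    for (int i = 1; i < msd.length(); i++) {
        entry.setCharAt(i, msd.charAt(i));
        System.out.println(entry);   // stand-in for stats.updateResults(entry.toString())
        entry.setCharAt(i, '-');
    }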
@@ -43,12 +43,12 @@ public class Ngrams {
                List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());

                // if msd regex is set and this candidate doesn't pass it, skip this iteration
-                if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
+                if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
                    continue;
                }

                // generate proper MultipleHMKeys depending on filter data
-                String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
+                String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

                // if last letter is ',' erase it
@@ -67,14 +67,14 @@ public class Ngrams {
                        multipleKeys = new MultipleHMKeys1(key);
                        break;
                    case 1:
-                        String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
+                        String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                        // if (stats.getFilter().getNotePunctuations())
                        //    k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
                        multipleKeys = new MultipleHMKeys2(key, k1_2);
                        break;
                    case 2:
-                        String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
-                        String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
+                        String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                        String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                        // if (stats.getFilter().getNotePunctuations()) {
                        //    k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
                        //    k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -82,9 +82,9 @@ public class Ngrams {
                        multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
                        break;
                    case 3:
-                        String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
-                        String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
-                        String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
+                        String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                        String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+                        String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                        // if (stats.getFilter().getNotePunctuations()) {
                        //    k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
                        //    k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -93,10 +93,10 @@ public class Ngrams {
                        multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
                        break;
                    case 4:
-                        String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
-                        String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
-                        String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
-                        String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
+                        String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                        String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+                        String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+                        String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
                        // if (stats.getFilter().getNotePunctuations()) {
                        //    k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
                        //    k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -137,7 +137,7 @@ public class Ngrams {
    /**
     * Checks whether an ngram candidate passes specified regex filter.
     */
-    private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
+    private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
        if (ngramCandidate.size() != regex.size()) {
            logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
            return false;
@@ -145,7 +145,7 @@ public class Ngrams {

        for (int i = 0; i < regex.size(); i++) {
            //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-            if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
+            if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
                return false;
            }
        }
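The appended ".*" makes each per-position pattern a prefix match against the word's MSD, so a short filter such as "So" accepts any common-noun tag regardless of the trailing attributes. A small illustration (patterns and tags invented):

    boolean nounOk = "Sometd".matches("So" + ".*");  // true: prefix matches
    boolean verbOk = "Gp-ste".matches("So" + ".*");  // false: different category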
|
@@ -153,33 +153,33 @@ public class Ngrams {
        return true;
    }

-    private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
+    private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
        ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());

        switch (calculateFor) {
            case LEMMA:
                candidate.addAll(ngramCandidate
                        .stream()
-                        .map(Word::getLemma)
+                        .map(w -> w.getLemma(wordParts))
                        .collect(Collectors.toList()));
                return StringUtils.join(candidate, " ");
            case WORD:
                candidate.addAll(ngramCandidate
                        .stream()
-                        .map(Word::getWord)
+                        .map(w -> w.getWord(wordParts))
                        .collect(Collectors.toList()));
                return StringUtils.join(candidate, " ");
            case MORPHOSYNTACTIC_SPECS:
            case MORPHOSYNTACTIC_PROPERTY:
                candidate.addAll(ngramCandidate
                        .stream()
-                        .map(Word::getMsd)
+                        .map(w -> w.getMsd(wordParts))
                        .collect(Collectors.toList()));
                return StringUtils.join(candidate, " ");
            case WORD_TYPE:
                candidate.addAll(ngramCandidate
                        .stream()
-                        .map(w -> Character.toString(w.getMsd().charAt(0)))
+                        .map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
                        .collect(Collectors.toList()));
                // candidate.addAll(ngramCandidate
                //        .stream()
@@ -190,7 +190,7 @@ public class Ngrams {
            case NORMALIZED_WORD:
                candidate.addAll(ngramCandidate
                        .stream()
-                        .map(Word::getNormalizedWord)
+                        .map(w -> w.getNormalizedWord(wordParts))
                        .collect(Collectors.toList()));
                return StringUtils.join(candidate, " ");
        }
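wordToString simply projects each Word of the candidate onto one part and joins with spaces. With the same assumed two-part configuration as in the earlier note (sample values invented):

    // Hypothetical bigram candidate with getWordParts() == [WORD, LEMMA]:
    List<Word> ngram = Arrays.asList(new Word2("velike", "velik"), new Word2("hiše", "hiša"));
    // wordToString(ngram, CalculateFor.WORD, parts)  -> "velike hiše"
    // wordToString(ngram, CalculateFor.LEMMA, parts) -> "velik hiša"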
|
@@ -208,14 +208,14 @@ public class Ngrams {
        for (Sentence s : corpus) {
            for (Word w : s.getWords()) {
                List<String> taxonomy = s.getTaxonomy();
-                String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
+                String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());

                // skip this iteration if:
                // - word doesn't contain a proper version (missing lemma for example)
                // - msd regex is given but this word's msd doesn't match it, skip this iteration
                // - given substring length is larger than the word length
                if (ValidationUtil.isEmpty(word)
-                        || stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
+                        || stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
                        || word.length() < stats.getFilter().getStringLength()) {
                    continue;
                }
@@ -331,7 +331,7 @@ public class Ngrams {

    private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
        // count if no regex is set or if it is & candidate passes it
-        if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
+        if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
            // String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
            // key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
            // stats.updateTaxonomyResults(new MultipleHMKeys1(key),
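For orientation: a skipgram candidate is an n-gram whose members may skip over intermediate words, and each candidate is then validated exactly like a regular n-gram (regex first, then key construction). A toy generator, not the repo's actual one, for n=2 with at most one skip:

    String[] s = {"a", "b", "c"};
    for (int i = 0; i < s.length - 1; i++)
        for (int j = i + 1; j <= Math.min(i + 2, s.length - 1); j++)
            System.out.println(s[i] + " " + s[j]);   // prints: a b, a c, b c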
|
@@ -340,7 +340,7 @@ public class Ngrams {

            ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

-            String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
+            String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

            // if last letter is ',' erase it
@@ -359,14 +359,14 @@ public class Ngrams {
                    multipleKeys = new MultipleHMKeys1(key);
                    break;
                case 1:
-                    String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
+                    String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
                    // if (stats.getFilter().getNotePunctuations())
                    //    k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
                    multipleKeys = new MultipleHMKeys2(key, k1_2);
                    break;
                case 2:
-                    String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-                    String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
+                    String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                    String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
                    // if (stats.getFilter().getNotePunctuations()) {
                    //    k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
                    //    k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -374,9 +374,9 @@ public class Ngrams {
                    multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
                    break;
                case 3:
-                    String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-                    String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-                    String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
+                    String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                    String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+                    String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
                    // if (stats.getFilter().getNotePunctuations()) {
                    //    k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
                    //    k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -385,10 +385,10 @@ public class Ngrams {
                    multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
                    break;
                case 4:
-                    String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-                    String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-                    String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
-                    String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
+                    String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+                    String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+                    String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+                    String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
                    // if (stats.getFilter().getNotePunctuations()) {
                    //    k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
                    //    k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -10,84 +10,84 @@ import data.Sentence;
 import data.Statistics;
 import data.Word;

-class WordCount {
-    private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
-        for (Sentence s : corpus) {
-            List<String> sentence = new ArrayList<>(s.getWords().size());
-
-            if (stats.getCf() == CalculateFor.LEMMA) {
-                sentence.addAll(s.getWords()
-                        .stream()
-                        .map(Word::getLemma)
-                        .collect(Collectors.toList()));
-            } else if (stats.getCf() == CalculateFor.WORD) {
-                sentence.addAll(s.getWords()
-                        .stream()
-                        .map(Word::getWord)
-                        .collect(Collectors.toList()));
-            }
-
-            for (String word : sentence) {
-                Common.updateMap(stats.result, word);
-            }
-        }
-    }
-
-    private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
-        for (Sentence s : corpus) {
-            List<String> sentence = new ArrayList<>(s.getWords().size());
-
-            if (stats.getCf() == CalculateFor.LEMMA) {
-                sentence.addAll(s.getWords()
-                        .stream()
-                        .map(Word::getCVVLemma)
-                        .collect(Collectors.toList()));
-            } else if (stats.getCf() == CalculateFor.WORD) {
-                sentence.addAll(s.getWords()
-                        .stream()
-                        .map(Word::getCVVWord)
-                        .collect(Collectors.toList()));
-            }
-
-            for (String word : sentence) {
-                if (word.length() > stats.getSubstringLength()) {
-                    for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
-                        String substring = word.substring(i, i + stats.getSubstringLength());
-                        Common.updateMap(stats.result, substring);
-                    }
-                }
-            }
-        }
-    }
-
-    private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
-        for (Sentence s : corpus) {
-            List<String> sentence = new ArrayList<>(s.getWords().size());
-            List<Word> filteredWords = new ArrayList<>();
-
-            for (Word word : s.getWords()) {
-                if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
-                    filteredWords.add(word);
-                }
-            }
-
-            if (stats.getCf() == CalculateFor.LEMMA) {
-                sentence.addAll(filteredWords
-                        .stream()
-                        .map(Word::getLemma)
-                        .collect(Collectors.toList()));
-            } else if (stats.getCf() == CalculateFor.WORD) {
-                sentence.addAll(filteredWords
-                        .stream()
-                        .map(Word::getWord)
-                        .collect(Collectors.toList()));
-            }
-
-            for (String word : sentence) {
-                Common.updateMap(stats.result, word);
-            }
-        }
-    }
+//class WordCount {
+//    private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                Common.updateMap(stats.result, word);
+//            }
+//        }
+//    }
+//
+//    private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getCVVLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(s.getWords()
+//                        .stream()
+//                        .map(Word::getCVVWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                if (word.length() > stats.getSubstringLength()) {
+//                    for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
+//                        String substring = word.substring(i, i + stats.getSubstringLength());
+//                        Common.updateMap(stats.result, substring);
+//                    }
+//                }
+//            }
+//        }
+//    }
+//
+//    private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
+//        for (Sentence s : corpus) {
+//            List<String> sentence = new ArrayList<>(s.getWords().size());
+//            List<Word> filteredWords = new ArrayList<>();
+//
+//            for (Word word : s.getWords()) {
+//                if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
+//                    filteredWords.add(word);
+//                }
+//            }
+//
+//            if (stats.getCf() == CalculateFor.LEMMA) {
+//                sentence.addAll(filteredWords
+//                        .stream()
+//                        .map(Word::getLemma)
+//                        .collect(Collectors.toList()));
+//            } else if (stats.getCf() == CalculateFor.WORD) {
+//                sentence.addAll(filteredWords
+//                        .stream()
+//                        .map(Word::getWord)
+//                        .collect(Collectors.toList()));
+//            }
+//
+//            for (String word : sentence) {
+//                Common.updateMap(stats.result, word);
+//            }
+//        }
+//    }

 //    private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
 //        for (Sentence s : corpus) {
@@ -164,4 +164,4 @@ class WordCount {
 //            }
 //        }
 //    }
-}
+//}
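The substring loop in the retired calculateVCC slides a fixed-length window over each consonant/vowel-coded word. Isolating just that windowing (values invented):

    String word = "CVCCV";
    int len = 3;   // stands in for stats.getSubstringLength()
    if (word.length() > len) {
        for (int i = 0; i <= word.length() - len; i++) {
            System.out.println(word.substring(i, i + len));   // CVC, VCC, CCV
        }
    }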
@@ -34,8 +34,8 @@ public class WordLevel {
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            for (Word word : s.getWords()) {
-                calculateForSuffixes(word.getWord(), stats);
-                calculateForPrefixes(word.getWord(), stats);
+                calculateForSuffixes(word.getWord(stats.getFilter().getWordParts()), stats);
+                calculateForPrefixes(word.getWord(stats.getFilter().getWordParts()), stats);
            }
        }
    }
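calculateForSuffixes and calculateForPrefixes are not shown in this diff; the change above only threads the word-parts list through the new getWord accessor. As a purely hypothetical sketch of the kind of counting such a helper could do (not the repo's actual code):

    // Hypothetical: count all suffixes up to length 3 of one surface form.
    static void countSuffixes(String word, java.util.Map<String, Integer> counts) {
        for (int k = 1; k <= 3 && k <= word.length(); k++) {
            String suffix = word.substring(word.length() - k);
            counts.merge(suffix, 1, Integer::sum);
        }
    }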
@@ -8,6 +8,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;

+import javafx.collections.FXCollections;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -15,6 +16,7 @@ import org.apache.logging.log4j.Logger;
 import data.Enums.solar.SolarFilters;
 import gui.ValidationUtil;
 import javafx.collections.ObservableList;
+import org.controlsfx.control.CheckComboBox;

 public class Corpus {
    public final static Logger logger = LogManager.getLogger(Corpus.class);
@@ -82,6 +84,11 @@ public class Corpus {
    public ObservableList<String> getTaxonomy() {
        return taxonomy;
    }
+//
+//    public ObservableList<String> getFormattedTaxonomy() {
+//        ArrayList<String> al = Tax.getTaxonomyFormatted(new ArrayList<>(taxonomy), corpusType);
+//        return FXCollections.observableArrayList(al);
+//    }

    public void setTaxonomy(ObservableList<String> taxonomy) {
        this.taxonomy = taxonomy;
@@ -2,10 +2,7 @@ package data;

 import static data.Filter.filterName.*;

-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
+import java.util.*;
 import java.util.regex.Pattern;

 import gui.ValidationUtil;
@@ -17,6 +14,7 @@ public class Filter {
    public enum filterName {
        ANALYSIS_LEVEL,
        CALCULATE_FOR,
+        WORD_PARTS,
        NGRAM_VALUE,
        SKIP_VALUE,
        IS_CVV,
@@ -36,6 +34,7 @@ public class Filter {
    public Filter() {
        filter = new HashMap<>();
        filter.put(WRITE_MSD_AT_THE_END, false);
+        filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
    }

    public Filter(AnalysisLevel al, CalculateFor cf) {
@@ -43,6 +42,10 @@ public class Filter {

        filter.put(ANALYSIS_LEVEL, al);
        filter.put(CALCULATE_FOR, cf);
+
+        filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
+        addWordPart(cf);
+
        filter.put(WRITE_MSD_AT_THE_END, false);
    }

@@ -56,6 +59,8 @@ public class Filter {

    public void setCalculateFor(CalculateFor cf) {
        filter.put(CALCULATE_FOR, cf);
+        filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
+        addWordPart(cf);
    }

    public CalculateFor getCalculateFor() {
@@ -137,6 +142,8 @@ public class Filter {

    public void setHasMsd(boolean hasMsd) {
        filter.put(HAS_MSD, hasMsd);
+        if (hasMsd)
+            addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
    }

    public boolean hasMsd() {
@@ -170,7 +177,9 @@ public class Filter {
        ArrayList<CalculateFor> newKeys = new ArrayList<>();
        if (keys != null) {
            for (String key : keys) {
-                newKeys.add(CalculateFor.factory(key));
+                CalculateFor cf = CalculateFor.factory(key);
+                newKeys.add(cf);
+                addWordPart(cf);
            }
        }

@@ -185,6 +194,14 @@ public class Filter {
        }
    }

+    public ArrayList<CalculateFor> getWordParts() {
+        if (filter.containsKey(WORD_PARTS) && filter.get(WORD_PARTS) != null) {
+            return (ArrayList<CalculateFor>) filter.get(WORD_PARTS);
+        } else {
+            return new ArrayList<>();
+        }
+    }
+
    public void setNotePunctuations(boolean notePunctuations) {
        filter.put(NOTE_PUNCTUATIONS, notePunctuations);
    }
@@ -209,4 +226,32 @@ public class Filter {
    public Integer getMinimalTaxonomy() {
        return (Integer) filter.get(MINIMAL_TAXONOMY);
    }
+
+    private void addWordPart(CalculateFor wp){
+        ArrayList<CalculateFor> oldWp = ((ArrayList<CalculateFor>) filter.get(WORD_PARTS));
+
+        switch (wp) {
+            case WORD:
+            case DIST_WORDS:
+                if (!oldWp.contains(CalculateFor.WORD))
+                    oldWp.add(CalculateFor.WORD);
+                break;
+            case LEMMA:
+            case DIST_LEMMAS:
+                if (!oldWp.contains(CalculateFor.LEMMA))
+                    oldWp.add(CalculateFor.LEMMA);
+                break;
+            case MORPHOSYNTACTIC_PROPERTY:
+            case MORPHOSYNTACTIC_SPECS:
+            case WORD_TYPE:
+                if (!oldWp.contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
+                    oldWp.add(CalculateFor.MORPHOSYNTACTIC_SPECS);
+                break;
+            case NORMALIZED_WORD:
+                if (!oldWp.contains(CalculateFor.NORMALIZED_WORD))
+                    oldWp.add(CalculateFor.NORMALIZED_WORD);
+                break;
+        }
+
+    }
 }
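The net effect of the WORD_PARTS bookkeeping: every way a part can become relevant (the primary CalculateFor, extra keys via setMultipleKeys, an MSD filter via setHasMsd) funnels through addWordPart, which deduplicates and canonicalizes related values (e.g., WORD_TYPE is stored as MORPHOSYNTACTIC_SPECS). A short usage sketch:

    Filter f = new Filter();
    f.setCalculateFor(CalculateFor.WORD);   // word parts -> [WORD]
    f.setHasMsd(true);                      // -> [WORD, MORPHOSYNTACTIC_SPECS]
    f.setHasMsd(true);                      // unchanged: addWordPart deduplicates
    f.getWordParts();                       // [WORD, MORPHOSYNTACTIC_SPECS]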
@@ -16,67 +16,67 @@ public class Tax {
        // GIGAFIDA ----------------------------
        GIGAFIDA_TAXONOMY = new LinkedHashMap<>();

-        GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
-        GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
-        GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
+        GIGAFIDA_TAXONOMY.put("SSJ.T", "SSJ.T - tisk");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.K", "SSJ.T.K - tisk-knjižno");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "SSJ.T.K.L - tisk-knjižno-leposlovno");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "SSJ.T.K.S - tisk-knjižno-strokovno");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.P", "SSJ.T.P - tisk-periodično");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "SSJ.T.P.C - tisk-periodično-časopis");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "SSJ.T.P.R - tisk-periodično-revija");
+        GIGAFIDA_TAXONOMY.put("SSJ.T.D", "SSJ.T.D - tisk-drugo");
+        GIGAFIDA_TAXONOMY.put("SSJ.I", "SSJ.I - internet");

-        GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
-        GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
-        GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
-        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
+        GIGAFIDA_TAXONOMY.put("Ft.P", "Ft.P - prenosnik");
+        GIGAFIDA_TAXONOMY.put("Ft.P.G", "Ft.P.G - prenosnik-govorni");
+        GIGAFIDA_TAXONOMY.put("Ft.P.E", "Ft.P.E - prenosnik-elektronski");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P", "Ft.P.P - prenosnik-pisni");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "Ft.P.P.O - prenosnik-pisni-objavljeno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "Ft.P.P.N - prenosnik-pisni-neobjavljeno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno");
+        GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno");

-        GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
-        GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
-        GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
-        GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
-        GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");
+        GIGAFIDA_TAXONOMY.put("Ft.Z", "Ft.Z - zvrst");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.U", "Ft.Z.U - zvrst-umetnostna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "Ft.Z.U.P - zvrst-umetnostna-pesniška");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "Ft.Z.U.R - zvrst-umetnostna-prozna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "Ft.Z.U.D - zvrst-umetnostna-dramska");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N", "Ft.Z.N - zvrst-neumetnostna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "Ft.Z.N.S - zvrst-neumetnostna-strokovna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "Ft.Z.N.N - zvrst-neumetnostna-nestrokovna");
+        GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "Ft.Z.N.P - zvrst-neumetnostna-pravna");
+        GIGAFIDA_TAXONOMY.put("Ft.L", "Ft.L - zvrst-lektorirano");
+        GIGAFIDA_TAXONOMY.put("Ft.L.D", "Ft.L.D - zvrst-lektorirano-da");
+        GIGAFIDA_TAXONOMY.put("Ft.L.N", "Ft.L.N - zvrst-lektorirano-ne");

        // GOS ----------------------------------
        GOS_TAXONOMY = new LinkedHashMap<>();

-        GOS_TAXONOMY.put("gos.T", "diskurz");
-        GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
-        GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
-        GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
-        GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
-        GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
-        GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
+        GOS_TAXONOMY.put("gos.T", "gos.T - diskurz");
+        GOS_TAXONOMY.put("gos.T.J", "gos.T.J - diskurz-javni");
+        GOS_TAXONOMY.put("gos.T.J.I", "gos.T.J.I - diskurz-javni-informativno-izobraževalni");
+        GOS_TAXONOMY.put("gos.T.J.R", "gos.T.J.R - diskurz-javni-razvedrilni");
+        GOS_TAXONOMY.put("gos.T.N", "gos.T.N - diskurz-nejavni");
+        GOS_TAXONOMY.put("gos.T.N.N", "gos.T.N.N - diskurz-nejavni-nezasebni");
+        GOS_TAXONOMY.put("gos.T.N.Z", "gos.T.N.Z - diskurz-nejavni-zasebni");

-        GOS_TAXONOMY.put("gos.S", "situacija");
-        GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
-        GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
+        GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
+        GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
+        GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");
    }

    /**
@@ -147,6 +147,33 @@ public class Tax {
        return result;
    }

+//    public static ArrayList<String> getTaxonomyFormatted(ArrayList<String> taxonomyNames, CorpusType corpusType) {
+//        ArrayList<String> result = new ArrayList<>();
+//
+//        if (ValidationUtil.isEmpty(taxonomyNames)) {
+//            return result;
+//        }
+//
+//        LinkedHashMap<String, String> tax = new LinkedHashMap<>();
+//
+//        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
+//            tax = GIGAFIDA_TAXONOMY;
+//        } else if (corpusType == CorpusType.GOS) {
+//            tax = GOS_TAXONOMY;
+//        }
+//
+//        // for easier lookup
+//        Map<String, String> taxInversed = tax.entrySet()
+//                .stream()
+//                .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
+//
+//        for (String taxonomyName : taxonomyNames) {
+//            result.add(taxInversed.get(taxonomyName) + " - " + taxonomyName);
+//        }
+//
+//        return result;
+//    }
+
    /**
     * Returns a list of proper names for codes
     *
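The taxonomy rename above simply prefixes each display label with its code ("SSJ.T.K" becomes "SSJ.T.K - tisk-knjižno"), which is why getTaxonomyFormatted could be retired: the formatted string is now stored directly in the map. If a caller ever needs the bare code back, splitting on the first separator suffices:

    String formatted = "SSJ.T.K - tisk-knjižno";
    String code = formatted.split(" - ", 2)[0];   // "SSJ.T.K"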
@ -1,110 +1,94 @@
package data;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;

import org.apache.commons.lang3.StringUtils;
/*
Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously.
*/
public interface Word {
String getW1();
default String getW2(){ return null; }
default String getW3(){ return null; }
default String getW4(){ return null; }

import data.Enums.Msd;
import gui.ValidationUtil;
default String get(ArrayList<CalculateFor> wordParts, CalculateFor cf){
if (wordParts.size() > 0 && wordParts.get(0).equals(cf))
return getW1();
if (wordParts.size() > 1 && wordParts.get(1).equals(cf))
return getW2();
if (wordParts.size() > 2 && wordParts.get(2).equals(cf))
return getW3();
if (wordParts.size() > 3 && wordParts.get(3).equals(cf))
return getW4();
return null;
}

public class Word implements Serializable {
public static final char PAD_CHARACTER = '-';
default String getWord(ArrayList<CalculateFor> wordParts){
return get(wordParts, CalculateFor.WORD);
}

private String word;
private String lemma;
private String msd;
private String normalizedWord;
private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
default String getLemma(ArrayList<CalculateFor> wordParts){
return get(wordParts, CalculateFor.LEMMA);
}

/**
* Possible values:
* <p>
* <ul>
* <li>S = samostalnik (noun)</li>
* <li>G = glagol (verb)</li>
* <li>P = pridevnik (adjective)</li>
* <li>R = prislov (adverb)</li>
* <li>Z = zaimek (pronoun)</li>
* <li>K = števnik (numeral)</li>
* <li>D = predlog (preposition)</li>
* <li>V = veznik (conjunction)</li>
* <li>L = členek (particle)</li>
* <li>M = medmet (interjection)</li>
* <li>O = okrajšava (abbreviation)</li>
* <li>N = neuvrščeno (unclassified)</li>
* </ul>
*/
//private char besedna_vrsta;
public Word(String word, String lemma, String msd) {
this.lemma = lemma;
this.msd = msd; //normalizeMsd(msd);
this.normalizedWord = "";
default String getMsd(ArrayList<CalculateFor> wordParts){
return get(wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS);
}

// keep the capital letter only for proper nouns
if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
&& this.msd.length() >= 2
&& this.msd.charAt(1) == 'l')) {
this.word = word.toLowerCase();
} else {
this.word = word;
default String getNormalizedWord(ArrayList<CalculateFor> wordParts){
return get(wordParts, CalculateFor.NORMALIZED_WORD);
}

void setW1(String w);
default void setW2(String w){}
default void setW3(String w){}
default void setW4(String w){}

default void set(String w, ArrayList<CalculateFor> wordParts, CalculateFor cf){
switch(wordParts.indexOf(cf)){
case 0:
setW1(w);
break;
case 1:
setW2(w);
break;
case 2:
setW3(w);
break;
case 3:
setW4(w);
break;
}
}

public Word(String word, String lemma, String msd, String normalizedWord) {
this.lemma = lemma;
// this.msd = normalizeMsd(msd);
this.msd = msd;
this.normalizedWord = normalizedWord;

// keep the capital letter only for proper nouns
if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
&& this.msd.length() >= 2
&& this.msd.charAt(1) == 'l')) {
this.word = word.toLowerCase();
} else {
this.word = word;
}
default void setLemma(String w, ArrayList<CalculateFor> wordParts){
set(w, wordParts, CalculateFor.LEMMA);
}

public Word() {
default void setMsd(String w, ArrayList<CalculateFor> wordParts){
set(w, wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS);
}

// /**
// * Appends a number of '-' to msds which are not properly sized.
// * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
// *
// * @param msdInput
// *
// * @return
// */
// private String normalizeMsd(String msdInput) {
// if (ValidationUtil.isEmpty(msdInput)) {
// return "";
// } else {
// return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
// }
// }

public Word(String word) {
this.word = word;
default void setNormalizedWord(String w, ArrayList<CalculateFor> wordParts){
set(w, wordParts, CalculateFor.NORMALIZED_WORD);
}

public String getWord() {
return word;

default String getCVVWord(ArrayList<CalculateFor> cf) {
return covertToCvv(getWord(cf));
}

public String getCVVWord() {
return covertToCvv(word);
default String getCVVLemma(ArrayList<CalculateFor> cf) {
return covertToCvv(getLemma(cf));
}

public String getCVVLemma() {
return covertToCvv(lemma);
}
default String covertToCvv(String s) {
final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));

private String covertToCvv(String s) {
char[] StringCA = s.toCharArray();

for (int i = 0; i < StringCA.length; i++) {

@ -114,59 +98,13 @@ public class Word implements Serializable {
return new String(StringCA);
}

public void setWord(String word) {
this.word = word;
}

public String getLemma() {
return lemma;
}

public void setLemma(String lemma) {
this.lemma = lemma;
}

public String getMsd() {
return msd;
}

public void setMsd(String msd) {
this.msd = msd;
}

public String getNormalizedWord() {
return normalizedWord;
}

public void setNormalizedWord(String normalizedWord) {
this.normalizedWord = normalizedWord;
}

public String toString() {
StringBuilder sb = new StringBuilder();

sb.append("beseda:\t")
.append(getWord())
.append("\n")
.append("lema:\t")
.append(getLemma())
.append("\n")
.append("msd:\t")
.append(getMsd())
.append("normalized word:\t")
.append(getNormalizedWord())
.append("\n");

return sb.toString();
}

public String getForCf(CalculateFor calculateFor, boolean cvv) {
default String getForCf(CalculateFor calculateFor, boolean cvv, ArrayList<CalculateFor> cf) {
String returnValue = "";

if (cvv) {
returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
returnValue = calculateFor == CalculateFor.WORD ? getCVVWord(cf) : getCVVLemma(cf);
} else {
returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma();
returnValue = calculateFor == CalculateFor.WORD ? getWord(cf) : getLemma(cf);
}

return returnValue;
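The new Word interface stores up to four parallel values (w1..w4) and resolves getWord/getLemma/getMsd/getNormalizedWord by looking up the position of the requested CalculateFor in the filter's wordParts list. A minimal sketch of that dispatch, assuming wordParts was built as [WORD, LEMMA] so a Word2 carries the word form in w1 and the lemma in w2:

import java.util.ArrayList;

public class WordPartsDemo {
    public static void main(String[] args) {
        ArrayList<CalculateFor> wordParts = new ArrayList<>();
        wordParts.add(CalculateFor.WORD);
        wordParts.add(CalculateFor.LEMMA);

        Word w = new Word2("junaka", "junak");
        // get(wordParts, LEMMA) finds LEMMA at index 1 and returns w2.
        System.out.println(w.getLemma(wordParts)); // junak
        System.out.println(w.getWord(wordParts));  // junaka
    }
}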
17
src/main/java/data/Word1.java
Executable file

@ -0,0 +1,17 @@
package data;

import java.io.Serializable;

public class Word1 implements Serializable, Word {
private String w1;

public Word1(String w1) {
this.w1 = w1;
}

public String getW1() {
return w1;
}

public void setW1(String w){w1 = w;}
}
22
src/main/java/data/Word2.java
Executable file

@ -0,0 +1,22 @@
package data;

import java.io.Serializable;

public class Word2 implements Serializable, Word {
private String w1, w2;

public Word2(String w1, String w2) {
this.w1 = w1;
this.w2 = w2;
}

public String getW1() {
return w1;
}
public String getW2() {
return w2;
}

public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
}
27
src/main/java/data/Word3.java
Executable file

@ -0,0 +1,27 @@
package data;

import java.io.Serializable;

public class Word3 implements Serializable, Word {
private String w1, w2, w3;

public Word3(String w1, String w2, String w3) {
this.w1 = w1;
this.w2 = w2;
this.w3 = w3;
}

public String getW1() {
return w1;
}
public String getW2() {
return w2;
}
public String getW3() {
return w3;
}

public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
public void setW3(String w){w3 = w;}
}
32
src/main/java/data/Word4.java
Executable file

@ -0,0 +1,32 @@
package data;

import java.io.Serializable;

public class Word4 implements Serializable, Word {
private String w1, w2, w3, w4;

public Word4(String w1, String w2, String w3, String w4) {
this.w1 = w1;
this.w2 = w2;
this.w3 = w3;
this.w4 = w4;
}

public String getW1() {
return w1;
}
public String getW2() {
return w2;
}
public String getW3() {
return w3;
}
public String getW4() {
return w4;
}

public void setW1(String w){w1 = w;}
public void setW2(String w){w2 = w;}
public void setW3(String w){w3 = w;}
public void setW4(String w){w4 = w;}
}
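The Word1..Word4 variants let the pipeline allocate only as many string slots as the filter actually tracks. The factory that picks between them is not shown in these hunks; a plausible sketch under that assumption (hypothetical makeWord, assuming the parts arrive in the same order as the filter's wordParts list):

import java.util.List;

public class WordFactoryDemo {
    // Hypothetical helper: choose the smallest WordN for the tracked parts.
    static Word makeWord(List<String> parts) {
        switch (parts.size()) {
            case 1: return new Word1(parts.get(0));
            case 2: return new Word2(parts.get(0), parts.get(1));
            case 3: return new Word3(parts.get(0), parts.get(1), parts.get(2));
            case 4: return new Word4(parts.get(0), parts.get(1), parts.get(2), parts.get(3));
            default: throw new IllegalArgumentException("1-4 word parts supported");
        }
    }
}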
@ -10,6 +10,7 @@ import java.util.*;
import java.util.regex.Pattern;

import javafx.application.HostServices;
import javafx.collections.transformation.SortedList;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -380,87 +381,87 @@ public class StringAnalysisTabNew2 {
* iscvv: false
* string length: 1
*/
public void populateFields() {
// corpus changed if: current one is null (this is first run of the app)
// or if currentCorpus != gui's corpus
boolean corpusChanged = currentCorpusType == null
|| currentCorpusType != corpus.getCorpusType();

// keep ngram value if set
if (ngramValue == null) {
ngramValueCB.getSelectionModel().select("1");
ngramValue = 1;
}

// TODO: check for GOS, GIGAFIDA, SOLAR...
// refresh and:
// TODO if current value != null && is in new calculateFor ? keep : otherwise reset
if (calculateFor == null) {
calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
}

if (!filter.hasMsd()) {
// if current corpus doesn't have msd data, disable this field
msd = new ArrayList<>();
msdTF.setText("");
msdTF.setDisable(true);
logger.info("no msd data");
} else {
if (ValidationUtil.isEmpty(msd)
|| (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
// msd has not been set previously
// or msd has been set but the corpus changed -> reset
msd = new ArrayList<>();
msdTF.setText("");
msdTF.setDisable(false);
logger.info("msd reset");
} else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
// if msd has been set, but corpus type remained the same, we can keep any set msd value
msdTF.setText(StringUtils.join(msdStrings, " "));
msdTF.setDisable(false);
logger.info("msd kept");
}
}

// TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

// keep skip value
if (skipValue == null) {
skipValueCB.getSelectionModel().select("0");
skipValue = 0;
}

// keep calculateCvv
calculatecvvCB.setSelected(calculateCvv);

// keep string length if set
if (stringLength != null) {
stringLengthTF.setText(String.valueOf(stringLength));
} else {
stringLengthTF.setText("1");
stringLength = 1;
}

// TODO: trigger on rescan
if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
// user changed corpus (by type) or by selection & triggered a rescan of headers
// see if we read taxonomy from headers, otherwise use default values for given corpus
ObservableList<String> tax = corpus.getTaxonomy();
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());

currentCorpusType = corpus.getCorpusType();
// setTaxonomyIsDirty(false);
} else {

}

// see if we read taxonomy from headers, otherwise use default values for given corpus
ObservableList<String> tax = corpus.getTaxonomy();
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
taxonomyCCB.getItems().addAll(taxonomyCCBValues);

}
// public void populateFields() {
// // corpus changed if: current one is null (this is first run of the app)
// // or if currentCorpus != gui's corpus
// boolean corpusChanged = currentCorpusType == null
// || currentCorpusType != corpus.getCorpusType();
//
// // keep ngram value if set
// if (ngramValue == null) {
// ngramValueCB.getSelectionModel().select("1");
// ngramValue = 1;
// }
//
// // TODO: check for GOS, GIGAFIDA, SOLAR...
// // refresh and:
// // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
// if (calculateFor == null) {
// calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
// calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
// }
//
// if (!filter.hasMsd()) {
// // if current corpus doesn't have msd data, disable this field
// msd = new ArrayList<>();
// msdTF.setText("");
// msdTF.setDisable(true);
// logger.info("no msd data");
// } else {
// if (ValidationUtil.isEmpty(msd)
// || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
// // msd has not been set previously
// // or msd has been set but the corpus changed -> reset
// msd = new ArrayList<>();
// msdTF.setText("");
// msdTF.setDisable(false);
// logger.info("msd reset");
// } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
// // if msd has been set, but corpus type remained the same, we can keep any set msd value
// msdTF.setText(StringUtils.join(msdStrings, " "));
// msdTF.setDisable(false);
// logger.info("msd kept");
// }
// }
//
// // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)
//
// // keep skip value
// if (skipValue == null) {
// skipValueCB.getSelectionModel().select("0");
// skipValue = 0;
// }
//
// // keep calculateCvv
// calculatecvvCB.setSelected(calculateCvv);
//
// // keep string length if set
// if (stringLength != null) {
// stringLengthTF.setText(String.valueOf(stringLength));
// } else {
// stringLengthTF.setText("1");
// stringLength = 1;
// }
//
// // TODO: trigger on rescan
// if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
// // user changed corpus (by type) or by selection & triggered a rescan of headers
// // see if we read taxonomy from headers, otherwise use default values for given corpus
// ObservableList<String> tax = corpus.getTaxonomy();
// taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
//
// currentCorpusType = corpus.getCorpusType();
// // setTaxonomyIsDirty(false);
// } else {
//
// }
//
// // see if we read taxonomy from headers, otherwise use default values for given corpus
// ObservableList<String> tax = corpus.getTaxonomy();
// taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
// taxonomyCCB.getItems().addAll(taxonomyCCBValues);
//
// }

/**
* Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
@ -125,9 +125,11 @@ public class Export {
// for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
for (CalculateFor otherKey : filter.getMultipleKeys()) {
FILE_HEADER_AL.add(otherKey.toHeaderString());
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
if (num_taxonomy_frequencies.get(otherKey) > 0) {
FILE_HEADER_AL.add(otherKey.toHeaderString());
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
}
}

// if(otherKey.equals(CalculateFor.LEMMA)){
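The guard added above (skip a column when num_taxonomy_frequencies has no hits for it) has a counterpart in the row-writing hunk further down; both passes must apply the same condition, or the exported header and data columns drift apart. A minimal sketch of factoring that shared predicate out (hypothetical helper, not part of this commit):

import java.util.Map;

class ColumnGuard {
    // A taxonomy column is emitted only when the given key has at least one hit.
    static boolean includeColumn(String key, Map<String, Long> counts) {
        return !"Total".equals(key) && counts.getOrDefault(key, 0L) > 0;
    }
}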
@ -164,7 +166,7 @@ public class Export {
// }
FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");
for (String key : taxonomyResults.keySet()) {
if(!key.equals("Total")) {
if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
FILE_HEADER_AL.add("Delež [" + key + "]");
FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
@ -257,7 +259,7 @@ public class Export {
dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies));
for (String key : taxonomyResults.keySet()){
if(!key.equals("Total")) {
if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
dataEntry.add(frequency.toString());
dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key)));
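For the relative-frequency column above: with e.getValue() = 42 occurrences in num_frequencies = 2,000,000 tokens, the formula yields 42 * 1,000,000 / 2,000,000 = 21.00 occurrences per million. A self-contained check of that arithmetic (illustrative values, not from the corpus):

public class PerMillionDemo {
    public static void main(String[] args) {
        long count = 42;          // stands in for e.getValue()
        long total = 2_000_000;   // stands in for num_frequencies
        // Same formula as in the export: count scaled to one million tokens.
        System.out.println(String.format("%.2f", ((double) count * 1000000) / total)); // 21.00
    }
}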
@ -13,6 +13,7 @@
<?import javafx.scene.layout.Pane?>
<?import org.controlsfx.control.CheckComboBox?>

<?import javafx.scene.control.Separator?>
<AnchorPane fx:id="stringAnalysisTabPaneNew2" prefHeight="600.0" prefWidth="800.0" xmlns="http://javafx.com/javafx/8.0.121" xmlns:fx="http://javafx.com/fxml/1" fx:controller="gui.StringAnalysisTabNew2">
<Pane>
@ -80,7 +81,16 @@
<Label layoutX="10.0" layoutY="320.0" prefHeight="25.0" text="Oznaka MSD" />
<TextField fx:id="msdTF" layoutX="185.0" layoutY="320.0" prefWidth="180.0" />
<Label layoutX="10.0" layoutY="360.0" prefHeight="25.0" text="Taksonomija" />
<CheckComboBox fx:id="taxonomyCCB" layoutX="185.0" layoutY="360.0" prefHeight="25.0" prefWidth="180.0" />
<CheckComboBox fx:id="taxonomyCCB" layoutX="185.0" layoutY="360.0" prefHeight="25.0" prefWidth="180.0" >
<items>
<FXCollections fx:factory="observableArrayList">
<String fx:value="2" />
<String fx:value="3" />
<String fx:value="4" />
<String fx:value="5" />
</FXCollections>
</items>
</CheckComboBox>

<Label layoutX="10.0" layoutY="400.0" prefHeight="25.0" text="Min. št. pojavitev" />
<TextField fx:id="minimalOccurrencesTF" layoutX="185.0" layoutY="400.0" prefWidth="180.0" />
@ -1,87 +1,87 @@
import java.util.ArrayList;
import java.util.List;

import data.Sentence;
import data.Word;

public class Common {

public static List<Sentence> corpus;
public static List<Sentence> minCorpus;
public static List<Sentence> midCorpus;
public static List<Sentence> midCorpusSkip;
public static List<Sentence> josTest;

static {
Sentence testSentence;

// full sentence
ArrayList<String> taxonomy = new ArrayList<>();
taxonomy.add("#Ft.Z.N.N");
List<Word> words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("nekaj", "nekaj", "Rsn"));
words.add(new Word("o", "o", "Dm"));
words.add(new Word("čemer", "kar", "Zz-sem"));
words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("mu", "on", "Zotmed--k"));
words.add(new Word("ne", "ne", "L"));
words.add(new Word("sanja", "sanjati", "Ggnste"));
words.add(new Word("a", "a", "Vp"));
words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("onemu", "oni", "Zk-sed"));
words.add(new Word("zdi", "zdeti", "Ggnste"));
words.add(new Word("ključno", "ključen", "Ppnsei"));
words.add(new Word("pri", "pri", "Dm"));
words.add(new Word("operaciji", "operacija", "Sozem"));
words.add(new Word("666", "666", "Kag"));

testSentence = new Sentence(words, taxonomy);
corpus = new ArrayList<>();
corpus.add(testSentence);

// three word sentence
testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy);
minCorpus = new ArrayList<>();
minCorpus.add(testSentence);

// five word sentence
words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy);

midCorpus = new ArrayList<>();
midCorpus.add(testSentence);

// five word sentence - for skipgrams
words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy);

midCorpusSkip = new ArrayList<>();
midCorpusSkip.add(testSentence);

// JOS test
words = new ArrayList<>();
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem"));
testSentence = new Sentence(words, taxonomy);

josTest = new ArrayList<>();
josTest.add(testSentence);
}

}
//import java.util.ArrayList;
//import java.util.List;
//
//import data.Sentence;
//import data.Word;
//
//public class Common {
//
// public static List<Sentence> corpus;
// public static List<Sentence> minCorpus;
// public static List<Sentence> midCorpus;
// public static List<Sentence> midCorpusSkip;
// public static List<Sentence> josTest;
//
// static {
// Sentence testSentence;
//
// // full sentence
// ArrayList<String> taxonomy = new ArrayList<>();
// taxonomy.add("#Ft.Z.N.N");
// List<Word> words = new ArrayList<>();
// words.add(new Word("ker", "ker", "Vd"));
// words.add(new Word("ima", "imeti", "Ggnste-n"));
// words.add(new Word("junak", "junak", "Somei"));
// words.add(new Word("v", "v", "Dm"));
// words.add(new Word("posesti", "posest", "Sozem"));
// words.add(new Word("nekaj", "nekaj", "Rsn"));
// words.add(new Word("o", "o", "Dm"));
// words.add(new Word("čemer", "kar", "Zz-sem"));
// words.add(new Word("se", "se", "Zp------k"));
// words.add(new Word("mu", "on", "Zotmed--k"));
// words.add(new Word("ne", "ne", "L"));
// words.add(new Word("sanja", "sanjati", "Ggnste"));
// words.add(new Word("a", "a", "Vp"));
// words.add(new Word("se", "se", "Zp------k"));
// words.add(new Word("onemu", "oni", "Zk-sed"));
// words.add(new Word("zdi", "zdeti", "Ggnste"));
// words.add(new Word("ključno", "ključen", "Ppnsei"));
// words.add(new Word("pri", "pri", "Dm"));
// words.add(new Word("operaciji", "operacija", "Sozem"));
// words.add(new Word("666", "666", "Kag"));
//
// testSentence = new Sentence(words, taxonomy);
// corpus = new ArrayList<>();
// corpus.add(testSentence);
//
// // three word sentence
// testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy);
// minCorpus = new ArrayList<>();
// minCorpus.add(testSentence);
//
// // five word sentence
// words = new ArrayList<>();
// words.add(new Word("ker", "ker", "Vd"));
// words.add(new Word("ima", "imeti", "Ggnste-n"));
// words.add(new Word("junak", "junak", "Somei"));
// words.add(new Word("ima", "imeti", "Ggnste-n"));
// words.add(new Word("posesti", "posest", "Sozem"));
// testSentence = new Sentence(words, taxonomy);
//
// midCorpus = new ArrayList<>();
// midCorpus.add(testSentence);
//
// // five word sentence - for skipgrams
// words = new ArrayList<>();
// words.add(new Word("ker", "ker", "Vd"));
// words.add(new Word("ima", "imeti", "Ggnste-n"));
// words.add(new Word("junak", "junak", "Somei"));
// words.add(new Word("v", "v", "Dm"));
// words.add(new Word("posesti", "posest", "Sozem"));
// testSentence = new Sentence(words, taxonomy);
//
// midCorpusSkip = new ArrayList<>();
// midCorpusSkip.add(testSentence);
//
// // JOS test
// words = new ArrayList<>();
// words.add(new Word("junak", "junak", "Somei"));
// words.add(new Word("ima", "imeti", "Ggnste-n"));
// words.add(new Word("posesti", "posest", "Sozem"));
// testSentence = new Sentence(words, taxonomy);
//
// josTest = new ArrayList<>();
// josTest.add(testSentence);
// }
//
//}
@ -1,362 +1,362 @@
import static org.junit.Assert.*;

import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javafx.collections.FXCollections;
import org.junit.Test;

import alg.ngram.Ngrams;
import data.*;

@SuppressWarnings({"Duplicates", "unused"})
public class NgramTests {

@Test
public void letterNgramsTest() {
Map<String, AtomicLong> result = null;

Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setStringLength(4);
filter.setNgramValue(0); // letters
filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax = new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);

Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

// tests:
// - no regex
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.minCorpus, stats);
result = stats.getResult();

// tests:
// - algorithm skips words that are shorter than set length value
assertEquals(2, result.size());
assertTrue(result.containsKey("juna"));
assertEquals(1, result.get("juna").longValue());
assertTrue(result.containsKey("unak"));
assertEquals(1, result.get("unak").longValue());

// tests:
// - map update (count) works ok
filter.setStringLength(3);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();

assertEquals(2, result.get("ima").longValue());

// tests:
// - pre-check for the following regex test - this one should include word "ima", next one shouldn't
filter.setStringLength(3);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();

assertTrue(result.containsKey("ima"));

// tests:
// - regex: S.* // all nouns (vsi samostalniki)
ArrayList<Pattern> msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.*"));
filter.setMsd(msdRegex);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();

assertFalse(result.containsKey("ima"));

// tests:
// - more precise regex
msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
filter.setMsd(msdRegex);
filter.setStringLength(5);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();

assertFalse(result.containsKey("junak"));
assertEquals(3, result.size());

// tests:
// - trickier regex
msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
filter.setMsd(msdRegex);
filter.setStringLength(3);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();

assertEquals(1, result.size());
assertTrue(result.containsKey("ker"));
assertEquals(1, result.get("ker").longValue());
}

@Test
public void wordsNgramsTest() {
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setNgramValue(3);
ArrayList<String> tax = new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);
ArrayList<String> mKeys = new ArrayList<>();
//mKeys.add("lema");
filter.setMultipleKeys(mKeys);

Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

// tests:
// - normal ngrams - word
// midCorpus contains 5 words which should make for 3 3-grams
filter.setCalculateFor(CalculateFor.WORD);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult();

assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

// tests:
// - normal ngrams - lemmas
filter.setCalculateFor(CalculateFor.LEMMA);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult();

assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));

// tests:
// - normal ngrams - msd
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult();

assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));

// tests:
// - ngrams - word - regex filter
filter.setCalculateFor(CalculateFor.WORD);
ArrayList<Pattern> msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("S.*"));
msdRegex.add(Pattern.compile("G.*"));
msdRegex.add(Pattern.compile(".*"));
filter.setMsd(msdRegex);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult();

assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

// tests:
// - ngrams - word - regex filter
filter.setCalculateFor(CalculateFor.WORD);
filter.setNgramValue(2);
msdRegex = new ArrayList<>();
msdRegex.add(Pattern.compile("G.*"));
msdRegex.add(Pattern.compile("Some.*"));
filter.setMsd(msdRegex);

stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
taxonomyResult = stats.getTaxonomyResult();

assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
}


// @Test
// public void ngramsTest() {
// // minimal compliance test
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
//
// Map<String, AtomicLong> results = recalculate(minCorpus, stats);
//
// // 1-gram of minCorpus should equal minCorpus' size
// assertEquals(minCorpus.get(0).getWords().size(), results.size());
//
// // each resulting word should have a frequency of 1
// List<Word> words = minCorpus.get(0).getWords();
// for (int i = 0; i < results.size(); i++) {
// Word w = words.get(i);
// AtomicLong frequency = results.get(w.getMsd());
// assertEquals(1, frequency.intValue());
// }
//
// // repeat for 2grams
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
// results = recalculate(minCorpus, stats);
//
// // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
// assertEquals(2, results.size());
//
// // add a filter
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//
// List<String> morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Sozem");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
//
// results = recalculate(minCorpus, stats);
//
// // since min corpus doesn't contain Sozem, results should be empty
// assertEquals(0, results.size());
//
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Somei");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// // since we have 1 Somei, 1 result
// assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue());
//
// // actual filter with wildcards
// // 1gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue());
//
// // 2gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// assertEquals(1, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
//
// // 2gram midCorpus
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(midCorpus, stats);
//
// assertEquals(2, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
// assertEquals(1, results.get("Ggnste-n Sozem").intValue());
// }

private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
// calculateForAll(corpus, stats);
return stats.getResult();
}

@Test
public void skipgramsTest() {
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax = new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);

Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

// tests:
// - bigrams
filter.setNgramValue(2);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult();

Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
assertEquals(bigrams, bigramsActual);

// test:
// - two skip bigrams
filter.setNgramValue(2);
filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult();

Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));

assertEquals(twoSkipBigrams, twoSkipBigramsActual);

// tests:
// - trigrams
filter.setNgramValue(3);
filter.setSkipValue(null);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult();
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));

assertEquals(trigrams, trigramsActual);

// tests:
// - two skip trigrams
filter.setNgramValue(3);
filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
taxonomyResult = stats.getTaxonomyResult();
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));

assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
}
}
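The expected sets in skipgramsTest follow from the k-skip-n-gram definition: a member may stand at most k tokens past its left neighbour, so a 5-token sentence yields 4 plain bigrams, 9 two-skip bigrams, 3 trigrams, and 10 two-skip trigrams. A minimal sketch (plain Java, independent of the classes under test) showing why the two-skip bigram set has exactly nine members:

import java.util.ArrayList;
import java.util.List;

public class SkipgramCountDemo {
    public static void main(String[] args) {
        String[] s = {"ker", "ima", "junak", "v", "posesti"};
        int k = 2; // allow up to k skipped tokens between the two members
        List<String> skipBigrams = new ArrayList<>();
        for (int i = 0; i < s.length - 1; i++)
            for (int j = i + 1; j <= Math.min(i + 1 + k, s.length - 1); j++)
                skipBigrams.add(s[i] + " " + s[j]);
        System.out.println(skipBigrams.size()); // 9, as asserted in the test
    }
}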
//import static org.junit.Assert.*;
|
||||
//
|
||||
//import java.util.*;
|
||||
//import java.util.concurrent.atomic.AtomicLong;
|
||||
//import java.util.regex.Pattern;
|
||||
//import java.util.stream.Collectors;
|
||||
//
|
||||
//import javafx.collections.FXCollections;
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import alg.ngram.Ngrams;
|
||||
//import data.*;
|
||||
//
|
||||
//@SuppressWarnings({"Duplicates", "unused"})
|
||||
//public class NgramTests {
|
||||
//
|
||||
// @Test
|
||||
// public void letterNgramsTest() {
|
||||
// Map<String, AtomicLong> result = null;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setStringLength(4);
|
||||
// filter.setNgramValue(0); // letters
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// ArrayList<String> tax= new ArrayList<>();
|
||||
// tax.add("SSJ.T.P.C");
|
||||
// filter.setTaxonomy(tax);
|
||||
//
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("SSJ.T.P.C");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - no regex
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.minCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// // tests:
|
||||
// // - algorithm skips words that are shorter than set length value
|
||||
// assertEquals(2, result.size());
|
||||
// assertTrue(result.containsKey("juna"));
|
||||
// assertEquals(1, result.get("juna").longValue());
|
||||
// assertTrue(result.containsKey("unak"));
|
||||
// assertEquals(1, result.get("unak").longValue());
|
||||
//
|
||||
// // tests:
|
||||
// // - map update (count) works ok
|
||||
// filter.setStringLength(3);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertEquals(2, result.get("ima").longValue());
|
||||
//
|
||||
// // tests:
|
||||
// // - pre-check for the following regex test - this one should include word "ima", next one shouldn't
|
||||
// filter.setStringLength(3);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertTrue(result.containsKey("ima"));
|
||||
//
|
||||
// // tests:
|
||||
// // - regex: S.* // vsi samostalniki
|
||||
// ArrayList<Pattern> msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertFalse(result.containsKey("ima"));
|
||||
//
|
||||
// // tests:
|
||||
// // - more precise regex
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
|
||||
// filter.setMsd(msdRegex);
|
||||
// filter.setStringLength(5);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertFalse(result.containsKey("junak"));
|
||||
// assertEquals(3, result.size());
|
||||
//
|
||||
// // tests:
|
||||
// // - trickier regex
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
|
||||
// filter.setMsd(msdRegex);
|
||||
// filter.setStringLength(3);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertEquals(1, result.size());
|
||||
// assertTrue(result.containsKey("ker"));
|
||||
// assertEquals(1, result.get("ker").longValue());
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void wordsNgramsTest() {
|
||||
// Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setNgramValue(3);
|
||||
// ArrayList<String> tax= new ArrayList<>();
|
||||
// tax.add("SSJ.T.P.C");
|
||||
// filter.setTaxonomy(tax);
|
||||
// ArrayList<String> mKeys = new ArrayList<>();
|
||||
// //mKeys.add("lema");
|
||||
// filter.setMultipleKeys(mKeys);
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("SSJ.T.P.C");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - word
|
||||
// // midCorpus contains 5 words which should make for 3 3-grams
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - lemmas
|
||||
// filter.setCalculateFor(CalculateFor.LEMMA);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - msd
|
||||
// filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));
|
||||
//
|
||||
// // tests:
|
||||
// // - ngrams - word - regex filter
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// ArrayList<Pattern> msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.*"));
|
||||
// msdRegex.add(Pattern.compile("G.*"));
|
||||
// msdRegex.add(Pattern.compile(".*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(1, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
||||
//
|
||||
// // tests:
|
||||
// // - ngrams - word - regex filter
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// filter.setNgramValue(2);
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("G.*"));
|
||||
// msdRegex.add(Pattern.compile("Some.*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(1, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
|
||||
// }
|
||||
//
|
||||
//
|
||||
// // @Test
|
||||
// // public void ngramsTest() {
|
||||
// // // minimal compliance test
|
||||
// // Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
// //
|
||||
// // Map<String, AtomicLong> results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // 1-gram minCorpusa should equal minCorpus' size
|
||||
// // assertEquals(minCorpus.get(0).getWords().size(), results.size());
|
||||
// //
|
||||
// // // each resulting word should have a frequency of 1
|
||||
// // List<Word> words = minCorpus.get(0).getWords();
|
||||
// // for (int i = 0; i < results.size(); i++) {
|
||||
// // Word w = words.get(i);
|
||||
// // AtomicLong frequency = results.get(w.getMsd());
|
||||
// // assertEquals(1, frequency.intValue());
|
||||
// // }
|
||||
// //
|
||||
// // // repeat for 2grams
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
|
||||
// // assertEquals(2, results.size());
|
||||
// //
|
||||
// // // add a filter
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// //
|
||||
// // List<String> morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Sozem");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// //
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // since min corpus doesn't contain Sozem, results should be empty
|
||||
// // assertEquals(0, results.size());
|
||||
// //
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Somei");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // since we have 1 Somei, 1 result
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Somei").intValue());
|
||||
// //
|
||||
// // // actual filter with wildcards
|
||||
// // // 1gram
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Somei").intValue());
|
||||
// //
|
||||
// // // 2gram
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Ggns*e-n");
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
// //
|
||||
// // // 2gram midCorpus
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Ggns*e-n");
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(midCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(2, results.size());
|
||||
// // assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
// // assertEquals(1, results.get("Ggnste-n Sozem").intValue());
|
||||
// // }
|
||||
//
|
||||
// private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
// // calculateForAll(corpus, stats);
// return stats.getResult();
// }
//
// @Test
// public void skipgramsTest() {
// Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
//
// Filter filter = new Filter();
// filter.setAl(AnalysisLevel.STRING_LEVEL);
// filter.setCalculateFor(CalculateFor.WORD);
// ArrayList<String> tax= new ArrayList<>();
// tax.add("SSJ.T.P.C");
// filter.setTaxonomy(tax);
//
// Corpus testCorpus = new Corpus();
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
// ArrayList<String> taxForCombo = new ArrayList<>();
// taxForCombo.add("tisk-periodično-časopis");
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// // tests:
// // - bigrams
// filter.setNgramValue(2);
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
// taxonomyResult = stats.getTaxonomyResult();
//
// Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
// Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
// Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
// assertEquals(bigrams, bigramsActual);
//
// // test:
// // - two skip bigrams
// filter.setNgramValue(2);
// filter.setSkipValue(2);
// stats = new StatisticsNew(testCorpus, filter, false);
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
// taxonomyResult = stats.getTaxonomyResult();
//
// Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
// Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
// Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
// assertEquals(twoSkipBigrams, twoSkipBigramsActual);
//
// // tests:
// // - trigrams
// filter.setNgramValue(3);
// filter.setSkipValue(null);
// stats = new StatisticsNew(testCorpus, filter, false);
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
// taxonomyResult = stats.getTaxonomyResult();
// Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
// Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
// Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
// assertEquals(trigrams, trigramsActual);
//
// // tests:
// // - two skip trigrams
// filter.setNgramValue(3);
// filter.setSkipValue(2);
// stats = new StatisticsNew(testCorpus, filter, false);
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
// taxonomyResult = stats.getTaxonomyResult();
// HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
// Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
// Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
//
// assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
// }
//}

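For readers of the diff: the sets asserted in skipgramsTest follow the usual k-skip-n-gram definition, where at most k tokens in total may be skipped across the gaps of one n-gram. The following is a minimal, self-contained editorial sketch, not the project's Ngrams implementation, that reproduces the asserted sets for the test sentence "ker ima junak v posesti".

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SkipGramSketch {

    // all n-grams of tokens, allowing up to k skipped tokens in total
    static List<List<String>> skipGrams(List<String> tokens, int n, int k) {
        List<List<String>> out = new ArrayList<>();
        for (int start = 0; start < tokens.size(); start++) {
            collect(tokens, start, n, k, new ArrayList<>(), out);
        }
        return out;
    }

    private static void collect(List<String> tokens, int pos, int n, int k,
                                List<String> current, List<List<String>> out) {
        current.add(tokens.get(pos));
        if (current.size() == n) {
            out.add(new ArrayList<>(current));
        } else {
            // the next token is either adjacent (skip = 0) or skips up to k positions,
            // spending the skipped positions from the remaining budget
            for (int skip = 0; skip <= k && pos + 1 + skip < tokens.size(); skip++) {
                collect(tokens, pos + 1 + skip, n, k - skip, current, out);
            }
        }
        current.remove(current.size() - 1);
    }

    public static void main(String[] args) {
        List<String> s = Arrays.asList("ker", "ima", "junak", "v", "posesti");
        System.out.println(skipGrams(s, 2, 2)); // the 9 two-skip bigrams asserted above
        System.out.println(skipGrams(s, 3, 2)); // the 10 two-skip trigrams asserted above
    }
}

With k = 0 the same routine yields the plain bigram and trigram sets the test checks first.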
@@ -1,55 +1,55 @@
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import javafx.collections.FXCollections;
import org.junit.Test;

import alg.inflectedJOS.WordFormation;
import alg.ngram.Ngrams;
import data.*;

public class WordFormationTest {

@Test
public void calculationTest() throws UnsupportedEncodingException {
Map<String, AtomicLong> result = null;

Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setNgramValue(1);

Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

// tests:
// - unigrams over morphosyntactic properties (Common.josTest)
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.josTest, stats);
result = stats.getResult();
WordFormation.calculateStatistics(stats);
Object[][] resultArr = stats.getResultCustom();
String debug = "";

}

@Test
public void testAnything() {
String a = "Somei";
String b = "SomeiD";

String c = a.substring(0, 5);
String d = b.substring(0, 5);

String debug = "";

}

}
//import java.io.UnsupportedEncodingException;
//import java.util.ArrayList;
//import java.util.Map;
//import java.util.concurrent.atomic.AtomicLong;
//
//import javafx.collections.FXCollections;
//import org.junit.Test;
//
//import alg.inflectedJOS.WordFormation;
//import alg.ngram.Ngrams;
//import data.*;
//
//public class WordFormationTest {
//
// @Test
// public void calculationTest() throws UnsupportedEncodingException {
// Map<String, AtomicLong> result = null;
//
// Filter filter = new Filter();
// filter.setAl(AnalysisLevel.STRING_LEVEL);
// filter.setNgramValue(1);
//
// Corpus testCorpus = new Corpus();
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
// ArrayList<String> taxForCombo = new ArrayList<>();
// taxForCombo.add("tisk-periodično-časopis");
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
//
// // tests:
// // - unigrams over morphosyntactic properties (Common.josTest)
// filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
// Ngrams.calculateForAll(Common.josTest, stats);
// result = stats.getResult();
// WordFormation.calculateStatistics(stats);
// Object[][] resultArr = stats.getResultCustom();
// String debug = "";
//
// }
//
// @Test
// public void testAnything() {
// String a = "Somei";
// String b = "SomeiD";
//
// String c = a.substring(0, 5);
// String d = b.substring(0, 5);
//
// String debug = "";
//
// }
//
//}

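calculationTest above is only a smoke test: it counts unigrams over Common.josTest and then inspects stats.getResult(). As a hedged illustration of the result shape alone, a frequency map from key to AtomicLong, which is what the filter tests earlier read back with intValue(); this is a generic sketch, independent of the project's StatisticsNew internals:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

public class FrequencySketch {

    // count occurrences of each key; AtomicLong allows lock-free increments
    // when several worker threads contribute counts concurrently
    static Map<String, AtomicLong> count(List<String> keys) {
        Map<String, AtomicLong> result = new ConcurrentHashMap<>();
        for (String key : keys) {
            result.computeIfAbsent(key, k -> new AtomicLong(0)).incrementAndGet();
        }
        return result;
    }

    public static void main(String[] args) {
        Map<String, AtomicLong> r = count(Arrays.asList("Somei", "Sozem", "Somei"));
        System.out.println(r.get("Somei").intValue()); // 2
    }
}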
@@ -1,39 +1,39 @@
import static org.junit.Assert.*;

import org.junit.Test;

import data.Word;

public class WordTest {
@Test
public void paddingTest() {
Word w1 = new Word("w1", "l1", "Somei");
Word w2 = new Word("w2", "l2", "Sometd");

// w1's msd should get padded
String msd1 = w1.getMsd();
String msd2 = w2.getMsd();
assertEquals(msd1.length(), msd2.length());
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));

w1 = new Word("w1", "l1", "Gp-g");
w2 = new Word("w2", "l2", "Gp-g---d");

// w1's msd should get padded
msd1 = w1.getMsd();
msd2 = w2.getMsd();
assertEquals(msd1.length(), msd2.length());
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
assertEquals(Word.PAD_CHARACTER, msd2.charAt(2));

}

@Test
public void cvvTest() {
String siAlphabet = "abcčdefghijklmnoprsštuvzž";
String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC";

Word w1 = new Word(siAlphabet, "l1", null);
assertEquals(siAlphabetCvv, w1.getCVVWord());
}
}
//import static org.junit.Assert.*;
//
//import org.junit.Test;
//
//import data.Word;
//
//public class WordTest {
// @Test
// public void paddingTest() {
// Word w1 = new Word("w1", "l1", "Somei");
// Word w2 = new Word("w2", "l2", "Sometd");
//
// // w1's msd should get padded
// String msd1 = w1.getMsd();
// String msd2 = w2.getMsd();
// assertEquals(msd1.length(), msd2.length());
// assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
//
// w1 = new Word("w1", "l1", "Gp-g");
// w2 = new Word("w2", "l2", "Gp-g---d");
//
// // w1's msd should get padded
// msd1 = w1.getMsd();
// msd2 = w2.getMsd();
// assertEquals(msd1.length(), msd2.length());
// assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
// assertEquals(Word.PAD_CHARACTER, msd2.charAt(2));
//
// }
//
// @Test
// public void cvvTest() {
// String siAlphabet = "abcčdefghijklmnoprsštuvzž";
// String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC";
//
// Word w1 = new Word(siAlphabet, "l1", null);
// assertEquals(siAlphabetCvv, w1.getCVVWord());
// }
//}
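Two behaviours of data.Word are pinned down above: paddingTest expects the shorter of two MSD tags to be right-padded to a common length with Word.PAD_CHARACTER, and cvvTest expects getCVVWord() to map each letter to C or V. A standalone editorial sketch of both rules, assuming the pad character is '-' (consistent with the charAt(2) assertion on "Gp-g---d") and a vowel set of a, e, i, o, u:

public class WordSketch {

    // assumptions for this sketch, inferred from the tests above
    static final char PAD_CHARACTER = '-';
    static final String VOWELS = "aeiou";

    // right-pad an MSD tag so it reaches the target length
    static String pad(String msd, int targetLength) {
        StringBuilder sb = new StringBuilder(msd);
        while (sb.length() < targetLength) {
            sb.append(PAD_CHARACTER);
        }
        return sb.toString();
    }

    // map each letter to V (vowel) or C (consonant)
    static String cvv(String word) {
        StringBuilder sb = new StringBuilder(word.length());
        for (char c : word.toCharArray()) {
            sb.append(VOWELS.indexOf(c) >= 0 ? 'V' : 'C');
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(pad("Somei", "Sometd".length()));  // Somei-
        System.out.println(cvv("abcčdefghijklmnoprsštuvzž")); // VCCCCVCCCVCCCCCVCCCCCVCCC
    }
}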