parent
1c00f1a283
commit
426a9ccc46
@ -1,67 +1,67 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = -1260951004477299634L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private Statistics stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
|
||||
if (stats.isTaxonomySet()) {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
|
||||
} else {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
//package alg.inflectedJOS;
|
||||
//
|
||||
//import java.util.List;
|
||||
//import java.util.concurrent.RecursiveAction;
|
||||
//
|
||||
//import data.Sentence;
|
||||
//import data.Statistics;
|
||||
//
|
||||
//public class ForkJoin extends RecursiveAction {
|
||||
// private static final long serialVersionUID = -1260951004477299634L;
|
||||
//
|
||||
// private static final int ACCEPTABLE_SIZE = 1000;
|
||||
// private List<Sentence> corpus;
|
||||
// private Statistics stats;
|
||||
// private int start;
|
||||
// private int end;
|
||||
//
|
||||
//
|
||||
// /**
|
||||
// * Constructor for subproblems.
|
||||
// */
|
||||
// private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
|
||||
// this.corpus = corpus;
|
||||
// this.start = start;
|
||||
// this.end = end;
|
||||
// this.stats = stats;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Default constructor for the initial problem
|
||||
// */
|
||||
// public ForkJoin(List<Sentence> corpus, Statistics stats) {
|
||||
// this.corpus = corpus;
|
||||
// this.start = 0;
|
||||
// this.end = corpus.size();
|
||||
// this.stats = stats;
|
||||
// }
|
||||
//
|
||||
// private void computeDirectly() {
|
||||
// List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
//
|
||||
// if (stats.isTaxonomySet()) {
|
||||
// InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
|
||||
// } else {
|
||||
// InflectedJOSCount.calculateForAll(subCorpus, stats, null);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// protected void compute() {
|
||||
// int subCorpusSize = end - start;
|
||||
//
|
||||
// if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
// computeDirectly();
|
||||
// } else {
|
||||
// int mid = start + subCorpusSize / 2;
|
||||
// ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
// ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
//
|
||||
// // fork (push to queue)-> compute -> join
|
||||
// left.fork();
|
||||
// right.fork();
|
||||
// left.join();
|
||||
// right.join();
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
@ -1,170 +1,170 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import alg.Common;
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
|
||||
public class InflectedJOSCount {
|
||||
|
||||
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
|
||||
|
||||
// static {
|
||||
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
|
||||
// indices = new HashMap<>();
|
||||
// for (int i = 5; i <= 8; i++) {
|
||||
// indices.put(i, calculateCombinations(i));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private static List<Integer> calculateCombinations(int i) {
|
||||
// int arr[] = {1, 2, 3, 4, 5};
|
||||
// int r = 3;
|
||||
// int n = arr.length;
|
||||
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
|
||||
//
|
||||
// return printCombination(arr, n, r);
|
||||
// }
|
||||
//
|
||||
// /* arr[] ---> Input Array
|
||||
// data[] ---> Temporary array to store current combination
|
||||
// start & end ---> Staring and Ending indexes in arr[]
|
||||
// index ---> Current index in data[]
|
||||
// r ---> Size of a combination to be printed */
|
||||
// static void combinationUtil(int arr[], int data[], int start,
|
||||
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
|
||||
// // Current combination is ready to be printed, print it
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
//
|
||||
// if (index == r) {
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
// for (int j = 0; j < r; j++)
|
||||
// System.out.print(data[j] + " ");
|
||||
// System.out.println("");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// // replace index with all possible elements. The condition
|
||||
// // "end-i+1 >= r-index" makes sure that including one element
|
||||
// // at index will make a combination with remaining elements
|
||||
// // at remaining positions
|
||||
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
|
||||
// data[index] = arr[i];
|
||||
// combinationUtil(arr, data, i + 1, end, index + 1, r);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // The main function that prints all combinations of size r
|
||||
// // in arr[] of size n. This function mainly uses combinationUtil()
|
||||
// static void printCombination(int arr[], int n, int r) {
|
||||
// // A temporary array to store all combination one by one
|
||||
// int data[] = new int[r];
|
||||
//
|
||||
// // Print all combination using temprary array 'data[]'
|
||||
// combinationUtil(arr, data, 0, n - 1, 0, r);
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
// for (Sentence s : corpus) {
|
||||
// // disregard if wrong taxonomy
|
||||
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// calculateCommon(s, stats.result);
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// // skip if current word is not inflected
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
// for (Sentence s : corpus) {
|
||||
// for (Word word : s.getWords()) {
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
// disregard if wrong taxonomy
|
||||
// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// continue;
|
||||
//package alg.inflectedJOS;
|
||||
//
|
||||
//import java.util.ArrayList;
|
||||
//import java.util.HashMap;
|
||||
//import java.util.List;
|
||||
//
|
||||
//import org.apache.commons.lang3.StringUtils;
|
||||
//
|
||||
//import alg.Common;
|
||||
//import data.Sentence;
|
||||
//import data.Statistics;
|
||||
//import data.StatisticsNew;
|
||||
//import data.Word;
|
||||
//
|
||||
//public class InflectedJOSCount {
|
||||
//
|
||||
// public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
|
||||
//
|
||||
// // static {
|
||||
// // // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
|
||||
// // indices = new HashMap<>();
|
||||
// // for (int i = 5; i <= 8; i++) {
|
||||
// // indices.put(i, calculateCombinations(i));
|
||||
// // }
|
||||
// // }
|
||||
// //
|
||||
// // private static List<Integer> calculateCombinations(int i) {
|
||||
// // int arr[] = {1, 2, 3, 4, 5};
|
||||
// // int r = 3;
|
||||
// // int n = arr.length;
|
||||
// // ArrayList<ArrayList<Integer>> result = new ArrayList<>();
|
||||
// //
|
||||
// // return printCombination(arr, n, r);
|
||||
// // }
|
||||
// //
|
||||
// // /* arr[] ---> Input Array
|
||||
// // data[] ---> Temporary array to store current combination
|
||||
// // start & end ---> Staring and Ending indexes in arr[]
|
||||
// // index ---> Current index in data[]
|
||||
// // r ---> Size of a combination to be printed */
|
||||
// // static void combinationUtil(int arr[], int data[], int start,
|
||||
// // int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
|
||||
// // // Current combination is ready to be printed, print it
|
||||
// // ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
// //
|
||||
// // if (index == r) {
|
||||
// // ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
// // for (int j = 0; j < r; j++)
|
||||
// // System.out.print(data[j] + " ");
|
||||
// // System.out.println("");
|
||||
// // return;
|
||||
// // }
|
||||
// //
|
||||
// // // replace index with all possible elements. The condition
|
||||
// // // "end-i+1 >= r-index" makes sure that including one element
|
||||
// // // at index will make a combination with remaining elements
|
||||
// // // at remaining positions
|
||||
// // for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
|
||||
// // data[index] = arr[i];
|
||||
// // combinationUtil(arr, data, i + 1, end, index + 1, r);
|
||||
// // }
|
||||
// // }
|
||||
// //
|
||||
// // // The main function that prints all combinations of size r
|
||||
// // // in arr[] of size n. This function mainly uses combinationUtil()
|
||||
// // static void printCombination(int arr[], int n, int r) {
|
||||
// // // A temporary array to store all combination one by one
|
||||
// // int data[] = new int[r];
|
||||
// //
|
||||
// // // Print all combination using temprary array 'data[]'
|
||||
// // combinationUtil(arr, data, 0, n - 1, 0, r);
|
||||
// // }
|
||||
//
|
||||
// // public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
// // for (Sentence s : corpus) {
|
||||
// // // disregard if wrong taxonomy
|
||||
// // if (!(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// // continue;
|
||||
// // }
|
||||
// //
|
||||
// // calculateCommon(s, stats.result);
|
||||
// //
|
||||
// // for (Word word : s.getWords()) {
|
||||
// // // skip if current word is not inflected
|
||||
// // if (!(word.getMsd().length() > 0)) {
|
||||
// // continue;
|
||||
// // }
|
||||
// //
|
||||
// // String msd = word.getMsd();
|
||||
// //
|
||||
// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
// //
|
||||
// // for (int i = 1; i < msd.length(); i++) {
|
||||
// // entry.setCharAt(i, msd.charAt(i));
|
||||
// // Common.updateMap(stats.result, entry.toString());
|
||||
// // entry.setCharAt(i, '-');
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
//
|
||||
// // public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
// // for (Sentence s : corpus) {
|
||||
// // for (Word word : s.getWords()) {
|
||||
// // if (!(word.getMsd().length() > 0)) {
|
||||
// // continue;
|
||||
// // }
|
||||
// //
|
||||
// // String msd = word.getMsd();
|
||||
// //
|
||||
// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
// //
|
||||
// // for (int i = 1; i < msd.length(); i++) {
|
||||
// // entry.setCharAt(i, msd.charAt(i));
|
||||
// // Common.updateMap(stats.result, entry.toString());
|
||||
// // entry.setCharAt(i, '-');
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
//
|
||||
// static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
// for (Sentence s : corpus) {
|
||||
// // disregard if wrong taxonomy
|
||||
//// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
//// continue;
|
||||
//// }
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// // skip if current word is not inflected
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
if (!(word.getMsd().length() > 0)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
Common.updateMap(stats.result, entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
// // TODO: if has defined msd and is of correct type (create a set)
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
stats.updateResults(entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
|
||||
// for (Sentence s : corpus) {
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// // skip if current word is not inflected
|
||||
// // // TODO: if has defined msd and is of correct type (create a set)
|
||||
// // if (!(word.getMsd().length() > 0)) {
|
||||
// // continue;
|
||||
// // }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// stats.updateResults(entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
@ -0,0 +1,17 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Word1 implements Serializable, Word {
|
||||
private String w1;
|
||||
|
||||
public Word1(String w1) {
|
||||
this.w1 = w1;
|
||||
}
|
||||
|
||||
public String getW1() {
|
||||
return w1;
|
||||
}
|
||||
|
||||
public void setW1(String w){w1 = w;}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Word2 implements Serializable, Word {
|
||||
private String w1, w2;
|
||||
|
||||
public Word2(String w1, String w2) {
|
||||
this.w1 = w1;
|
||||
this.w2 = w2;
|
||||
}
|
||||
|
||||
public String getW1() {
|
||||
return w1;
|
||||
}
|
||||
public String getW2() {
|
||||
return w2;
|
||||
}
|
||||
|
||||
public void setW1(String w){w1 = w;}
|
||||
public void setW2(String w){w2 = w;}
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Word3 implements Serializable, Word {
|
||||
private String w1, w2, w3;
|
||||
|
||||
public Word3(String w1, String w2, String w3) {
|
||||
this.w1 = w1;
|
||||
this.w2 = w2;
|
||||
this.w3 = w3;
|
||||
}
|
||||
|
||||
public String getW1() {
|
||||
return w1;
|
||||
}
|
||||
public String getW2() {
|
||||
return w2;
|
||||
}
|
||||
public String getW3() {
|
||||
return w3;
|
||||
}
|
||||
|
||||
public void setW1(String w){w1 = w;}
|
||||
public void setW2(String w){w2 = w;}
|
||||
public void setW3(String w){w3 = w;}
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Word4 implements Serializable, Word {
|
||||
private String w1, w2, w3, w4;
|
||||
|
||||
public Word4(String w1, String w2, String w3, String w4) {
|
||||
this.w1 = w1;
|
||||
this.w2 = w2;
|
||||
this.w3 = w3;
|
||||
this.w4 = w4;
|
||||
}
|
||||
|
||||
public String getW1() {
|
||||
return w1;
|
||||
}
|
||||
public String getW2() {
|
||||
return w2;
|
||||
}
|
||||
public String getW3() {
|
||||
return w3;
|
||||
}
|
||||
public String getW4() {
|
||||
return w4;
|
||||
}
|
||||
|
||||
public void setW1(String w){w1 = w;}
|
||||
public void setW2(String w){w2 = w;}
|
||||
public void setW3(String w){w3 = w;}
|
||||
public void setW4(String w){w4 = w;}
|
||||
}
|
@ -1,87 +1,87 @@
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import data.Sentence;
|
||||
import data.Word;
|
||||
|
||||
/**
 * Shared, hand-built test fixtures: small corpora reused across the test
 * suite. All sentences carry the taxonomy value "#Ft.Z.N.N".
 *
 * Word constructor arguments are (surface form, lemma, MSD tag) — grounded by
 * the word/lemma/MSD n-gram assertions in NgramTests over these fixtures.
 */
public class Common {

	public static List<Sentence> corpus;        // one full 20-word sentence
	public static List<Sentence> minCorpus;     // first three words of corpus
	public static List<Sentence> midCorpus;     // five words, "ima" appears twice (for count tests)
	public static List<Sentence> midCorpusSkip; // five distinct words (for skipgram tests)
	public static List<Sentence> josTest;       // three words (for JOS/MSD tests)

	static {
		Sentence testSentence;

		// full sentence
		ArrayList<String> taxonomy = new ArrayList<>();
		taxonomy.add("#Ft.Z.N.N");
		List<Word> words = new ArrayList<>();
		words.add(new Word("ker", "ker", "Vd"));
		words.add(new Word("ima", "imeti", "Ggnste-n"));
		words.add(new Word("junak", "junak", "Somei"));
		words.add(new Word("v", "v", "Dm"));
		words.add(new Word("posesti", "posest", "Sozem"));
		words.add(new Word("nekaj", "nekaj", "Rsn"));
		words.add(new Word("o", "o", "Dm"));
		words.add(new Word("čemer", "kar", "Zz-sem"));
		words.add(new Word("se", "se", "Zp------k"));
		words.add(new Word("mu", "on", "Zotmed--k"));
		words.add(new Word("ne", "ne", "L"));
		words.add(new Word("sanja", "sanjati", "Ggnste"));
		words.add(new Word("a", "a", "Vp"));
		words.add(new Word("se", "se", "Zp------k"));
		words.add(new Word("onemu", "oni", "Zk-sed"));
		words.add(new Word("zdi", "zdeti", "Ggnste"));
		words.add(new Word("ključno", "ključen", "Ppnsei"));
		words.add(new Word("pri", "pri", "Dm"));
		words.add(new Word("operaciji", "operacija", "Sozem"));
		words.add(new Word("666", "666", "Kag"));

		testSentence = new Sentence(words, taxonomy);
		corpus = new ArrayList<>();
		corpus.add(testSentence);

		// three word sentence — reuses the first three words of the full sentence
		testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy);
		minCorpus = new ArrayList<>();
		minCorpus.add(testSentence);

		// five word sentence — note the duplicated "ima" (word count = 2)
		words = new ArrayList<>();
		words.add(new Word("ker", "ker", "Vd"));
		words.add(new Word("ima", "imeti", "Ggnste-n"));
		words.add(new Word("junak", "junak", "Somei"));
		words.add(new Word("ima", "imeti", "Ggnste-n"));
		words.add(new Word("posesti", "posest", "Sozem"));
		testSentence = new Sentence(words, taxonomy);

		midCorpus = new ArrayList<>();
		midCorpus.add(testSentence);

		// five word sentence - for skipgrams (all five words distinct)
		words = new ArrayList<>();
		words.add(new Word("ker", "ker", "Vd"));
		words.add(new Word("ima", "imeti", "Ggnste-n"));
		words.add(new Word("junak", "junak", "Somei"));
		words.add(new Word("v", "v", "Dm"));
		words.add(new Word("posesti", "posest", "Sozem"));
		testSentence = new Sentence(words, taxonomy);

		midCorpusSkip = new ArrayList<>();
		midCorpusSkip.add(testSentence);

		// JOS test
		words = new ArrayList<>();
		words.add(new Word("junak", "junak", "Somei"));
		words.add(new Word("ima", "imeti", "Ggnste-n"));
		words.add(new Word("posesti", "posest", "Sozem"));
		testSentence = new Sentence(words, taxonomy);

		josTest = new ArrayList<>();
		josTest.add(testSentence);
	}

}
|
||||
//import java.util.ArrayList;
|
||||
//import java.util.List;
|
||||
//
|
||||
//import data.Sentence;
|
||||
//import data.Word;
|
||||
//
|
||||
//public class Common {
|
||||
//
|
||||
// public static List<Sentence> corpus;
|
||||
// public static List<Sentence> minCorpus;
|
||||
// public static List<Sentence> midCorpus;
|
||||
// public static List<Sentence> midCorpusSkip;
|
||||
// public static List<Sentence> josTest;
|
||||
//
|
||||
// static {
|
||||
// Sentence testSentence;
|
||||
//
|
||||
// // full sentence
|
||||
// ArrayList<String> taxonomy = new ArrayList<>();
|
||||
// taxonomy.add("#Ft.Z.N.N");
|
||||
// List<Word> words = new ArrayList<>();
|
||||
// words.add(new Word("ker", "ker", "Vd"));
|
||||
// words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
// words.add(new Word("junak", "junak", "Somei"));
|
||||
// words.add(new Word("v", "v", "Dm"));
|
||||
// words.add(new Word("posesti", "posest", "Sozem"));
|
||||
// words.add(new Word("nekaj", "nekaj", "Rsn"));
|
||||
// words.add(new Word("o", "o", "Dm"));
|
||||
// words.add(new Word("čemer", "kar", "Zz-sem"));
|
||||
// words.add(new Word("se", "se", "Zp------k"));
|
||||
// words.add(new Word("mu", "on", "Zotmed--k"));
|
||||
// words.add(new Word("ne", "ne", "L"));
|
||||
// words.add(new Word("sanja", "sanjati", "Ggnste"));
|
||||
// words.add(new Word("a", "a", "Vp"));
|
||||
// words.add(new Word("se", "se", "Zp------k"));
|
||||
// words.add(new Word("onemu", "oni", "Zk-sed"));
|
||||
// words.add(new Word("zdi", "zdeti", "Ggnste"));
|
||||
// words.add(new Word("ključno", "ključen", "Ppnsei"));
|
||||
// words.add(new Word("pri", "pri", "Dm"));
|
||||
// words.add(new Word("operaciji", "operacija", "Sozem"));
|
||||
// words.add(new Word("666", "666", "Kag"));
|
||||
//
|
||||
// testSentence = new Sentence(words, taxonomy);
|
||||
// corpus = new ArrayList<>();
|
||||
// corpus.add(testSentence);
|
||||
//
|
||||
// // three word sentence
|
||||
// testSentence = new Sentence(corpus.get(0).getSublist(0, 3), taxonomy);
|
||||
// minCorpus = new ArrayList<>();
|
||||
// minCorpus.add(testSentence);
|
||||
//
|
||||
// // five word sentence
|
||||
// words = new ArrayList<>();
|
||||
// words.add(new Word("ker", "ker", "Vd"));
|
||||
// words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
// words.add(new Word("junak", "junak", "Somei"));
|
||||
// words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
// words.add(new Word("posesti", "posest", "Sozem"));
|
||||
// testSentence = new Sentence(words, taxonomy);
|
||||
//
|
||||
// midCorpus = new ArrayList<>();
|
||||
// midCorpus.add(testSentence);
|
||||
//
|
||||
// // five word sentence - for skipgrams
|
||||
// words = new ArrayList<>();
|
||||
// words.add(new Word("ker", "ker", "Vd"));
|
||||
// words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
// words.add(new Word("junak", "junak", "Somei"));
|
||||
// words.add(new Word("v", "v", "Dm"));
|
||||
// words.add(new Word("posesti", "posest", "Sozem"));
|
||||
// testSentence = new Sentence(words, taxonomy);
|
||||
//
|
||||
// midCorpusSkip = new ArrayList<>();
|
||||
// midCorpusSkip.add(testSentence);
|
||||
//
|
||||
// // JOS test
|
||||
// words = new ArrayList<>();
|
||||
// words.add(new Word("junak", "junak", "Somei"));
|
||||
// words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
// words.add(new Word("posesti", "posest", "Sozem"));
|
||||
// testSentence = new Sentence(words, taxonomy);
|
||||
//
|
||||
// josTest = new ArrayList<>();
|
||||
// josTest.add(testSentence);
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
@ -1,362 +1,362 @@
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import org.junit.Test;
|
||||
|
||||
import alg.ngram.Ngrams;
|
||||
import data.*;
|
||||
|
||||
@SuppressWarnings({"Duplicates", "unused"})
|
||||
public class NgramTests {
|
||||
|
||||
/**
 * Character ("letter") n-gram statistics (ngramValue == 0): sliding windows
 * of stringLength characters are counted per word, with optional MSD regex
 * filtering of the words considered.
 */
@Test
public void letterNgramsTest() {
	Map<String, AtomicLong> result = null;

	Filter filter = new Filter();
	filter.setAl(AnalysisLevel.STRING_LEVEL);
	filter.setStringLength(4);
	filter.setNgramValue(0); // letters
	filter.setCalculateFor(CalculateFor.WORD);
	ArrayList<String> tax= new ArrayList<>();
	tax.add("SSJ.T.P.C");
	filter.setTaxonomy(tax);


	Corpus testCorpus = new Corpus();
	testCorpus.setCorpusType(CorpusType.GIGAFIDA);
	testCorpus.setDetectedCorpusFiles(new ArrayList<>());
	ArrayList<String> taxForCombo = new ArrayList<>();
	taxForCombo.add("SSJ.T.P.C");
	testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

	// tests:
	// - no regex
	StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.minCorpus, stats);
	result = stats.getResult();

	// tests:
	// - algorithm skips words that are shorter than set length value
	// minCorpus = "ker ima junak": only "junak" is >= 4 chars → "juna", "unak"
	assertEquals(2, result.size());
	assertTrue(result.containsKey("juna"));
	assertEquals(1, result.get("juna").longValue());
	assertTrue(result.containsKey("unak"));
	assertEquals(1, result.get("unak").longValue());

	// tests:
	// - map update (count) works ok
	// midCorpus contains "ima" twice → its 3-gram count must be 2
	filter.setStringLength(3);
	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	result = stats.getResult();

	assertEquals(2, result.get("ima").longValue());

	// tests:
	// - pre-check for the following regex test - this one should include word "ima", next one shouldn't
	filter.setStringLength(3);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	result = stats.getResult();

	assertTrue(result.containsKey("ima"));

	// tests:
	// - regex: S.* // vsi samostalniki (all nouns)
	ArrayList<Pattern> msdRegex = new ArrayList<>();
	msdRegex.add(Pattern.compile("S.*"));
	filter.setMsd(msdRegex);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	result = stats.getResult();

	// "ima" has MSD "Ggnste-n", which does not match S.* → excluded now
	assertFalse(result.containsKey("ima"));

	// tests:
	// - more precise regex
	msdRegex = new ArrayList<>();
	msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
	filter.setMsd(msdRegex);
	filter.setStringLength(5);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	result = stats.getResult();

	// "posesti" (7 chars) yields 3 distinct 5-grams; "junak" (Somei) is filtered out
	assertFalse(result.containsKey("junak"));
	assertEquals(3, result.size());

	// tests:
	// - trickier regex
	msdRegex = new ArrayList<>();
	msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
	filter.setMsd(msdRegex);
	filter.setStringLength(3);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	result = stats.getResult();

	assertEquals(1, result.size());
	assertTrue(result.containsKey("ker"));
	assertEquals(1, result.get("ker").longValue());
}
|
||||
|
||||
/**
 * Word-level n-gram statistics over midCorpus ("ker ima junak ima posesti"):
 * plain 3-grams keyed by surface form, lemma, and MSD, then regex-filtered
 * 3-grams and 2-grams. Results are grouped per taxonomy; the "Total" bucket
 * aggregates everything.
 */
@Test
public void wordsNgramsTest() {
	Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

	Filter filter = new Filter();
	filter.setAl(AnalysisLevel.STRING_LEVEL);
	filter.setNgramValue(3);
	ArrayList<String> tax= new ArrayList<>();
	tax.add("SSJ.T.P.C");
	filter.setTaxonomy(tax);
	ArrayList<String> mKeys = new ArrayList<>();
	//mKeys.add("lema");
	filter.setMultipleKeys(mKeys);

	Corpus testCorpus = new Corpus();
	testCorpus.setCorpusType(CorpusType.GIGAFIDA);
	testCorpus.setDetectedCorpusFiles(new ArrayList<>());
	ArrayList<String> taxForCombo = new ArrayList<>();
	taxForCombo.add("SSJ.T.P.C");
	testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

	// tests:
	// - normal ngrams - word
	// midCorpus contains 5 words which should make for 3 3-grams
	filter.setCalculateFor(CalculateFor.WORD);
	StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	taxonomyResult = stats.getTaxonomyResult();

	assertEquals(3, taxonomyResult.get("Total").size());
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

	// tests:
	// - normal ngrams - lemmas (second Word constructor argument)
	filter.setCalculateFor(CalculateFor.LEMMA);
	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	taxonomyResult = stats.getTaxonomyResult();

	assertEquals(3, taxonomyResult.get("Total").size());
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));

	// tests:
	// - normal ngrams - msd (third Word constructor argument)
	filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	taxonomyResult = stats.getTaxonomyResult();

	assertEquals(3, taxonomyResult.get("Total").size());
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));

	// tests:
	// - ngrams - word - regex filter
	// one pattern per position of the 3-gram: noun, verb, anything
	// → only "junak ima posesti" (Somei, Ggnste-n, Sozem) matches
	filter.setCalculateFor(CalculateFor.WORD);
	ArrayList<Pattern> msdRegex = new ArrayList<>();
	msdRegex.add(Pattern.compile("S.*"));
	msdRegex.add(Pattern.compile("G.*"));
	msdRegex.add(Pattern.compile(".*"));
	filter.setMsd(msdRegex);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	taxonomyResult = stats.getTaxonomyResult();

	assertEquals(1, taxonomyResult.get("Total").size());
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

	// tests:
	// - ngrams - word - regex filter
	// 2-grams with (verb, Some*-noun) positions → only "ima junak" matches
	filter.setCalculateFor(CalculateFor.WORD);
	filter.setNgramValue(2);
	msdRegex = new ArrayList<>();
	msdRegex.add(Pattern.compile("G.*"));
	msdRegex.add(Pattern.compile("Some.*"));
	filter.setMsd(msdRegex);

	stats = new StatisticsNew(testCorpus, filter, false);
	Ngrams.calculateForAll(Common.midCorpus, stats);
	taxonomyResult = stats.getTaxonomyResult();

	assertEquals(1, taxonomyResult.get("Total").size());
	assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
}
|
||||
|
||||
|
||||
// @Test
|
||||
// public void ngramsTest() {
|
||||
// // minimal compliance test
|
||||
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
//
|
||||
// Map<String, AtomicLong> results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// // 1-gram minCorpusa should equal minCorpus' size
|
||||
// assertEquals(minCorpus.get(0).getWords().size(), results.size());
|
||||
//
|
||||
// // each resulting word should have a frequency of 1
|
||||
// List<Word> words = minCorpus.get(0).getWords();
|
||||
// for (int i = 0; i < results.size(); i++) {
|
||||
// Word w = words.get(i);
|
||||
// AtomicLong frequency = results.get(w.getMsd());
|
||||
// assertEquals(1, frequency.intValue());
|
||||
// }
|
||||
//
|
||||
// // repeat for 2grams
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
// results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
|
||||
// assertEquals(2, results.size());
|
||||
//
|
||||
// // add a filter
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
//
|
||||
// List<String> morphosyntacticFilter = new ArrayList<>();
|
||||
// morphosyntacticFilter.add("Sozem");
|
||||
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
//
|
||||
// results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// // since min corpus doesn't contain Sozem, results should be empty
|
||||
// assertEquals(0, results.size());
|
||||
//
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// morphosyntacticFilter = new ArrayList<>();
|
||||
// morphosyntacticFilter.add("Somei");
|
||||
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// // since we have 1 Somei, 1 result
|
||||
// assertEquals(1, results.size());
|
||||
// assertEquals(1, results.get("Somei").intValue());
|
||||
//
|
||||
// // actual filter with wildcards
|
||||
// // 1gram
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// morphosyntacticFilter = new ArrayList<>();
|
||||
// morphosyntacticFilter.add("So***");
|
||||
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// assertEquals(1, results.size());
|
||||
// assertEquals(1, results.get("Somei").intValue());
|
||||
//
|
||||
// // 2gram
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// morphosyntacticFilter = new ArrayList<>();
|
||||
// morphosyntacticFilter.add("Ggns*e-n");
|
||||
// morphosyntacticFilter.add("So***");
|
||||
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// results = recalculate(minCorpus, stats);
|
||||
//
|
||||
// assertEquals(1, results.size());
|
||||
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
//
|
||||
// // 2gram midCorpus
|
||||
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// morphosyntacticFilter = new ArrayList<>();
|
||||
// morphosyntacticFilter.add("Ggns*e-n");
|
||||
// morphosyntacticFilter.add("So***");
|
||||
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// results = recalculate(midCorpus, stats);
|
||||
//
|
||||
// assertEquals(2, results.size());
|
||||
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
// assertEquals(1, results.get("Ggnste-n Sozem").intValue());
|
||||
// }
|
||||
|
||||
private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
|
||||
// calculateForAll(corpus, stats);
|
||||
return stats.getResult();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void skipgramsTest() {
|
||||
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
|
||||
Filter filter = new Filter();
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setCalculateFor(CalculateFor.WORD);
|
||||
ArrayList<String> tax= new ArrayList<>();
|
||||
tax.add("SSJ.T.P.C");
|
||||
filter.setTaxonomy(tax);
|
||||
|
||||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("tisk-periodično-časopis");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - bigrams
|
||||
filter.setNgramValue(2);
|
||||
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
|
||||
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
assertEquals(bigrams, bigramsActual);
|
||||
|
||||
// test:
|
||||
// - two skip bigrams
|
||||
filter.setNgramValue(2);
|
||||
filter.setSkipValue(2);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
|
||||
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(twoSkipBigrams, twoSkipBigramsActual);
|
||||
|
||||
// tests:
|
||||
// - trigrams
|
||||
filter.setNgramValue(3);
|
||||
filter.setSkipValue(null);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
|
||||
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(trigrams, trigramsActual);
|
||||
|
||||
// tests:
|
||||
// - two skip trigrams
|
||||
filter.setNgramValue(3);
|
||||
filter.setSkipValue(2);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
|
||||
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
|
||||
}
|
||||
}
|
||||
//import static org.junit.Assert.*;
|
||||
//
|
||||
//import java.util.*;
|
||||
//import java.util.concurrent.atomic.AtomicLong;
|
||||
//import java.util.regex.Pattern;
|
||||
//import java.util.stream.Collectors;
|
||||
//
|
||||
//import javafx.collections.FXCollections;
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import alg.ngram.Ngrams;
|
||||
//import data.*;
|
||||
//
|
||||
//@SuppressWarnings({"Duplicates", "unused"})
|
||||
//public class NgramTests {
|
||||
//
|
||||
// @Test
|
||||
// public void letterNgramsTest() {
|
||||
// Map<String, AtomicLong> result = null;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setStringLength(4);
|
||||
// filter.setNgramValue(0); // letters
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// ArrayList<String> tax= new ArrayList<>();
|
||||
// tax.add("SSJ.T.P.C");
|
||||
// filter.setTaxonomy(tax);
|
||||
//
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("SSJ.T.P.C");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - no regex
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.minCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// // tests:
|
||||
// // - algorithm skips words that are shorter than set length value
|
||||
// assertEquals(2, result.size());
|
||||
// assertTrue(result.containsKey("juna"));
|
||||
// assertEquals(1, result.get("juna").longValue());
|
||||
// assertTrue(result.containsKey("unak"));
|
||||
// assertEquals(1, result.get("unak").longValue());
|
||||
//
|
||||
// // tests:
|
||||
// // - map update (count) works ok
|
||||
// filter.setStringLength(3);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertEquals(2, result.get("ima").longValue());
|
||||
//
|
||||
// // tests:
|
||||
// // - pre-check for the following regex test - this one should include word "ima", next one shouldn't
|
||||
// filter.setStringLength(3);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertTrue(result.containsKey("ima"));
|
||||
//
|
||||
// // tests:
|
||||
// // - regex: S.* // vsi samostalniki
|
||||
// ArrayList<Pattern> msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertFalse(result.containsKey("ima"));
|
||||
//
|
||||
// // tests:
|
||||
// // - more precise regex
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
|
||||
// filter.setMsd(msdRegex);
|
||||
// filter.setStringLength(5);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertFalse(result.containsKey("junak"));
|
||||
// assertEquals(3, result.size());
|
||||
//
|
||||
// // tests:
|
||||
// // - trickier regex
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
|
||||
// filter.setMsd(msdRegex);
|
||||
// filter.setStringLength(3);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// result = stats.getResult();
|
||||
//
|
||||
// assertEquals(1, result.size());
|
||||
// assertTrue(result.containsKey("ker"));
|
||||
// assertEquals(1, result.get("ker").longValue());
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void wordsNgramsTest() {
|
||||
// Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setNgramValue(3);
|
||||
// ArrayList<String> tax= new ArrayList<>();
|
||||
// tax.add("SSJ.T.P.C");
|
||||
// filter.setTaxonomy(tax);
|
||||
// ArrayList<String> mKeys = new ArrayList<>();
|
||||
// //mKeys.add("lema");
|
||||
// filter.setMultipleKeys(mKeys);
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("SSJ.T.P.C");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - word
|
||||
// // midCorpus contains 5 words which should make for 3 3-grams
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - lemmas
|
||||
// filter.setCalculateFor(CalculateFor.LEMMA);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - msd
|
||||
// filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(3, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));
|
||||
//
|
||||
// // tests:
|
||||
// // - ngrams - word - regex filter
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// ArrayList<Pattern> msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("S.*"));
|
||||
// msdRegex.add(Pattern.compile("G.*"));
|
||||
// msdRegex.add(Pattern.compile(".*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(1, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
||||
//
|
||||
// // tests:
|
||||
// // - ngrams - word - regex filter
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// filter.setNgramValue(2);
|
||||
// msdRegex = new ArrayList<>();
|
||||
// msdRegex.add(Pattern.compile("G.*"));
|
||||
// msdRegex.add(Pattern.compile("Some.*"));
|
||||
// filter.setMsd(msdRegex);
|
||||
//
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// assertEquals(1, taxonomyResult.get("Total").size());
|
||||
// assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
|
||||
// }
|
||||
//
|
||||
//
|
||||
// // @Test
|
||||
// // public void ngramsTest() {
|
||||
// // // minimal compliance test
|
||||
// // Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
// //
|
||||
// // Map<String, AtomicLong> results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // 1-gram minCorpusa should equal minCorpus' size
|
||||
// // assertEquals(minCorpus.get(0).getWords().size(), results.size());
|
||||
// //
|
||||
// // // each resulting word should have a frequency of 1
|
||||
// // List<Word> words = minCorpus.get(0).getWords();
|
||||
// // for (int i = 0; i < results.size(); i++) {
|
||||
// // Word w = words.get(i);
|
||||
// // AtomicLong frequency = results.get(w.getMsd());
|
||||
// // assertEquals(1, frequency.intValue());
|
||||
// // }
|
||||
// //
|
||||
// // // repeat for 2grams
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
|
||||
// // assertEquals(2, results.size());
|
||||
// //
|
||||
// // // add a filter
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// //
|
||||
// // List<String> morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Sozem");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// //
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // since min corpus doesn't contain Sozem, results should be empty
|
||||
// // assertEquals(0, results.size());
|
||||
// //
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Somei");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // // since we have 1 Somei, 1 result
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Somei").intValue());
|
||||
// //
|
||||
// // // actual filter with wildcards
|
||||
// // // 1gram
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Somei").intValue());
|
||||
// //
|
||||
// // // 2gram
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Ggns*e-n");
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(minCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(1, results.size());
|
||||
// // assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
// //
|
||||
// // // 2gram midCorpus
|
||||
// // stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// // morphosyntacticFilter = new ArrayList<>();
|
||||
// // morphosyntacticFilter.add("Ggns*e-n");
|
||||
// // morphosyntacticFilter.add("So***");
|
||||
// // stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
||||
// // results = recalculate(midCorpus, stats);
|
||||
// //
|
||||
// // assertEquals(2, results.size());
|
||||
// // assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
||||
// // assertEquals(1, results.get("Ggnste-n Sozem").intValue());
|
||||
// // }
|
||||
//
|
||||
// private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
|
||||
// // calculateForAll(corpus, stats);
|
||||
// return stats.getResult();
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void skipgramsTest() {
|
||||
// Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setCalculateFor(CalculateFor.WORD);
|
||||
// ArrayList<String> tax= new ArrayList<>();
|
||||
// tax.add("SSJ.T.P.C");
|
||||
// filter.setTaxonomy(tax);
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("tisk-periodično-časopis");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - bigrams
|
||||
// filter.setNgramValue(2);
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
|
||||
// Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
// Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
// assertEquals(bigrams, bigramsActual);
|
||||
//
|
||||
// // test:
|
||||
// // - two skip bigrams
|
||||
// filter.setNgramValue(2);
|
||||
// filter.setSkipValue(2);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
//
|
||||
// Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
|
||||
// Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
// Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
//
|
||||
// assertEquals(twoSkipBigrams, twoSkipBigramsActual);
|
||||
//
|
||||
// // tests:
|
||||
// // - trigrams
|
||||
// filter.setNgramValue(3);
|
||||
// filter.setSkipValue(null);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
// Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
|
||||
// Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
// Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
//
|
||||
// assertEquals(trigrams, trigramsActual);
|
||||
//
|
||||
// // tests:
|
||||
// // - two skip trigrams
|
||||
// filter.setNgramValue(3);
|
||||
// filter.setSkipValue(2);
|
||||
// stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
// taxonomyResult = stats.getTaxonomyResult();
|
||||
// HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
|
||||
// Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
// Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
||||
//
|
||||
// assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
|
||||
// }
|
||||
//}
|
||||
|
@ -1,55 +1,55 @@
|
||||
import static org.junit.Assert.*;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import javafx.collections.FXCollections;
import org.junit.Test;

import alg.inflectedJOS.WordFormation;
import alg.ngram.Ngrams;
import data.*;
|
||||
|
||||
public class WordFormationTest {
|
||||
|
||||
@Test
|
||||
public void calculationTest() throws UnsupportedEncodingException {
|
||||
Map<String, AtomicLong> result = null;
|
||||
|
||||
Filter filter = new Filter();
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setNgramValue(1);
|
||||
|
||||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("tisk-periodično-časopis");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - normal ngrams - word
|
||||
// midCorpus contains 5 words which should make for 3 3-grams
|
||||
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.josTest, stats);
|
||||
result = stats.getResult();
|
||||
WordFormation.calculateStatistics(stats);
|
||||
Object[][] resultArr = stats.getResultCustom();
|
||||
String debug = "";
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnything() {
|
||||
String a = "Somei";
|
||||
String b = "SomeiD";
|
||||
|
||||
String c = a.substring(0, 5);
|
||||
String d = b.substring(0, 5);
|
||||
|
||||
String debug = "";
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
//import java.io.UnsupportedEncodingException;
|
||||
//import java.util.ArrayList;
|
||||
//import java.util.Map;
|
||||
//import java.util.concurrent.atomic.AtomicLong;
|
||||
//
|
||||
//import javafx.collections.FXCollections;
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import alg.inflectedJOS.WordFormation;
|
||||
//import alg.ngram.Ngrams;
|
||||
//import data.*;
|
||||
//
|
||||
//public class WordFormationTest {
|
||||
//
|
||||
// @Test
|
||||
// public void calculationTest() throws UnsupportedEncodingException {
|
||||
// Map<String, AtomicLong> result = null;
|
||||
//
|
||||
// Filter filter = new Filter();
|
||||
// filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
// filter.setNgramValue(1);
|
||||
//
|
||||
// Corpus testCorpus = new Corpus();
|
||||
// testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
// testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
// ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
// taxForCombo.add("tisk-periodično-časopis");
|
||||
// testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
//
|
||||
// // tests:
|
||||
// // - normal ngrams - word
|
||||
// // midCorpus contains 5 words which should make for 3 3-grams
|
||||
// filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
// StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
// Ngrams.calculateForAll(Common.josTest, stats);
|
||||
// result = stats.getResult();
|
||||
// WordFormation.calculateStatistics(stats);
|
||||
// Object[][] resultArr = stats.getResultCustom();
|
||||
// String debug = "";
|
||||
//
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testAnything() {
|
||||
// String a = "Somei";
|
||||
// String b = "SomeiD";
|
||||
//
|
||||
// String c = a.substring(0, 5);
|
||||
// String d = b.substring(0, 5);
|
||||
//
|
||||
// String debug = "";
|
||||
//
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
@ -1,39 +1,39 @@
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import data.Word;
|
||||
|
||||
public class WordTest {
|
||||
@Test
|
||||
public void paddingTest() {
|
||||
Word w1 = new Word("w1", "l1", "Somei");
|
||||
Word w2 = new Word("w2", "l2", "Sometd");
|
||||
|
||||
// w1's msd should get padded
|
||||
String msd1 = w1.getMsd();
|
||||
String msd2 = w2.getMsd();
|
||||
assertEquals(msd1.length(), msd2.length());
|
||||
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
|
||||
|
||||
w1 = new Word("w1", "l1", "Gp-g");
|
||||
w2 = new Word("w2", "l2", "Gp-g---d");
|
||||
|
||||
// w1's msd should get padded
|
||||
msd1 = w1.getMsd();
|
||||
msd2 = w2.getMsd();
|
||||
assertEquals(msd1.length(), msd2.length());
|
||||
assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
|
||||
assertEquals(Word.PAD_CHARACTER, msd2.charAt(2));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cvvTest() {
|
||||
String siAlphabet = "abcčdefghijklmnoprsštuvzž";
|
||||
String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC";
|
||||
|
||||
Word w1 = new Word(siAlphabet, "l1", null);
|
||||
assertEquals(siAlphabetCvv, w1.getCVVWord());
|
||||
}
|
||||
}
|
||||
//import static org.junit.Assert.*;
|
||||
//
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import data.Word;
|
||||
//
|
||||
//public class WordTest {
|
||||
// @Test
|
||||
// public void paddingTest() {
|
||||
// Word w1 = new Word("w1", "l1", "Somei");
|
||||
// Word w2 = new Word("w2", "l2", "Sometd");
|
||||
//
|
||||
// // w1's msd should get padded
|
||||
// String msd1 = w1.getMsd();
|
||||
// String msd2 = w2.getMsd();
|
||||
// assertEquals(msd1.length(), msd2.length());
|
||||
// assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
|
||||
//
|
||||
// w1 = new Word("w1", "l1", "Gp-g");
|
||||
// w2 = new Word("w2", "l2", "Gp-g---d");
|
||||
//
|
||||
// // w1's msd should get padded
|
||||
// msd1 = w1.getMsd();
|
||||
// msd2 = w2.getMsd();
|
||||
// assertEquals(msd1.length(), msd2.length());
|
||||
// assertEquals(Word.PAD_CHARACTER, msd1.charAt(msd1.length() - 1));
|
||||
// assertEquals(Word.PAD_CHARACTER, msd2.charAt(2));
|
||||
//
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void cvvTest() {
|
||||
// String siAlphabet = "abcčdefghijklmnoprsštuvzž";
|
||||
// String siAlphabetCvv = "VCCCCVCCCVCCCCCVCCCCCVCCC";
|
||||
//
|
||||
// Word w1 = new Word(siAlphabet, "l1", null);
|
||||
// assertEquals(siAlphabetCvv, w1.getCVVWord());
|
||||
// }
|
||||
//}
|
||||
|
Loading…
Reference in new issue