import static org.junit.Assert.* ;
import java.util.* ;
import java.util.concurrent.atomic.AtomicLong ;
import java.util.regex.Pattern ;
import java.util.stream.Collectors ;
import javafx.collections.FXCollections ;
import org.junit.Test ;
import alg.ngram.Ngrams ;
import data.* ;
@SuppressWarnings ( { "Duplicates" , "unused" } )
public class NgramTests {
@Test
public void letterNgramsTest() {
    // Letter-ngram analysis (ngramValue == 0) over whole words of a fixed length.
    Filter ngramFilter = new Filter();
    ngramFilter.setAl(AnalysisLevel.STRING_LEVEL);
    ngramFilter.setStringLength(4);
    ngramFilter.setNgramValue(0); // 0 selects letter ngrams
    ngramFilter.setCalculateFor(CalculateFor.WORD);
    ArrayList<String> taxonomy = new ArrayList<>();
    taxonomy.add("SSJ.T.P.C");
    ngramFilter.setTaxonomy(taxonomy);

    Corpus corpus = new Corpus();
    corpus.setCorpusType(CorpusType.GIGAFIDA);
    corpus.setDetectedCorpusFiles(new ArrayList<>());
    ArrayList<String> comboTaxonomy = new ArrayList<>();
    comboTaxonomy.add("SSJ.T.P.C");
    corpus.setTaxonomy(FXCollections.observableArrayList(comboTaxonomy));

    // No msd regex: run over the minimal corpus.
    StatisticsNew statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.minCorpus, statistics);
    Map<String, AtomicLong> counts = statistics.getResult();

    // The algorithm skips words shorter than the configured string length.
    assertEquals(2, counts.size());
    assertTrue(counts.containsKey("juna"));
    assertEquals(1, counts.get("juna").longValue());
    assertTrue(counts.containsKey("unak"));
    assertEquals(1, counts.get("unak").longValue());

    // Repeated occurrences must accumulate in the result map.
    ngramFilter.setStringLength(3);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    counts = statistics.getResult();
    assertEquals(2, counts.get("ima").longValue());

    // Pre-check for the regex tests below: without an msd filter "ima" is present.
    ngramFilter.setStringLength(3);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    counts = statistics.getResult();
    assertTrue(counts.containsKey("ima"));

    // Regex "S.*" (vsi samostalniki / all nouns) must exclude "ima".
    ArrayList<Pattern> msdPatterns = new ArrayList<>();
    msdPatterns.add(Pattern.compile("S.*"));
    ngramFilter.setMsd(msdPatterns);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    counts = statistics.getResult();
    assertFalse(counts.containsKey("ima"));

    // More precise regex "S.z.*": keeps "posesti" but not "junak".
    msdPatterns = new ArrayList<>();
    msdPatterns.add(Pattern.compile("S.z.*"));
    ngramFilter.setMsd(msdPatterns);
    ngramFilter.setStringLength(5);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    counts = statistics.getResult();
    assertFalse(counts.containsKey("junak"));
    assertEquals(3, counts.size());

    // Trickier regex ".{2}": only the two-character msd "Vd" ("ker") qualifies.
    msdPatterns = new ArrayList<>();
    msdPatterns.add(Pattern.compile(".{2}"));
    ngramFilter.setMsd(msdPatterns);
    ngramFilter.setStringLength(3);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    counts = statistics.getResult();
    assertEquals(1, counts.size());
    assertTrue(counts.containsKey("ker"));
    assertEquals(1, counts.get("ker").longValue());
}
@Test
public void wordsNgramsTest() {
    // Word-level 3-gram analysis over the mid-sized test corpus.
    Filter ngramFilter = new Filter();
    ngramFilter.setAl(AnalysisLevel.STRING_LEVEL);
    ngramFilter.setNgramValue(3);
    ArrayList<String> taxonomy = new ArrayList<>();
    taxonomy.add("SSJ.T.P.C");
    ngramFilter.setTaxonomy(taxonomy);
    ArrayList<String> multipleKeys = new ArrayList<>();
    //multipleKeys.add("lema");
    ngramFilter.setMultipleKeys(multipleKeys);

    Corpus corpus = new Corpus();
    corpus.setCorpusType(CorpusType.GIGAFIDA);
    corpus.setDetectedCorpusFiles(new ArrayList<>());
    ArrayList<String> comboTaxonomy = new ArrayList<>();
    comboTaxonomy.add("SSJ.T.P.C");
    corpus.setTaxonomy(FXCollections.observableArrayList(comboTaxonomy));

    // Plain word trigrams: midCorpus has 5 words, which makes 3 3-grams.
    ngramFilter.setCalculateFor(CalculateFor.WORD);
    StatisticsNew statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    Map<String, Map<MultipleHMKeys, AtomicLong>> byTaxonomy = statistics.getTaxonomyResult();
    assertEquals(3, byTaxonomy.get("Total").size());
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

    // Lemma trigrams.
    ngramFilter.setCalculateFor(CalculateFor.LEMMA);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    byTaxonomy = statistics.getTaxonomyResult();
    assertEquals(3, byTaxonomy.get("Total").size());
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));

    // Msd trigrams.
    ngramFilter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    byTaxonomy = statistics.getTaxonomyResult();
    assertEquals(3, byTaxonomy.get("Total").size());
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));

    // Word trigrams restricted by a per-position msd regex filter.
    ngramFilter.setCalculateFor(CalculateFor.WORD);
    ArrayList<Pattern> msdPatterns = new ArrayList<>();
    msdPatterns.add(Pattern.compile("S.*"));
    msdPatterns.add(Pattern.compile("G.*"));
    msdPatterns.add(Pattern.compile(".*"));
    ngramFilter.setMsd(msdPatterns);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    byTaxonomy = statistics.getTaxonomyResult();
    assertEquals(1, byTaxonomy.get("Total").size());
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

    // Word bigrams with a different per-position msd regex filter.
    ngramFilter.setCalculateFor(CalculateFor.WORD);
    ngramFilter.setNgramValue(2);
    msdPatterns = new ArrayList<>();
    msdPatterns.add(Pattern.compile("G.*"));
    msdPatterns.add(Pattern.compile("Some.*"));
    ngramFilter.setMsd(msdPatterns);
    statistics = new StatisticsNew(corpus, ngramFilter, false);
    Ngrams.calculateForAll(Common.midCorpus, statistics);
    byTaxonomy = statistics.getTaxonomyResult();
    assertEquals(1, byTaxonomy.get("Total").size());
    assertTrue(byTaxonomy.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
}
// NOTE(review): ~435 lines of commented-out legacy test code were removed here.
// They duplicated the active letterNgramsTest/wordsNgramsTest/skipgramsTest
// methods above and older tests written against the pre-StatisticsNew
// "Statistics" API. Recover from version control history if ever needed.
// Legacy helper retained from an older revision of these tests; currently unused.
// NOTE(review): the actual recalculation call is commented out below, so this
// method only returns whatever result the given Statistics instance already
// holds — presumably a leftover to be deleted or restored; confirm before reuse.
private Map < String , AtomicLong > recalculate ( List < Sentence > corpus , Statistics stats ) {
// calculateForAll(corpus, stats);
return stats . getResult ( ) ;
}
@Test
public void skipgramsTest ( ) {
Map < String , Map < MultipleHMKeys , AtomicLong > > taxonomyResult ;
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "tisk-periodično-časopis" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - bigrams
filter . setNgramValue ( 2 ) ;
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > bigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ima junak" , "junak v" , "v posesti" ) ) ;
Set < MultipleHMKeys > bigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > bigramsActual = new HashSet < > ( bigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getK1 ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( bigrams , bigramsActual ) ;
// test:
// - two skip bigrams
filter . setNgramValue ( 2 ) ;
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > twoSkipBigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ker junak" , "ker v" , "ima junak" , "ima v" , "ima posesti" , "junak v" , "junak posesti" , "v posesti" ) ) ;
Set < MultipleHMKeys > twoSkipBigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipBigramsActual = new HashSet < > ( twoSkipBigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getK1 ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipBigrams , twoSkipBigramsActual ) ;
// tests:
// - trigrams
filter . setNgramValue ( 3 ) ;
filter . setSkipValue ( null ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > trigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ima junak v" , "junak v posesti" ) ) ;
Set < MultipleHMKeys > trigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > trigramsActual = new HashSet < > ( trigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getK1 ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( trigrams , trigramsActual ) ;
// tests:
// - two skip trigrams
filter . setNgramValue ( 3 ) ;
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
HashSet < String > twoSkipTrigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ker ima v" , "ker ima posesti" , "ker junak v" , "ker junak posesti" , "ker v posesti" , "ima junak v" , "ima junak posesti" , "ima v posesti" , "junak v posesti" ) ) ;
Set < MultipleHMKeys > twoSkipTrigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipTrigramsActual = new HashSet < > ( twoSkipTrigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getK1 ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipTrigrams , twoSkipTrigramsActual ) ;
}
}