import static org.junit.Assert.*;

import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javafx.collections.FXCollections;
import org.junit.Test;

import alg.ngram.Ngrams;
import data.*;

/**
 * Tests for {@link Ngrams}: letter n-grams, word/lemma/MSD n-grams with regex
 * filters, and skipgrams.
 */
@SuppressWarnings({"Duplicates", "unused"})
public class NgramTests {

    @Test
    public void letterNgramsTest() {
        Map<String, AtomicLong> result = null;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setStringLength(4);
        filter.setNgramValue(0); // letters
        filter.setCalculateFor(CalculateFor.WORD);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("SSJ.T.P.C");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - no regex
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.minCorpus, stats);
        result = stats.getResult();

        // tests:
        // - the algorithm skips words that are shorter than the set length value
        assertEquals(2, result.size());
        assertTrue(result.containsKey("juna"));
        assertEquals(1, result.get("juna").longValue());
        assertTrue(result.containsKey("unak"));
        assertEquals(1, result.get("unak").longValue());

        // tests:
        // - map update (count) works ok
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();

        assertEquals(2, result.get("ima").longValue());

        // tests:
        // - pre-check for the regex test below: this run should include the word "ima", the next one shouldn't
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();

        assertTrue(result.containsKey("ima"));

        // tests:
        // - regex: S.* (all nouns)
        ArrayList<Pattern> msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();

        assertFalse(result.containsKey("ima"));

        // tests:
        // - a more precise regex
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
        filter.setMsd(msdRegex);
        filter.setStringLength(5);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();

        assertFalse(result.containsKey("junak"));
        assertEquals(3, result.size());

        // tests:
        // - a trickier regex
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
        filter.setMsd(msdRegex);
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();

        assertEquals(1, result.size());
        assertTrue(result.containsKey("ker"));
        assertEquals(1, result.get("ker").longValue());
    }

    @Test
    public void wordsNgramsTest() {
        Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setNgramValue(3);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        ArrayList<String> mKeys = new ArrayList<>();
        // mKeys.add("lema");
        filter.setMultipleKeys(mKeys);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("SSJ.T.P.C");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - normal ngrams - words
        // midCorpus contains 5 words, which should make for 3 3-grams
        filter.setCalculateFor(CalculateFor.WORD);
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();

        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker ima junak", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak ima", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));

        // tests:
        // - normal ngrams - lemmas
        filter.setCalculateFor(CalculateFor.LEMMA);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();

        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker imeti junak", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("imeti junak imeti", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak imeti posest", "", "", "")));

        // tests:
        // - normal ngrams - msd
        filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();

        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Vd Ggnste-n Somei", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Ggnste-n Somei Ggnste-n", "", "", "")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Somei Ggnste-n Sozem", "", "", "")));

        // tests:
        // - trigrams - word - regex filter
        filter.setCalculateFor(CalculateFor.WORD);
        ArrayList<Pattern> msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.*"));
        msdRegex.add(Pattern.compile("G.*"));
        msdRegex.add(Pattern.compile(".*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();

        assertEquals(1, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));

        // tests:
        // - bigrams - word - regex filter
        filter.setCalculateFor(CalculateFor.WORD);
        filter.setNgramValue(2);
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("G.*"));
        msdRegex.add(Pattern.compile("Some.*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();

        assertEquals(1, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak", "", "", "")));
    }

//    @Test
//    public void ngramsTest() {
//        // minimal compliance test
//        Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
//
//        Map<String, AtomicLong> results = recalculate(minCorpus, stats);
//
//        // the number of 1-grams should equal minCorpus' size
//        assertEquals(minCorpus.get(0).getWords().size(),
//                results.size());
//
//        // each resulting word should have a frequency of 1
//        List<Word> words = minCorpus.get(0).getWords();
//        for (int i = 0; i < results.size(); i++) {
//            Word w = words.get(i);
//            AtomicLong frequency = results.get(w.getMsd());
//            assertEquals(1, frequency.intValue());
//        }
//
//        // repeat for 2-grams
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
//        results = recalculate(minCorpus, stats);
//
//        // a 2-gram over a 3-item corpus should yield 2 results (first two words and last two words)
//        assertEquals(2, results.size());
//
//        // add a filter
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//
//        List<String> morphosyntacticFilter = new ArrayList<>();
//        morphosyntacticFilter.add("Sozem");
//        stats.setMorphosyntacticFilter(morphosyntacticFilter);
//
//        results = recalculate(minCorpus, stats);
//
//        // since minCorpus doesn't contain Sozem, results should be empty
//        assertEquals(0, results.size());
//
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//        morphosyntacticFilter = new ArrayList<>();
//        morphosyntacticFilter.add("Somei");
//        stats.setMorphosyntacticFilter(morphosyntacticFilter);
//        results = recalculate(minCorpus, stats);
//
//        // since we have 1 Somei, 1 result
//        assertEquals(1, results.size());
//        assertEquals(1, results.get("Somei").intValue());
//
//        // an actual filter with wildcards
//        // 1-gram
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//        morphosyntacticFilter = new ArrayList<>();
//        morphosyntacticFilter.add("So***");
//        stats.setMorphosyntacticFilter(morphosyntacticFilter);
//        results = recalculate(minCorpus, stats);
//
//        assertEquals(1, results.size());
//        assertEquals(1, results.get("Somei").intValue());
//
//        // 2-gram
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//        morphosyntacticFilter = new ArrayList<>();
//        morphosyntacticFilter.add("Ggns*e-n");
//        morphosyntacticFilter.add("So***");
//        stats.setMorphosyntacticFilter(morphosyntacticFilter);
//        results = recalculate(minCorpus, stats);
//
//        assertEquals(1, results.size());
//        assertEquals(1, results.get("Ggnste-n Somei").intValue());
//
//        // 2-gram, midCorpus
//        stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//        morphosyntacticFilter = new ArrayList<>();
//        morphosyntacticFilter.add("Ggns*e-n");
//        morphosyntacticFilter.add("So***");
//        stats.setMorphosyntacticFilter(morphosyntacticFilter);
//        results = recalculate(midCorpus, stats);
//
//        assertEquals(2, results.size());
//        assertEquals(1, results.get("Ggnste-n Somei").intValue());
//        assertEquals(1, results.get("Ggnste-n Sozem").intValue());
//    }

    private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
        // calculateForAll(corpus, stats);
        return stats.getResult();
    }

    @Test
    public void skipgramsTest() {
        Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setCalculateFor(CalculateFor.WORD);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("tisk-periodično-časopis");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - bigrams
        filter.setNgramValue(2);
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();

        Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
        Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));

        assertEquals(bigrams, bigramsActual);

        // tests:
        // - two-skip bigrams
        filter.setNgramValue(2);
        filter.setSkipValue(2);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();

        Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
        Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));

        assertEquals(twoSkipBigrams, twoSkipBigramsActual);

        // tests:
        // - trigrams
        filter.setNgramValue(3);
        filter.setSkipValue(null);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();

        Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
        Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));

        assertEquals(trigrams, trigramsActual);

        // tests:
        // - two-skip trigrams
        filter.setNgramValue(3);
        filter.setSkipValue(2);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();

        Set<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
        Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));

        assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
    }
}
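// A minimal sketch (not part of the original test suite, and not the
// implementation in alg.ngram.Ngrams) of the k-skip-n-gram enumeration that
// the expected sets in skipgramsTest encode: pick n token positions in
// increasing order, allowing at most `skip` skipped tokens between consecutive
// picks. The class and method names below are hypothetical illustrations;
// e.g. for the tokens [ker, ima, junak, v, posesti], skipgrams(tokens, 3, 2)
// yields exactly the ten two-skip trigrams asserted above, and skip = 0
// degenerates to plain n-grams.
class SkipgramSketch {

    static List<String> skipgrams(List<String> tokens, int n, int skip) {
        List<String> out = new ArrayList<>();
        collect(tokens, n, skip, 0, true, new ArrayList<>(), out);
        return out;
    }

    private static void collect(List<String> tokens, int n, int skip, int from, boolean first,
                                List<String> current, List<String> out) {
        if (current.size() == n) {
            // a complete skipgram; join the picked tokens with spaces, as the tests expect
            out.add(String.join(" ", current));
            return;
        }
        // the first token may start anywhere; each following token may skip at most `skip` positions
        int limit = first ? tokens.size() - 1 : Math.min(from + skip, tokens.size() - 1);
        for (int i = from; i <= limit; i++) {
            current.add(tokens.get(i));
            collect(tokens, n, skip, i + 1, false, current, out);
            current.remove(current.size() - 1);
        }
    }
}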