import static org.junit.Assert.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import org.junit.Test;
import alg.ngram.Ngrams;
import data.*;
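/**
 * Tests for {@link alg.ngram.Ngrams}: letter n-grams, word/lemma/MSD n-grams
 * (with positional MSD regex filters), and skip-grams over the small test
 * corpora provided by {@code Common}.
 */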
@SuppressWarnings({"Duplicates", "unused"})
public class NgramTests {
    @Test
    public void letterNgramsTest() {
        Map<String, AtomicLong> result = null;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setStringLength(4);
        filter.setNgramValue(0); // 0 = letter n-grams
        filter.setCalculateFor(CalculateFor.WORD);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("SSJ.T.P.C");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - no regex
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.minCorpus, stats);
        result = stats.getResult();

        // tests:
        // - the algorithm skips words shorter than the set string length
        assertEquals(2, result.size());
        assertTrue(result.containsKey("juna"));
        assertEquals(1, result.get("juna").longValue());
        assertTrue(result.containsKey("unak"));
        assertEquals(1, result.get("unak").longValue());

        // tests:
        // - map updates (counting) work correctly
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();
        assertEquals(2, result.get("ima").longValue());

        // tests:
        // - pre-check for the following regex test: this run should include the word "ima", the next one shouldn't
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();
        assertTrue(result.containsKey("ima"));

        // tests:
        // - regex: S.* (all nouns)
        ArrayList<Pattern> msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();
        assertFalse(result.containsKey("ima"));

        // tests:
        // - more precise regex
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
        filter.setMsd(msdRegex);
        filter.setStringLength(5);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();
        assertFalse(result.containsKey("junak"));
        assertEquals(3, result.size());

        // tests:
        // - trickier regex
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile(".{2}")); // should match only the token with msd "Vd" - "ker"
        filter.setMsd(msdRegex);
        filter.setStringLength(3);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        result = stats.getResult();
        assertEquals(1, result.size());
        assertTrue(result.containsKey("ker"));
        assertEquals(1, result.get("ker").longValue());
    }
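
    /*
     * For reference, the letter n-gram mode exercised above amounts to a sliding
     * window of fixed-length substrings over each sufficiently long word (e.g.
     * "junak" with length 4 yields "juna" and "unak"). This is a minimal sketch
     * of that idea, not the actual Ngrams implementation; the helper name and the
     * plain-map counting are illustrative assumptions.
     */
    private static Map<String, AtomicLong> letterGramsSketch(List<String> words, int length) {
        Map<String, AtomicLong> counts = new HashMap<>();
        for (String word : words) {
            // words shorter than the window are skipped, matching the assertion above
            for (int i = 0; i + length <= word.length(); i++) {
                counts.computeIfAbsent(word.substring(i, i + length), k -> new AtomicLong(0)).incrementAndGet();
            }
        }
        return counts;
    }
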
    @Test
    public void wordsNgramsTest() {
        Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setNgramValue(3);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        ArrayList<String> mKeys = new ArrayList<>();
        // mKeys.add("lema");
        filter.setMultipleKeys(mKeys);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("SSJ.T.P.C");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - normal ngrams - word
        // midCorpus contains 5 words, which should make for 3 3-grams
        filter.setCalculateFor(CalculateFor.WORD);
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();
        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

        // tests:
        // - normal ngrams - lemmas
        filter.setCalculateFor(CalculateFor.LEMMA);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();
        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));

        // tests:
        // - normal ngrams - msd
        filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();
        assertEquals(3, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));

        // tests:
        // - trigrams - word - regex filter (one pattern per n-gram position)
        filter.setCalculateFor(CalculateFor.WORD);
        ArrayList<Pattern> msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("S.*"));
        msdRegex.add(Pattern.compile("G.*"));
        msdRegex.add(Pattern.compile(".*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();
        assertEquals(1, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));

        // tests:
        // - bigrams - word - regex filter
        filter.setCalculateFor(CalculateFor.WORD);
        filter.setNgramValue(2);
        msdRegex = new ArrayList<>();
        msdRegex.add(Pattern.compile("G.*"));
        msdRegex.add(Pattern.compile("Some.*"));
        filter.setMsd(msdRegex);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpus, stats);
        taxonomyResult = stats.getTaxonomyResult();
        assertEquals(1, taxonomyResult.get("Total").size());
        assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
    }
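
    /*
     * The word/lemma/MSD n-grams asserted above are plain sliding-window n-grams
     * whose tokens are joined with spaces. A minimal sketch of that windowing,
     * assuming the per-token strings (word form, lemma, or MSD tag) have already
     * been extracted; the helper name is illustrative and this is not the actual
     * Ngrams implementation.
     */
    private static List<String> wordGramsSketch(List<String> tokens, int n) {
        List<String> grams = new ArrayList<>();
        for (int i = 0; i + n <= tokens.size(); i++) {
            // join n consecutive tokens into one key, e.g. "ker ima junak"
            grams.add(String.join(" ", tokens.subList(i, i + n)));
        }
        return grams;
    }
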
    // @Test
    // public void ngramsTest() {
    //     // minimal compliance test
    //     Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
    //
    //     Map<String, AtomicLong> results = recalculate(minCorpus, stats);
    //
    //     // the number of 1-grams of minCorpus should equal minCorpus' size
    //     assertEquals(minCorpus.get(0).getWords().size(), results.size());
    //
    //     // each resulting word should have a frequency of 1
    //     List<Word> words = minCorpus.get(0).getWords();
    //     for (int i = 0; i < results.size(); i++) {
    //         Word w = words.get(i);
    //         AtomicLong frequency = results.get(w.getMsd());
    //         assertEquals(1, frequency.intValue());
    //     }
    //
    //     // repeat for 2-grams
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
    //     results = recalculate(minCorpus, stats);
    //
    //     // 2-grams of a 3-word corpus should yield 2 results (the first two words and the last two words)
    //     assertEquals(2, results.size());
    //
    //     // add a filter
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    //
    //     List<String> morphosyntacticFilter = new ArrayList<>();
    //     morphosyntacticFilter.add("Sozem");
    //     stats.setMorphosyntacticFilter(morphosyntacticFilter);
    //
    //     results = recalculate(minCorpus, stats);
    //
    //     // since minCorpus doesn't contain Sozem, results should be empty
    //     assertEquals(0, results.size());
    //
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    //     morphosyntacticFilter = new ArrayList<>();
    //     morphosyntacticFilter.add("Somei");
    //     stats.setMorphosyntacticFilter(morphosyntacticFilter);
    //     results = recalculate(minCorpus, stats);
    //
    //     // since we have 1 Somei, we expect 1 result
    //     assertEquals(1, results.size());
    //     assertEquals(1, results.get("Somei").intValue());
    //
    //     // actual filter with wildcards
    //     // 1-gram
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    //     morphosyntacticFilter = new ArrayList<>();
    //     morphosyntacticFilter.add("So***");
    //     stats.setMorphosyntacticFilter(morphosyntacticFilter);
    //     results = recalculate(minCorpus, stats);
    //
    //     assertEquals(1, results.size());
    //     assertEquals(1, results.get("Somei").intValue());
    //
    //     // 2-gram
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    //     morphosyntacticFilter = new ArrayList<>();
    //     morphosyntacticFilter.add("Ggns*e-n");
    //     morphosyntacticFilter.add("So***");
    //     stats.setMorphosyntacticFilter(morphosyntacticFilter);
    //     results = recalculate(minCorpus, stats);
    //
    //     assertEquals(1, results.size());
    //     assertEquals(1, results.get("Ggnste-n Somei").intValue());
    //
    //     // 2-gram over midCorpus
    //     stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    //     morphosyntacticFilter = new ArrayList<>();
    //     morphosyntacticFilter.add("Ggns*e-n");
    //     morphosyntacticFilter.add("So***");
    //     stats.setMorphosyntacticFilter(morphosyntacticFilter);
    //     results = recalculate(midCorpus, stats);
    //
    //     assertEquals(2, results.size());
    //     assertEquals(1, results.get("Ggnste-n Somei").intValue());
    //     assertEquals(1, results.get("Ggnste-n Sozem").intValue());
    // }
    // Helper retained for the disabled ngramsTest above; the recalculation call
    // itself is commented out, so this currently just returns the accumulated result.
    private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
        // calculateForAll(corpus, stats);
        return stats.getResult();
    }
    @Test
    public void skipgramsTest() {
        Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;

        Filter filter = new Filter();
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setCalculateFor(CalculateFor.WORD);

        ArrayList<String> tax = new ArrayList<>();
        tax.add("SSJ.T.P.C");
        filter.setTaxonomy(tax);

        Corpus testCorpus = new Corpus();
        testCorpus.setCorpusType(CorpusType.GIGAFIDA);
        testCorpus.setDetectedCorpusFiles(new ArrayList<>());
        ArrayList<String> taxForCombo = new ArrayList<>();
        taxForCombo.add("tisk-periodično-časopis");
        testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));

        // tests:
        // - bigrams
        filter.setNgramValue(2);
        StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();
        Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
        Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> bigramsActual = bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toSet());
        assertEquals(bigrams, bigramsActual);

        // tests:
        // - two-skip bigrams
        filter.setNgramValue(2);
        filter.setSkipValue(2);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();
        Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
        Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> twoSkipBigramsActual = twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toSet());
        assertEquals(twoSkipBigrams, twoSkipBigramsActual);

        // tests:
        // - trigrams
        filter.setNgramValue(3);
        filter.setSkipValue(null);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();
        Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
        Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> trigramsActual = trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toSet());
        assertEquals(trigrams, trigramsActual);

        // tests:
        // - two-skip trigrams
        filter.setNgramValue(3);
        filter.setSkipValue(2);
        stats = new StatisticsNew(testCorpus, filter, false);
        Ngrams.calculateForAll(Common.midCorpusSkip, stats);
        taxonomyResult = stats.getTaxonomyResult();
        Set<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
        Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
        Set<String> twoSkipTrigramsActual = twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toSet());
        assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
    }
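
    /*
     * For reference, the k-skip-n-grams asserted above can be generated by taking
     * each position as a start token and recursively choosing each following token
     * from the next (k + 1) positions, i.e. skipping at most k tokens per gap. With
     * k = 0 this degenerates to plain n-grams, matching the bigram/trigram cases.
     * This is a minimal sketch under those assumptions, not the actual Ngrams
     * implementation; the helper names are illustrative only.
     */
    private static Set<String> skipGramsSketch(List<String> tokens, int n, int k) {
        Set<String> grams = new LinkedHashSet<>();
        for (int i = 0; i < tokens.size(); i++) {
            collectSkipGrams(tokens, i, n - 1, k, tokens.get(i), grams);
        }
        return grams;
    }

    private static void collectSkipGrams(List<String> tokens, int pos, int remaining, int k, String prefix, Set<String> grams) {
        if (remaining == 0) {
            grams.add(prefix);
            return;
        }
        // the next token may skip at most k positions after the current one
        for (int next = pos + 1; next <= pos + 1 + k && next < tokens.size(); next++) {
            collectSkipGrams(tokens, next, remaining - 1, k, prefix + " " + tokens.get(next), grams);
        }
    }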
}