// NgramTests — unit tests for n-gram, letter-n-gram and skip-gram statistics.
import static org.junit.Assert.*;
|
|
|
|
import java.util.*;
|
|
import java.util.concurrent.atomic.AtomicLong;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
|
|
import javafx.collections.FXCollections;
|
|
import org.junit.Test;
|
|
|
|
import alg.ngram.Ngrams;
|
|
import data.*;
|
|
|
|
@SuppressWarnings({"Duplicates", "unused"})
|
|
public class NgramTests {
|
|
|
|
@Test
|
|
public void letterNgramsTest() {
|
|
Map<String, AtomicLong> result = null;
|
|
|
|
Filter filter = new Filter();
|
|
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
|
filter.setStringLength(4);
|
|
filter.setNgramValue(0); // letters
|
|
filter.setCalculateFor(CalculateFor.WORD);
|
|
ArrayList<String> tax= new ArrayList<>();
|
|
tax.add("SSJ.T.P.C");
|
|
filter.setTaxonomy(tax);
|
|
|
|
|
|
Corpus testCorpus = new Corpus();
|
|
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
|
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
|
ArrayList<String> taxForCombo = new ArrayList<>();
|
|
taxForCombo.add("SSJ.T.P.C");
|
|
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
|
|
|
// tests:
|
|
// - no regex
|
|
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.minCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
// tests:
|
|
// - algorithm skips words that are shorter than set length value
|
|
assertEquals(2, result.size());
|
|
assertTrue(result.containsKey("juna"));
|
|
assertEquals(1, result.get("juna").longValue());
|
|
assertTrue(result.containsKey("unak"));
|
|
assertEquals(1, result.get("unak").longValue());
|
|
|
|
// tests:
|
|
// - map update (count) works ok
|
|
filter.setStringLength(3);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
assertEquals(2, result.get("ima").longValue());
|
|
|
|
// tests:
|
|
// - pre-check for the following regex test - this one should include word "ima", next one shouldn't
|
|
filter.setStringLength(3);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
assertTrue(result.containsKey("ima"));
|
|
|
|
// tests:
|
|
// - regex: S.* // vsi samostalniki
|
|
ArrayList<Pattern> msdRegex = new ArrayList<>();
|
|
msdRegex.add(Pattern.compile("S.*"));
|
|
filter.setMsd(msdRegex);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
assertFalse(result.containsKey("ima"));
|
|
|
|
// tests:
|
|
// - more precise regex
|
|
msdRegex = new ArrayList<>();
|
|
msdRegex.add(Pattern.compile("S.z.*")); // should include "posesti", but not "junak"
|
|
filter.setMsd(msdRegex);
|
|
filter.setStringLength(5);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
assertFalse(result.containsKey("junak"));
|
|
assertEquals(3, result.size());
|
|
|
|
// tests:
|
|
// - trickier regex
|
|
msdRegex = new ArrayList<>();
|
|
msdRegex.add(Pattern.compile(".{2}")); // should count only for msd="Vd" - "ker"
|
|
filter.setMsd(msdRegex);
|
|
filter.setStringLength(3);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
result = stats.getResult();
|
|
|
|
assertEquals(1, result.size());
|
|
assertTrue(result.containsKey("ker"));
|
|
assertEquals(1, result.get("ker").longValue());
|
|
}
|
|
|
|
@Test
|
|
public void wordsNgramsTest() {
|
|
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
|
|
|
Filter filter = new Filter();
|
|
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
|
filter.setNgramValue(3);
|
|
ArrayList<String> tax= new ArrayList<>();
|
|
tax.add("SSJ.T.P.C");
|
|
filter.setTaxonomy(tax);
|
|
ArrayList<String> mKeys = new ArrayList<>();
|
|
//mKeys.add("lema");
|
|
filter.setMultipleKeys(mKeys);
|
|
|
|
Corpus testCorpus = new Corpus();
|
|
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
|
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
|
ArrayList<String> taxForCombo = new ArrayList<>();
|
|
taxForCombo.add("SSJ.T.P.C");
|
|
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
|
|
|
// tests:
|
|
// - normal ngrams - word
|
|
// midCorpus contains 5 words which should make for 3 3-grams
|
|
filter.setCalculateFor(CalculateFor.WORD);
|
|
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
assertEquals(3, taxonomyResult.get("Total").size());
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker ima junak")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak ima")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
|
|
|
// tests:
|
|
// - normal ngrams - lemmas
|
|
filter.setCalculateFor(CalculateFor.LEMMA);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
assertEquals(3, taxonomyResult.get("Total").size());
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ker imeti junak")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("imeti junak imeti")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak imeti posest")));
|
|
|
|
// tests:
|
|
// - normal ngrams - msd
|
|
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
assertEquals(3, taxonomyResult.get("Total").size());
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Vd Ggnste-n Somei")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Ggnste-n Somei Ggnste-n")));
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("Somei Ggnste-n Sozem")));
|
|
|
|
// tests:
|
|
// - ngrams - word - regex filter
|
|
filter.setCalculateFor(CalculateFor.WORD);
|
|
ArrayList<Pattern> msdRegex = new ArrayList<>();
|
|
msdRegex.add(Pattern.compile("S.*"));
|
|
msdRegex.add(Pattern.compile("G.*"));
|
|
msdRegex.add(Pattern.compile(".*"));
|
|
filter.setMsd(msdRegex);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
assertEquals(1, taxonomyResult.get("Total").size());
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("junak ima posesti")));
|
|
|
|
// tests:
|
|
// - ngrams - word - regex filter
|
|
filter.setCalculateFor(CalculateFor.WORD);
|
|
filter.setNgramValue(2);
|
|
msdRegex = new ArrayList<>();
|
|
msdRegex.add(Pattern.compile("G.*"));
|
|
msdRegex.add(Pattern.compile("Some.*"));
|
|
filter.setMsd(msdRegex);
|
|
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpus, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
assertEquals(1, taxonomyResult.get("Total").size());
|
|
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys1("ima junak")));
|
|
}
|
|
|
|
|
|
// @Test
|
|
// public void ngramsTest() {
|
|
// // minimal compliance test
|
|
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
|
//
|
|
// Map<String, AtomicLong> results = recalculate(minCorpus, stats);
|
|
//
|
|
// // 1-gram minCorpusa should equal minCorpus' size
|
|
// assertEquals(minCorpus.get(0).getWords().size(), results.size());
|
|
//
|
|
// // each resulting word should have a frequency of 1
|
|
// List<Word> words = minCorpus.get(0).getWords();
|
|
// for (int i = 0; i < results.size(); i++) {
|
|
// Word w = words.get(i);
|
|
// AtomicLong frequency = results.get(w.getMsd());
|
|
// assertEquals(1, frequency.intValue());
|
|
// }
|
|
//
|
|
// // repeat for 2grams
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
|
|
// results = recalculate(minCorpus, stats);
|
|
//
|
|
// // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
|
|
// assertEquals(2, results.size());
|
|
//
|
|
// // add a filter
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
//
|
|
// List<String> morphosyntacticFilter = new ArrayList<>();
|
|
// morphosyntacticFilter.add("Sozem");
|
|
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
|
//
|
|
// results = recalculate(minCorpus, stats);
|
|
//
|
|
// // since min corpus doesn't contain Sozem, results should be empty
|
|
// assertEquals(0, results.size());
|
|
//
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
// morphosyntacticFilter = new ArrayList<>();
|
|
// morphosyntacticFilter.add("Somei");
|
|
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
|
// results = recalculate(minCorpus, stats);
|
|
//
|
|
// // since we have 1 Somei, 1 result
|
|
// assertEquals(1, results.size());
|
|
// assertEquals(1, results.get("Somei").intValue());
|
|
//
|
|
// // actual filter with wildcards
|
|
// // 1gram
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
// morphosyntacticFilter = new ArrayList<>();
|
|
// morphosyntacticFilter.add("So***");
|
|
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
|
// results = recalculate(minCorpus, stats);
|
|
//
|
|
// assertEquals(1, results.size());
|
|
// assertEquals(1, results.get("Somei").intValue());
|
|
//
|
|
// // 2gram
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
// morphosyntacticFilter = new ArrayList<>();
|
|
// morphosyntacticFilter.add("Ggns*e-n");
|
|
// morphosyntacticFilter.add("So***");
|
|
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
|
// results = recalculate(minCorpus, stats);
|
|
//
|
|
// assertEquals(1, results.size());
|
|
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
|
//
|
|
// // 2gram midCorpus
|
|
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
|
// morphosyntacticFilter = new ArrayList<>();
|
|
// morphosyntacticFilter.add("Ggns*e-n");
|
|
// morphosyntacticFilter.add("So***");
|
|
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
|
|
// results = recalculate(midCorpus, stats);
|
|
//
|
|
// assertEquals(2, results.size());
|
|
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
|
|
// assertEquals(1, results.get("Ggnste-n Sozem").intValue());
|
|
// }
|
|
|
|
private Map<String, AtomicLong> recalculate(List<Sentence> corpus, Statistics stats) {
|
|
// calculateForAll(corpus, stats);
|
|
return stats.getResult();
|
|
}
|
|
|
|
@Test
|
|
public void skipgramsTest() {
|
|
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
|
|
|
Filter filter = new Filter();
|
|
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
|
filter.setCalculateFor(CalculateFor.WORD);
|
|
ArrayList<String> tax= new ArrayList<>();
|
|
tax.add("SSJ.T.P.C");
|
|
filter.setTaxonomy(tax);
|
|
|
|
Corpus testCorpus = new Corpus();
|
|
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
|
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
|
ArrayList<String> taxForCombo = new ArrayList<>();
|
|
taxForCombo.add("tisk-periodično-časopis");
|
|
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
|
|
|
// tests:
|
|
// - bigrams
|
|
filter.setNgramValue(2);
|
|
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
|
|
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
|
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
|
assertEquals(bigrams, bigramsActual);
|
|
|
|
// test:
|
|
// - two skip bigrams
|
|
filter.setNgramValue(2);
|
|
filter.setSkipValue(2);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
|
|
Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
|
|
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
|
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
|
|
|
assertEquals(twoSkipBigrams, twoSkipBigramsActual);
|
|
|
|
// tests:
|
|
// - trigrams
|
|
filter.setNgramValue(3);
|
|
filter.setSkipValue(null);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
|
|
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
|
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
|
|
|
assertEquals(trigrams, trigramsActual);
|
|
|
|
// tests:
|
|
// - two skip trigrams
|
|
filter.setNgramValue(3);
|
|
filter.setSkipValue(2);
|
|
stats = new StatisticsNew(testCorpus, filter, false);
|
|
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
|
taxonomyResult = stats.getTaxonomyResult();
|
|
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
|
|
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
|
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getK1).collect(Collectors.toList()));
|
|
|
|
assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
|
|
}
|
|
}
|