import static org.junit.Assert.* ;
import java.util.* ;
import java.util.concurrent.atomic.AtomicLong ;
import java.util.regex.Pattern ;
import java.util.stream.Collectors ;
import javafx.collections.FXCollections ;
import org.junit.Test ;
import alg.ngram.Ngrams ;
import data.* ;
/**
 * JUnit 4 tests for the {@code Ngrams} calculation algorithm: character-level
 * n-grams, word/lemma/MSD n-grams and skipgrams, driven by the small
 * in-memory corpora supplied by {@code Common} ({@code minCorpus},
 * {@code midCorpus}, {@code midCorpusSkip}).
 *
 * NOTE(review): the contracts of the project types used here (Filter, Corpus,
 * StatisticsNew, MultipleHMKeys) are not visible in this file; comments about
 * corpus contents are derived from the assertions themselves — confirm
 * against {@code Common} if they look wrong.
 */
@SuppressWarnings ( { "Duplicates" , "unused" } )
public class NgramTests {
/**
 * Character-level n-grams: {@code setNgramValue(0)} switches the algorithm
 * to letter mode, and {@code setStringLength(n)} selects the substring
 * length. Verifies substring extraction, skipping of too-short words,
 * frequency counting, and filtering of source words by MSD regexes.
 */
@Test
public void letterNgramsTest ( ) {
Map < String , AtomicLong > result = null ;
// String-level filter extracting 4-letter substrings from whole words.
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setStringLength ( 4 ) ;
filter . setNgramValue ( 0 ) ; // 0 == letter n-grams (not word n-grams)
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
// Minimal corpus descriptor; the actual sentences come from Common.* below.
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "SSJ.T.P.C" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - no regex
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . minCorpus , stats ) ;
result = stats . getResult ( ) ;
// tests:
// - algorithm skips words that are shorter than set length value
// minCorpus apparently contains "junak" (-> "juna", "unak") plus only
// words shorter than 4 letters, hence exactly two 4-grams.
assertEquals ( 2 , result . size ( ) ) ;
assertTrue ( result . containsKey ( "juna" ) ) ;
assertEquals ( 1 , result . get ( "juna" ) . longValue ( ) ) ;
assertTrue ( result . containsKey ( "unak" ) ) ;
assertEquals ( 1 , result . get ( "unak" ) . longValue ( ) ) ;
// tests:
// - map update (count) works ok
// Note: the same Filter instance is mutated and reused for each section,
// so settings carry over unless explicitly overwritten.
filter . setStringLength ( 3 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . getResult ( ) ;
assertEquals ( 2 , result . get ( "ima" ) . longValue ( ) ) ;
// tests:
// - pre-check for the following regex test - this one should include word "ima", next one shouldn't
filter . setStringLength ( 3 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . getResult ( ) ;
assertTrue ( result . containsKey ( "ima" ) ) ;
// tests:
// - regex: S.* // all nouns
ArrayList < Pattern > msdRegex = new ArrayList < > ( ) ;
msdRegex . add ( Pattern . compile ( "S.*" ) ) ;
filter . setMsd ( msdRegex ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . getResult ( ) ;
// "ima" is a verb form, so an S (noun) filter must exclude it.
assertFalse ( result . containsKey ( "ima" ) ) ;
// tests:
// - more precise regex
msdRegex = new ArrayList < > ( ) ;
msdRegex . add ( Pattern . compile ( "S.z.*" ) ) ; // should include "posesti", but not "junak"
filter . setMsd ( msdRegex ) ;
filter . setStringLength ( 5 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . getResult ( ) ;
assertFalse ( result . containsKey ( "junak" ) ) ;
// "posesti" (7 letters) yields three 5-letter substrings.
assertEquals ( 3 , result . size ( ) ) ;
// tests:
// - trickier regex
msdRegex = new ArrayList < > ( ) ;
msdRegex . add ( Pattern . compile ( ".{2}" ) ) ; // should count only for msd="Vd" - "ker"
filter . setMsd ( msdRegex ) ;
filter . setStringLength ( 3 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . getResult ( ) ;
// The regex must match the WHOLE msd (exactly 2 chars), not just a prefix.
assertEquals ( 1 , result . size ( ) ) ;
assertTrue ( result . containsKey ( "ker" ) ) ;
assertEquals ( 1 , result . get ( "ker" ) . longValue ( ) ) ;
}
/**
 * Word-level 3-grams and 2-grams over {@code Common.midCorpus}
 * ("ker ima junak ima posesti", judging by the expected keys), calculated
 * for surface words, lemmas and MSD tags, with and without per-position
 * MSD regex filters. Results are grouped per taxonomy under the "Total" key.
 */
@Test
public void wordsNgramsTest ( ) {
Map < String , Map < MultipleHMKeys , AtomicLong > > taxonomyResult ;
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setNgramValue ( 3 ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
ArrayList < String > mKeys = new ArrayList < > ( ) ;
//mKeys.add("lema");
filter . setMultipleKeys ( mKeys ) ;
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "SSJ.T.P.C" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - normal ngrams - word
// midCorpus contains 5 words which should make for 3 3-grams
filter . setCalculateFor ( CalculateFor . WORD ) ;
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
assertEquals ( 3 , taxonomyResult . get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ker ima junak" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ima junak ima" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak ima posesti" , "" , "" , "" ) ) ) ;
// tests:
// - normal ngrams - lemmas
filter . setCalculateFor ( CalculateFor . LEMMA ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
assertEquals ( 3 , taxonomyResult . get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ker imeti junak" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "imeti junak imeti" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak imeti posest" , "" , "" , "" ) ) ) ;
// tests:
// - normal ngrams - msd
filter . setCalculateFor ( CalculateFor . MORPHOSYNTACTIC_PROPERTY ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
assertEquals ( 3 , taxonomyResult . get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Vd Ggnste-n Somei" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Ggnste-n Somei Ggnste-n" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Somei Ggnste-n Sozem" , "" , "" , "" ) ) ) ;
// tests:
// - ngrams - word - regex filter
// One regex per n-gram position: noun, verb, anything — only
// "junak ima posesti" satisfies all three positions.
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < Pattern > msdRegex = new ArrayList < > ( ) ;
msdRegex . add ( Pattern . compile ( "S.*" ) ) ;
msdRegex . add ( Pattern . compile ( "G.*" ) ) ;
msdRegex . add ( Pattern . compile ( ".*" ) ) ;
filter . setMsd ( msdRegex ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
assertEquals ( 1 , taxonomyResult . get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak ima posesti" , "" , "" , "" ) ) ) ;
// tests:
// - ngrams - word - regex filter
// Bigrams with positional regexes: verb followed by a "Some*" noun.
filter . setCalculateFor ( CalculateFor . WORD ) ;
filter . setNgramValue ( 2 ) ;
msdRegex = new ArrayList < > ( ) ;
msdRegex . add ( Pattern . compile ( "G.*" ) ) ;
msdRegex . add ( Pattern . compile ( "Some.*" ) ) ;
filter . setMsd ( msdRegex ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
assertEquals ( 1 , taxonomyResult . get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult . get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ima junak" , "" , "" , "" ) ) ) ;
}
// NOTE(review): legacy test below targets the old Statistics API and is kept
// commented out together with its recalculate() stub. Consider deleting this
// dead code (and the stub) or porting it to StatisticsNew.
// @Test
// public void ngramsTest() {
// // minimal compliance test
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
//
// Map<String, AtomicLong> results = recalculate(minCorpus, stats);
//
// // 1-gram minCorpusa should equal minCorpus' size
// assertEquals(minCorpus.get(0).getWords().size(), results.size());
//
// // each resulting word should have a frequency of 1
// List<Word> words = minCorpus.get(0).getWords();
// for (int i = 0; i < results.size(); i++) {
// Word w = words.get(i);
// AtomicLong frequency = results.get(w.getMsd());
// assertEquals(1, frequency.intValue());
// }
//
// // repeat for 2grams
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_SPECS);
// results = recalculate(minCorpus, stats);
//
// // 2-gram of a 3 item corpus should equal 2 (first two words and second two words)
// assertEquals(2, results.size());
//
// // add a filter
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
//
// List<String> morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Sozem");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
//
// results = recalculate(minCorpus, stats);
//
// // since min corpus doesn't contain Sozem, results should be empty
// assertEquals(0, results.size());
//
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Somei");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// // since we have 1 Somei, 1 result
// assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue());
//
// // actual filter with wildcards
// // 1gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 1, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// assertEquals(1, results.size());
// assertEquals(1, results.get("Somei").intValue());
//
// // 2gram
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(minCorpus, stats);
//
// assertEquals(1, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
//
// // 2gram midCorpus
// stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, null, CalculateFor.MORPHOSYNTACTIC_PROPERTY);
// morphosyntacticFilter = new ArrayList<>();
// morphosyntacticFilter.add("Ggns*e-n");
// morphosyntacticFilter.add("So***");
// stats.setMorphosyntacticFilter(morphosyntacticFilter);
// results = recalculate(midCorpus, stats);
//
// assertEquals(2, results.size());
// assertEquals(1, results.get("Ggnste-n Somei").intValue());
// assertEquals(1, results.get("Ggnste-n Sozem").intValue());
// }
/**
 * Stub helper for the commented-out legacy test above: the actual
 * recalculation call is disabled, so this currently just returns whatever
 * result the given {@code Statistics} instance already holds.
 */
private Map < String , AtomicLong > recalculate ( List < Sentence > corpus , Statistics stats ) {
// calculateForAll(corpus, stats);
return stats . getResult ( ) ;
}
/**
 * k-skip-n-grams over {@code Common.midCorpusSkip}
 * ("ker ima junak v posesti", judging by the expected sets): plain bigrams,
 * 2-skip bigrams, plain trigrams ({@code setSkipValue(null)} disables
 * skipping again), and 2-skip trigrams. Each section compares the full
 * result key set against the expected set of surface strings.
 */
@Test
public void skipgramsTest ( ) {
Map < String , Map < MultipleHMKeys , AtomicLong > > taxonomyResult ;
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
// NOTE(review): this test seeds the combo taxonomy with a human-readable
// label instead of the "SSJ.T.P.C" code used by the other tests — confirm
// this is intentional.
taxForCombo . add ( "tisk-periodično-časopis" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - bigrams
filter . setNgramValue ( 2 ) ;
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > bigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ima junak" , "junak v" , "v posesti" ) ) ;
Set < MultipleHMKeys > bigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > bigramsActual = new HashSet < > ( bigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( bigrams , bigramsActual ) ;
// test:
// - two skip bigrams
filter . setNgramValue ( 2 ) ;
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > twoSkipBigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ker junak" , "ker v" , "ima junak" , "ima v" , "ima posesti" , "junak v" , "junak posesti" , "v posesti" ) ) ;
Set < MultipleHMKeys > twoSkipBigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipBigramsActual = new HashSet < > ( twoSkipBigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipBigrams , twoSkipBigramsActual ) ;
// tests:
// - trigrams
filter . setNgramValue ( 3 ) ;
filter . setSkipValue ( null ) ; // null disables skipping for this section
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
Set < String > trigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ima junak v" , "junak v posesti" ) ) ;
Set < MultipleHMKeys > trigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > trigramsActual = new HashSet < > ( trigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( trigrams , trigramsActual ) ;
// tests:
// - two skip trigrams
filter . setNgramValue ( 3 ) ;
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
taxonomyResult = stats . getTaxonomyResult ( ) ;
HashSet < String > twoSkipTrigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ker ima v" , "ker ima posesti" , "ker junak v" , "ker junak posesti" , "ker v posesti" , "ima junak v" , "ima junak posesti" , "ima v posesti" , "junak v posesti" ) ) ;
Set < MultipleHMKeys > twoSkipTrigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipTrigramsActual = new HashSet < > ( twoSkipTrigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipTrigrams , twoSkipTrigramsActual ) ;
}
}