@ -3,7 +3,9 @@ import static org.junit.Assert.*;
import java.util.* ;
import java.util.concurrent.atomic.AtomicLong ;
import java.util.regex.Pattern ;
import java.util.stream.Collectors ;
import javafx.collections.FXCollections ;
import org.junit.Test ;
import alg.ngram.Ngrams ;
@ -21,10 +23,17 @@ public class NgramTests {
filter . setStringLength ( 4 ) ;
filter . setNgramValue ( 0 ) ; // letters
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "SSJ.T.P.C" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - no regex
@ -103,15 +112,24 @@ public class NgramTests {
/**
 * Checks n-gram extraction over {@code Common.midCorpus} with the n-gram value set to 3.
 *
 * <p>The same filter is reused for three "calculate for" modes (WORD, LEMMA,
 * MORPHOSYNTACTIC_PROPERTY); for each mode a fresh {@code StatisticsNew} is built,
 * {@code Ngrams.calculateForAll} is run, and the produced keys are compared against
 * three expected trigrams. Two further sections assert a single surviving entry; their
 * filter setup is not visible in this excerpt.
 *
 * <p>NOTE(review): this text looks like a diff with the +/- markers stripped. Assertions on
 * {@code result} and on {@code taxonomyResult.get("Total")} appear side by side, and
 * accessors are rendered as {@code get Result( )} / {@code getTaxonomy Result( )} -
 * presumably extraction artifacts; confirm against the real file before editing code.
 */
@Test
public void wordsNgramsTest ( ) {
Map < String , AtomicLong > result = null ;
Map < String , Map < MultipleHMKeys , AtomicLong > > taxonomyResult ;
// Filter shared by every section below: string-level analysis, n-gram value 3.
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setNgramValue ( 3 ) ;
// Taxonomy list handed to the filter (single entry).
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
// Multiple-keys list is intentionally left empty; the "lema" entry below is disabled.
ArrayList < String > mKeys = new ArrayList < > ( ) ;
//mKeys.add("lema");
filter . setMultipleKeys ( mKeys ) ;
// Corpus stub: GIGAFIDA type, no detected files, same taxonomy string as the filter.
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "SSJ.T.P.C" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - normal ngrams - word
@ -119,36 +137,36 @@ public class NgramTests {
filter . setCalculateFor ( CalculateFor . WORD ) ;
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
// Expect exactly three word trigrams; checked on both the flat result and the "Total" map.
assertEquals ( 3 , result . size ( ) ) ;
assertTrue ( result. containsKey ( "ker ima junak" ) ) ;
assertTrue ( result. containsKey ( "ima junak ima" ) ) ;
assertTrue ( result. containsKey ( "junak ima posesti" ) ) ;
assertEquals ( 3 , taxonomyResult. get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ker ima junak" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ima junak ima" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak ima posesti" , "" , "" , "" ) ) ) ;
// tests:
// - normal ngrams - lemmas
filter . setCalculateFor ( CalculateFor . LEMMA ) ;
// A new StatisticsNew is created for each mode rather than reusing the previous one.
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
// Expect exactly three lemma trigrams.
assertEquals ( 3 , result . size ( ) ) ;
assertTrue ( result. containsKey ( "ker imeti junak" ) ) ;
assertTrue ( result. containsKey ( "imeti junak imeti" ) ) ;
assertTrue ( result. containsKey ( "junak imeti posest" ) ) ;
assertEquals ( 3 , taxonomyResult. get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ker imeti junak" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "imeti junak imeti" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak imeti posest" , "" , "" , "" ) ) ) ;
// tests:
// - normal ngrams - msd
filter . setCalculateFor ( CalculateFor . MORPHOSYNTACTIC_PROPERTY ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
// Expect exactly three trigrams built from morphosyntactic tags.
assertEquals ( 3 , result . size ( ) ) ;
assertTrue ( result. containsKey ( "Vd Ggnste-n Somei" ) ) ;
assertTrue ( result. containsKey ( "Ggnste-n Somei Ggnste-n" ) ) ;
assertTrue ( result. containsKey ( "Somei Ggnste-n Sozem" ) ) ;
assertEquals ( 3 , taxonomyResult. get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Vd Ggnste-n Somei" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Ggnste-n Somei Ggnste-n" , "" , "" , "" ) ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "Somei Ggnste-n Sozem" , "" , "" , "" ) ) ) ;
// tests:
// - ngrams - word - regex filter
// NOTE(review): the filter configuration for this section is not shown here (lines elided
// at the hunk boundary below) - confirm the regex setup in the full file.
@ -161,10 +179,10 @@ public class NgramTests {
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
// Only one trigram is expected to remain in this section.
assertEquals ( 1 , result . size ( ) ) ;
assertTrue ( result. containsKey ( "junak ima posesti" ) ) ;
assertEquals ( 1 , taxonomyResult. get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "junak ima posesti" , "" , "" , "" ) ) ) ;
// tests:
// - ngrams - word - regex filter
// NOTE(review): setup for this second regex section is likewise elided; the expected key is
// a two-word string, so presumably the n-gram value is changed in the missing lines - confirm.
@ -177,10 +195,10 @@ public class NgramTests {
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpus , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
// Only one entry is expected to remain in this section.
assertEquals ( 1 , result . size ( ) ) ;
assertTrue ( result. containsKey ( "ima junak" ) ) ;
assertEquals ( 1 , taxonomyResult. get ( "Total" ) . size ( ) ) ;
assertTrue ( taxonomyResult. get ( "Total" ) . containsKey ( new MultipleHMKeys ( "ima junak" , "" , "" , "" ) ) ) ;
}
@ -273,25 +291,32 @@ public class NgramTests {
/**
 * Checks n-gram and skip-gram extraction over {@code Common.midCorpusSkip} for words.
 *
 * <p>Four sections are run, each with a fresh {@code StatisticsNew}: plain bigrams, bigrams
 * with skip value 2, a section with the skip value reset to {@code null} that expects
 * trigrams, and a section with skip value 2 that expects the larger trigram set. In every
 * section the expected strings are compared, as a set, with the keys that were produced.
 *
 * <p>NOTE(review): this text looks like a diff with the +/- markers stripped. Each
 * {@code ...Actual} set is declared twice (once from {@code result.keySet()} and once from
 * the "Total" taxonomy map), which would not compile as written - presumably the first
 * declaration of each pair is the removed line; confirm against the real file.
 */
@Test
public void skipgramsTest ( ) {
Map < String , AtomicLong > result = null ;
Map < String , Map < MultipleHMKeys , AtomicLong > > taxonomyResult ;
// Filter shared by every section below: string-level analysis on words.
Filter filter = new Filter ( ) ;
filter . setAl ( AnalysisLevel . STRING_LEVEL ) ;
filter . setCalculateFor ( CalculateFor . WORD ) ;
ArrayList < String > tax = new ArrayList < > ( ) ;
tax . add ( "SSJ.T.P.C" ) ;
filter . setTaxonomy ( tax ) ;
// Corpus stub: GIGAFIDA type, no detected files.
Corpus testCorpus = new Corpus ( ) ;
testCorpus . setCorpusType ( CorpusType . GIGAFIDA ) ;
testCorpus . setDetectedCorpusFiles ( new ArrayList < > ( ) ) ;
// NOTE(review): the corpus taxonomy string here differs from the filter's "SSJ.T.P.C",
// whereas the other tests in this excerpt use "SSJ.T.P.C" for both - confirm this is intended.
ArrayList < String > taxForCombo = new ArrayList < > ( ) ;
taxForCombo . add ( "tisk-periodično-časopis" ) ;
testCorpus . setTaxonomy ( FXCollections . observableArrayList ( taxForCombo ) ) ;
// tests:
// - bigrams
filter . setNgramValue ( 2 ) ;
StatisticsNew stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
Set < String > bigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ima junak" , "junak v" , "v posesti" ) ) ;
Set < String > bigramsActual = result . keySet ( ) ;
// Take the keys of the "Total" map and reduce each MultipleHMKeys to its getKey() string.
Set < MultipleHMKeys > bigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > bigramsActual = new HashSet < > ( bigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
// Set equality: order is irrelevant, and no extra or missing bigram is tolerated.
assertEquals ( bigrams , bigramsActual ) ;
// test:
@ -300,10 +325,11 @@ public class NgramTests {
// Bigrams with skip value 2: the expected set below also pairs words up to two positions apart.
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
Set < String > twoSkipBigrams = new HashSet < > ( Arrays . asList ( "ker ima" , "ker junak" , "ker v" , "ima junak" , "ima v" , "ima posesti" , "junak v" , "junak posesti" , "v posesti" ) ) ;
Set < String > twoSkipBigramsActual = result . keySet ( ) ;
Set < MultipleHMKeys > twoSkipBigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipBigramsActual = new HashSet < > ( twoSkipBigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipBigrams , twoSkipBigramsActual ) ;
@ -313,9 +339,10 @@ public class NgramTests {
// Skip value reset to null; the expected set holds three-word strings.
// NOTE(review): the change of the n-gram value to 3 is not visible here - presumably it is
// in the lines elided at the hunk boundary above; confirm.
filter . setSkipValue ( null ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
Set < String > trigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ima junak v" , "junak v posesti" ) ) ;
Set < String > trigramsActual = result . keySet ( ) ;
Set < MultipleHMKeys > trigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > trigramsActual = new HashSet < > ( trigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( trigrams , trigramsActual ) ;
@ -325,9 +352,10 @@ public class NgramTests {
// Three-word strings with skip value 2: ten expected combinations.
filter . setSkipValue ( 2 ) ;
stats = new StatisticsNew ( testCorpus , filter , false ) ;
Ngrams . calculateForAll ( Common . midCorpusSkip , stats ) ;
result = stats . get Result( ) ;
taxonomyResult = stats . getTaxonomy Result( ) ;
HashSet < String > twoSkipTrigrams = new HashSet < > ( Arrays . asList ( "ker ima junak" , "ker ima v" , "ker ima posesti" , "ker junak v" , "ker junak posesti" , "ker v posesti" , "ima junak v" , "ima junak posesti" , "ima v posesti" , "junak v posesti" ) ) ;
Set < String > twoSkipTrigramsActual = result . keySet ( ) ;
Set < MultipleHMKeys > twoSkipTrigramsMultipleHMKeys = taxonomyResult . get ( "Total" ) . keySet ( ) ;
Set < String > twoSkipTrigramsActual = new HashSet < > ( twoSkipTrigramsMultipleHMKeys . stream ( ) . map ( MultipleHMKeys : : getKey ) . collect ( Collectors . toList ( ) ) ) ;
assertEquals ( twoSkipTrigrams , twoSkipTrigramsActual ) ;
}