@ -67,22 +67,43 @@ public class Ngrams {
multipleKeys = new MultipleHMKeys1 ( key ) ;
break ;
case 1 :
multipleKeys = new MultipleHMKeys2 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ) ;
String k1_2 = wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) )
k1_2 = ( ! k1_2 . equals ( "" ) & & k1_2 . charAt ( k1_2 . length ( ) - 1 ) = = ',' ) ? k1_2 . substring ( 0 , k1_2 . length ( ) - 1 ) : k1_2 ;
multipleKeys = new MultipleHMKeys2 ( key , k1_2 ) ;
break ;
case 2 :
multipleKeys = new MultipleHMKeys3 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ) ;
String k2_2 = wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ;
String k2_3 = wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k2_2 = ( ! k2_2 . equals ( "" ) & & k2_2 . charAt ( k2_2 . length ( ) - 1 ) = = ',' ) ? k2_2 . substring ( 0 , k2_2 . length ( ) - 1 ) : k2_2 ;
k2_3 = ( ! k2_3 . equals ( "" ) & & k2_3 . charAt ( k2_3 . length ( ) - 1 ) = = ',' ) ? k2_3 . substring ( 0 , k2_3 . length ( ) - 1 ) : k2_3 ;
}
multipleKeys = new MultipleHMKeys3 ( key , k2_2 , k2_3 ) ;
break ;
case 3 :
multipleKeys = new MultipleHMKeys4 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ) ;
String k3_2 = wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ;
String k3_3 = wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ;
String k3_4 = wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k3_2 = ( ! k3_2 . equals ( "" ) & & k3_2 . charAt ( k3_2 . length ( ) - 1 ) = = ',' ) ? k3_2 . substring ( 0 , k3_2 . length ( ) - 1 ) : k3_2 ;
k3_3 = ( ! k3_3 . equals ( "" ) & & k3_3 . charAt ( k3_3 . length ( ) - 1 ) = = ',' ) ? k3_3 . substring ( 0 , k3_3 . length ( ) - 1 ) : k3_3 ;
k3_4 = ( ! k3_4 . equals ( "" ) & & k3_4 . charAt ( k3_4 . length ( ) - 1 ) = = ',' ) ? k3_4 . substring ( 0 , k3_4 . length ( ) - 1 ) : k3_4 ;
}
multipleKeys = new MultipleHMKeys4 ( key , k3_2 , k3_3 , k3_4 ) ;
break ;
case 4 :
multipleKeys = new MultipleHMKeys5 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 3 ) ) ) ;
String k4_2 = wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ;
String k4_3 = wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ;
String k4_4 = wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ;
String k4_5 = wordToString ( ngramCandidate , otherKeys . get ( 3 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k4_2 = ( ! k4_2 . equals ( "" ) & & k4_2 . charAt ( k4_2 . length ( ) - 1 ) = = ',' ) ? k4_2 . substring ( 0 , k4_2 . length ( ) - 1 ) : k4_2 ;
k4_3 = ( ! k4_3 . equals ( "" ) & & k4_3 . charAt ( k4_3 . length ( ) - 1 ) = = ',' ) ? k4_3 . substring ( 0 , k4_3 . length ( ) - 1 ) : k4_3 ;
k4_4 = ( ! k4_4 . equals ( "" ) & & k4_4 . charAt ( k4_4 . length ( ) - 1 ) = = ',' ) ? k4_4 . substring ( 0 , k4_4 . length ( ) - 1 ) : k4_4 ;
k4_5 = ( ! k4_5 . equals ( "" ) & & k4_5 . charAt ( k4_5 . length ( ) - 1 ) = = ',' ) ? k4_5 . substring ( 0 , k4_5 . length ( ) - 1 ) : k4_5 ;
}
multipleKeys = new MultipleHMKeys5 ( key , k4_2 , k4_3 , k4_4 , k4_5 ) ;
break ;
default :
multipleKeys = null ;
@ -265,7 +286,7 @@ public class Ngrams {
currentLoop . add ( checkAndModifySkipgramPunctuation ( sentence , i , j , stats ) ) ;
currentLoop . add ( sentence . get ( j ) ) ;
validateAndCountSkipgramCandidate ( currentLoop , stats );
validateAndCountSkipgramCandidate ( currentLoop , stats , s . getTaxonomy ( ) );
} else {
for ( int k = j + 1 ; k < = j + 1 + skip ; k + + ) { // 3gram
if ( ngram = = 3 & & k < sentence . size ( ) ) {
@ -274,7 +295,7 @@ public class Ngrams {
currentLoop . add ( checkAndModifySkipgramPunctuation ( sentence , j , k , stats ) ) ;
currentLoop . add ( sentence . get ( k ) ) ;
validateAndCountSkipgramCandidate ( currentLoop , stats );
validateAndCountSkipgramCandidate ( currentLoop , stats , s . getTaxonomy ( ) );
} else {
for ( int l = k + 1 ; l < = k + 1 + skip ; l + + ) { // 4gram
if ( ngram = = 4 & & l < sentence . size ( ) ) {
@ -284,7 +305,7 @@ public class Ngrams {
currentLoop . add ( checkAndModifySkipgramPunctuation ( sentence , k , l , stats ) ) ;
currentLoop . add ( sentence . get ( l ) ) ;
validateAndCountSkipgramCandidate ( currentLoop , stats );
validateAndCountSkipgramCandidate ( currentLoop , stats , s . getTaxonomy ( ) );
} else {
for ( int m = l + 1 ; m < = l + 1 + skip ; m + + ) { // 5gram
if ( ngram = = 5 & & m < sentence . size ( ) ) {
@ -295,7 +316,7 @@ public class Ngrams {
currentLoop . add ( checkAndModifySkipgramPunctuation ( sentence , l , m , stats ) ) ;
currentLoop . add ( sentence . get ( m ) ) ;
validateAndCountSkipgramCandidate ( currentLoop , stats );
validateAndCountSkipgramCandidate ( currentLoop , stats , s . getTaxonomy ( ) );
}
}
}
@ -308,13 +329,80 @@ public class Ngrams {
}
}
private static void validateAndCountSkipgramCandidate ( ArrayList < Word > skipgramCandidate , StatisticsNew stats ) {
private static void validateAndCountSkipgramCandidate ( ArrayList < Word > skipgramCandidate , StatisticsNew stats , List < String > taxonomy ) {
// count if no regex is set or if it is & candidate passes it
if ( ! stats . getFilter ( ) . hasMsd ( ) | | passesRegex ( skipgramCandidate , stats . getFilter ( ) . getMsd ( ) ) ) {
String key = wordToString ( skipgramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
key = ( key . charAt ( key . length ( ) - 1 ) = = ',' ) ? key . substring ( 0 , key . length ( ) - 1 ) : key ;
stats . updateTaxonomyResults ( new MultipleHMKeys1 ( key ) ,
stats . getCorpus ( ) . getTaxonomy ( ) ) ;
// String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
// key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// stats.updateTaxonomyResults(new MultipleHMKeys1(key),
// stats.getCorpus().getTaxonomy());
ArrayList < CalculateFor > otherKeys = stats . getFilter ( ) . getMultipleKeys ( ) ;
String key = wordToString ( skipgramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
// if last letter is ',' erase it
// if (key.equals("")){
// String test = key;
// }
if ( stats . getFilter ( ) . getNotePunctuations ( ) )
key = ( ! key . equals ( "" ) & & key . charAt ( key . length ( ) - 1 ) = = ',' ) ? key . substring ( 0 , key . length ( ) - 1 ) : key ;
MultipleHMKeys multipleKeys ;
// create MultipleHMKeys for different amount of other keys
switch ( otherKeys . size ( ) ) {
case 0 :
multipleKeys = new MultipleHMKeys1 ( key ) ;
break ;
case 1 :
String k1_2 = wordToString ( skipgramCandidate , otherKeys . get ( 0 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) )
k1_2 = ( ! k1_2 . equals ( "" ) & & k1_2 . charAt ( k1_2 . length ( ) - 1 ) = = ',' ) ? k1_2 . substring ( 0 , k1_2 . length ( ) - 1 ) : k1_2 ;
multipleKeys = new MultipleHMKeys2 ( key , k1_2 ) ;
break ;
case 2 :
String k2_2 = wordToString ( skipgramCandidate , otherKeys . get ( 0 ) ) ;
String k2_3 = wordToString ( skipgramCandidate , otherKeys . get ( 1 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k2_2 = ( ! k2_2 . equals ( "" ) & & k2_2 . charAt ( k2_2 . length ( ) - 1 ) = = ',' ) ? k2_2 . substring ( 0 , k2_2 . length ( ) - 1 ) : k2_2 ;
k2_3 = ( ! k2_3 . equals ( "" ) & & k2_3 . charAt ( k2_3 . length ( ) - 1 ) = = ',' ) ? k2_3 . substring ( 0 , k2_3 . length ( ) - 1 ) : k2_3 ;
}
multipleKeys = new MultipleHMKeys3 ( key , k2_2 , k2_3 ) ;
break ;
case 3 :
String k3_2 = wordToString ( skipgramCandidate , otherKeys . get ( 0 ) ) ;
String k3_3 = wordToString ( skipgramCandidate , otherKeys . get ( 1 ) ) ;
String k3_4 = wordToString ( skipgramCandidate , otherKeys . get ( 2 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k3_2 = ( ! k3_2 . equals ( "" ) & & k3_2 . charAt ( k3_2 . length ( ) - 1 ) = = ',' ) ? k3_2 . substring ( 0 , k3_2 . length ( ) - 1 ) : k3_2 ;
k3_3 = ( ! k3_3 . equals ( "" ) & & k3_3 . charAt ( k3_3 . length ( ) - 1 ) = = ',' ) ? k3_3 . substring ( 0 , k3_3 . length ( ) - 1 ) : k3_3 ;
k3_4 = ( ! k3_4 . equals ( "" ) & & k3_4 . charAt ( k3_4 . length ( ) - 1 ) = = ',' ) ? k3_4 . substring ( 0 , k3_4 . length ( ) - 1 ) : k3_4 ;
}
multipleKeys = new MultipleHMKeys4 ( key , k3_2 , k3_3 , k3_4 ) ;
break ;
case 4 :
String k4_2 = wordToString ( skipgramCandidate , otherKeys . get ( 0 ) ) ;
String k4_3 = wordToString ( skipgramCandidate , otherKeys . get ( 1 ) ) ;
String k4_4 = wordToString ( skipgramCandidate , otherKeys . get ( 2 ) ) ;
String k4_5 = wordToString ( skipgramCandidate , otherKeys . get ( 3 ) ) ;
if ( stats . getFilter ( ) . getNotePunctuations ( ) ) {
k4_2 = ( ! k4_2 . equals ( "" ) & & k4_2 . charAt ( k4_2 . length ( ) - 1 ) = = ',' ) ? k4_2 . substring ( 0 , k4_2 . length ( ) - 1 ) : k4_2 ;
k4_3 = ( ! k4_3 . equals ( "" ) & & k4_3 . charAt ( k4_3 . length ( ) - 1 ) = = ',' ) ? k4_3 . substring ( 0 , k4_3 . length ( ) - 1 ) : k4_3 ;
k4_4 = ( ! k4_4 . equals ( "" ) & & k4_4 . charAt ( k4_4 . length ( ) - 1 ) = = ',' ) ? k4_4 . substring ( 0 , k4_4 . length ( ) - 1 ) : k4_4 ;
k4_5 = ( ! k4_5 . equals ( "" ) & & k4_5 . charAt ( k4_5 . length ( ) - 1 ) = = ',' ) ? k4_5 . substring ( 0 , k4_5 . length ( ) - 1 ) : k4_5 ;
}
multipleKeys = new MultipleHMKeys5 ( key , k4_2 , k4_3 , k4_4 , k4_5 ) ;
break ;
default :
multipleKeys = null ;
}
stats . updateTaxonomyResults ( multipleKeys , taxonomy ) ;
}
}
}