@ -3,9 +3,11 @@ package alg.ngram;
import java.util.ArrayList ;
import java.util.ArrayList ;
import java.util.List ;
import java.util.List ;
import java.util.Set ;
import java.util.regex.Pattern ;
import java.util.regex.Pattern ;
import java.util.stream.Collectors ;
import java.util.stream.Collectors ;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister ;
import data.* ;
import data.* ;
import org.apache.commons.lang3.StringUtils ;
import org.apache.commons.lang3.StringUtils ;
import org.apache.logging.log4j.LogManager ;
import org.apache.logging.log4j.LogManager ;
@ -28,6 +30,9 @@ public class Ngrams {
}
}
public static void generateNgramCandidates ( List < Sentence > corpus , StatisticsNew stats ) {
public static void generateNgramCandidates ( List < Sentence > corpus , StatisticsNew stats ) {
// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
ArrayList < CalculateFor > otherKeys = stats . getFilter ( ) . getMultipleKeys ( ) ;
for ( Sentence s : corpus ) {
for ( Sentence s : corpus ) {
// skip sentences shorter than specified ngram length
// skip sentences shorter than specified ngram length
if ( s . getWords ( ) . size ( ) < stats . getFilter ( ) . getNgramValue ( ) ) {
if ( s . getWords ( ) . size ( ) < stats . getFilter ( ) . getNgramValue ( ) ) {
@ -46,29 +51,62 @@ public class Ngrams {
String key = wordToString ( ngramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
String key = wordToString ( ngramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
// if last letter is ',' erase it
// if last letter is ',' erase it
key = ( key . charAt ( key . length ( ) - 1 ) = = ',' ) ? key . substring ( 0 , key . length ( ) - 1 ) : key ;
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
// if (key.equals("")){
// String test = key;
String lemma = "" ;
// }
String wordType = "" ;
String msd = "" ;
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
for ( CalculateFor otherKey : stats . getFilter ( ) . getMultipleKeys ( ) ) {
if ( otherKey . toString ( ) . equals ( "lema" ) ) {
MultipleHMKeys multipleKeys ;
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
lemma = wordToString ( ngramCandidate , otherKey ) ;
// create MultipleHMKeys for different amount of other keys
} else if ( otherKey . toString ( ) . equals ( "besedna vrsta" ) ) {
switch ( otherKeys . size ( ) ) {
wordType = wordToString ( ngramCandidate , otherKey ) . substring ( 0 , 1 ) ;
case 0 :
} else if ( otherKey . toString ( ) . equals ( "oblikoskladenjska oznaka" ) ) {
multipleKeys = new MultipleHMKeys1 ( key ) ;
msd = wordToString ( ngramCandidate , otherKey ) ;
break ;
}
case 1 :
multipleKeys = new MultipleHMKeys2 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ) ;
break ;
case 2 :
multipleKeys = new MultipleHMKeys3 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ) ;
break ;
case 3 :
multipleKeys = new MultipleHMKeys4 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ) ;
break ;
case 4 :
multipleKeys = new MultipleHMKeys5 ( key , wordToString ( ngramCandidate , otherKeys . get ( 0 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 1 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 2 ) ) ,
wordToString ( ngramCandidate , otherKeys . get ( 3 ) ) ) ;
break ;
default :
multipleKeys = null ;
}
}
// String lemma = "";
// String wordType = "";
// String msd = "";
// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
// if(otherKey.toString().equals("lema")){
// lemma = wordToString(ngramCandidate, otherKey);
// } else if(otherKey.toString().equals("besedna vrsta")){
// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
// msd = wordToString(ngramCandidate, otherKey);
// }
// }
//
// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
MultipleHMKeys multipleKeys = new MultipleHMKeys ( key , lemma , wordType , msd ) ;
// UPDATE TAXONOMY HERE!!!
// UPDATE TAXONOMY HERE!!!
stats . updateTaxonomyResults ( multipleKeys , ngramCandidate . get ( 0 ) . getTaxonomy ( ) ) ;
stats . updateTaxonomyResults ( multipleKeys , s . getTaxonomy ( ) ) ;
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
}
}
}
}
@ -102,26 +140,31 @@ public class Ngrams {
. stream ( )
. stream ( )
. map ( Word : : getLemma )
. map ( Word : : getLemma )
. collect ( Collectors . toList ( ) ) ) ;
. collect ( Collectors . toList ( ) ) ) ;
break ;
return StringUtils . join ( candidate , " " ) ;
case WORD :
case WORD :
candidate . addAll ( ngramCandidate
candidate . addAll ( ngramCandidate
. stream ( )
. stream ( )
. map ( Word : : getWord )
. map ( Word : : getWord )
. collect ( Collectors . toList ( ) ) ) ;
. collect ( Collectors . toList ( ) ) ) ;
break ;
return StringUtils . join ( candidate , " " ) ;
case MORPHOSYNTACTIC_SPECS :
case MORPHOSYNTACTIC_SPECS :
case MORPHOSYNTACTIC_PROPERTY :
case MORPHOSYNTACTIC_PROPERTY :
candidate . addAll ( ngramCandidate
candidate . addAll ( ngramCandidate
. stream ( )
. stream ( )
. map ( Word : : getMsd )
. map ( Word : : getMsd )
. collect ( Collectors . toList ( ) ) ) ;
. collect ( Collectors . toList ( ) ) ) ;
break ;
return StringUtils . join ( candidate , " " ) ;
case WORD_TYPE :
case WORD_TYPE :
candidate . addAll ( ngramCandidate
candidate . addAll ( ngramCandidate
. stream ( )
. stream ( )
. map ( w - > Character . toString ( w . getMsd ( ) . charAt ( 0 ) ) )
. map ( w - > Character . toString ( w . getMsd ( ) . charAt ( 0 ) ) )
. collect ( Collectors . toList ( ) ) ) ;
. collect ( Collectors . toList ( ) ) ) ;
break ;
// candidate.addAll(ngramCandidate
// .stream()
// .map(w -> Character.toString(w.getMsd().charAt(0)))
// .collect(Collectors.toList()));
// .substring(0, 1)
return StringUtils . join ( candidate , " " ) ;
}
}
return StringUtils . join ( candidate , " " ) ;
return StringUtils . join ( candidate , " " ) ;
@ -136,7 +179,7 @@ public class Ngrams {
private static void generateNgramLetterCandidates ( List < Sentence > corpus , StatisticsNew stats ) {
private static void generateNgramLetterCandidates ( List < Sentence > corpus , StatisticsNew stats ) {
for ( Sentence s : corpus ) {
for ( Sentence s : corpus ) {
for ( Word w : s . getWords ( ) ) {
for ( Word w : s . getWords ( ) ) {
List < String > taxonomy = w . getTaxonomy ( ) ;
List < String > taxonomy = s . getTaxonomy ( ) ;
String word = w . getForCf ( stats . getFilter ( ) . getCalculateFor ( ) , stats . getFilter ( ) . isCvv ( ) ) ;
String word = w . getForCf ( stats . getFilter ( ) . getCalculateFor ( ) , stats . getFilter ( ) . isCvv ( ) ) ;
// skip this iteration if:
// skip this iteration if:
@ -152,7 +195,7 @@ public class Ngrams {
for ( int i = 0 ; i < word . length ( ) - stats . getFilter ( ) . getStringLength ( ) + 1 ; i + + ) {
for ( int i = 0 ; i < word . length ( ) - stats . getFilter ( ) . getStringLength ( ) + 1 ; i + + ) {
// TODO: locila?
// TODO: locila?
MultipleHMKeys multipleKeys = new MultipleHMKeys ( word . substring ( i , i + stats . getFilter ( ) . getStringLength ( ) ) ) ;
MultipleHMKeys multipleKeys = new MultipleHMKeys 1 ( word . substring ( i , i + stats . getFilter ( ) . getStringLength ( ) ) ) ;
stats . updateTaxonomyResults ( multipleKeys , taxonomy ) ;
stats . updateTaxonomyResults ( multipleKeys , taxonomy ) ;
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@ -183,8 +226,7 @@ public class Ngrams {
String punctuation = "," ;
String punctuation = "," ;
return new Word ( sentence . get ( i ) . getWord ( ) + punctuation ,
return new Word ( sentence . get ( i ) . getWord ( ) + punctuation ,
sentence . get ( i ) . getLemma ( ) + punctuation ,
sentence . get ( i ) . getLemma ( ) + punctuation ,
sentence . get ( i ) . getMsd ( ) + punctuation ,
sentence . get ( i ) . getMsd ( ) + punctuation ) ;
sentence . get ( i ) . getTaxonomy ( ) ) ;
}
}
}
}
return sentence . get ( i ) ;
return sentence . get ( i ) ;
@ -204,6 +246,10 @@ public class Ngrams {
for ( Sentence s : corpus ) {
for ( Sentence s : corpus ) {
List < Word > sentence = s . getWords ( ) ;
List < Word > sentence = s . getWords ( ) ;
if ( sentence = = null ) {
continue ;
}
for ( int i = 0 ; i < = sentence . size ( ) - ngram ; i + + ) { // 1gram
for ( int i = 0 ; i < = sentence . size ( ) - ngram ; i + + ) { // 1gram
for ( int j = i + 1 ; j < = i + skip + 1 ; j + + ) { // 2gram
for ( int j = i + 1 ; j < = i + skip + 1 ; j + + ) { // 2gram
if ( ngram = = 2 & & j < sentence . size ( ) ) {
if ( ngram = = 2 & & j < sentence . size ( ) ) {
@ -260,7 +306,7 @@ public class Ngrams {
if ( ! stats . getFilter ( ) . hasMsd ( ) | | passesRegex ( skipgramCandidate , stats . getFilter ( ) . getMsd ( ) ) ) {
if ( ! stats . getFilter ( ) . hasMsd ( ) | | passesRegex ( skipgramCandidate , stats . getFilter ( ) . getMsd ( ) ) ) {
String key = wordToString ( skipgramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
String key = wordToString ( skipgramCandidate , stats . getFilter ( ) . getCalculateFor ( ) ) ;
key = ( key . charAt ( key . length ( ) - 1 ) = = ',' ) ? key . substring ( 0 , key . length ( ) - 1 ) : key ;
key = ( key . charAt ( key . length ( ) - 1 ) = = ',' ) ? key . substring ( 0 , key . length ( ) - 1 ) : key ;
stats . updateTaxonomyResults ( new MultipleHMKeys ( key , "" , "" , "" ) ,
stats . updateTaxonomyResults ( new MultipleHMKeys 1 ( key ) ,
stats . getCorpus ( ) . getTaxonomy ( ) ) ;
stats . getCorpus ( ) . getTaxonomy ( ) ) ;
}
}
}
}