@ -224,7 +224,8 @@ public class XML_processing {
/**
 * Opening of {@code readXMLSolar}; only the local state declarations are
 * visible in this part of the file.
 * NOTE(review): presumably parses the XML file at {@code path} and records
 * results through {@code stats} — confirm against the full method body.
 *
 * @param path  string handed in by the caller (presumably the XML file location — confirm)
 * @param stats statistics object handed in by the caller
 */
@SuppressWarnings ( "unused" )
public static void readXMLSolar ( String path , StatisticsNew stats ) {
// Flag consulted by the character-data handling before a Word is created.
boolean in_word = false ;
String lemma = "" ;
// Flag consulted by the character-data handling before punctuation is appended.
boolean inPunctuation = false ;
// NOTE(review): `lemma` is declared a second time here; two declarations of
// the same local in one scope do not compile, so one of the two lines is
// presumably a leftover from a diff — confirm against the real file.
String lemma = "" ;
String msd = "" ;
// Words collected so far ("stavek" is Slovenian for "sentence").
List < Word > stavek = new ArrayList < > ( ) ;
@ -275,6 +276,9 @@ public class XML_processing {
corpus . clear ( ) ;
}
}
// Raise the punctuation flag when includeThisBlock is true and the preceding
// branch did not match.
// NOTE(review): the condition this `else` pairs with lies outside this view —
// confirm which element triggers it.
else if ( includeThisBlock ) {
inPunctuation = true ;
}
} else if ( headTags . contains ( qName ) ) {
String tagContent = eventReader . nextEvent ( ) . asCharacters ( ) . getData ( ) ;
headBlock . put ( qName , tagContent ) ;
@ -291,7 +295,13 @@ public class XML_processing {
// When in_word is set, build a Word from the current character data together
// with the current lemma and msd, append it to stavek, and clear the flag.
if ( in_word ) {
stavek . add ( new Word ( characters . getData ( ) , lemma , msd ) ) ;
in_word = false ;
}
} else if ( inPunctuation ) {
// The appended mark is hard-coded to a comma; the character data of the
// current event is not read in this branch.
String punctuation = "," ;
// Append the mark to the word, lemma and msd of the most recently added Word.
// NOTE(review): stavek.get(stavek.size() - 1) throws IndexOutOfBoundsException
// when stavek is empty — confirm that a word always precedes punctuation here.
stavek . get ( stavek . size ( ) - 1 ) . setWord ( stavek . get ( stavek . size ( ) - 1 ) . getWord ( ) + punctuation ) ;
stavek . get ( stavek . size ( ) - 1 ) . setLemma ( stavek . get ( stavek . size ( ) - 1 ) . getLemma ( ) + punctuation ) ;
stavek . get ( stavek . size ( ) - 1 ) . setMsd ( stavek . get ( stavek . size ( ) - 1 ) . getMsd ( ) + punctuation ) ;
// Clear the flag so the mark is appended only once per raise of the flag.
inPunctuation = false ;
}
break ;
case XMLStreamConstants . END_ELEMENT :
@ -472,6 +482,7 @@ public class XML_processing {
/**
 * Opening of {@code readXMLGigafida}; only the local state declarations are
 * visible in this part of the file.
 * NOTE(review): presumably parses the XML file at {@code path} and records
 * results through {@code stats}; the meaning of the boolean return value is
 * not visible here — confirm against the full method body.
 *
 * @param path  string handed in by the caller (presumably the XML file location — confirm)
 * @param stats statistics object handed in by the caller
 */
@SuppressWarnings ( "Duplicates" )
public static boolean readXMLGigafida ( String path , StatisticsNew stats ) {
boolean inWord = false ;
// Set to true when a start element named "c" is encountered further down.
boolean inPunctuation = false ;
ArrayList < String > currentFiletaxonomy = new ArrayList < > ( ) ;
// Passed to each Word created for the sentence further down.
ArrayList < String > currentFiletaxonomyLong = new ArrayList < > ( ) ;
String lemma = "" ;
@ -501,6 +512,11 @@ public class XML_processing {
msd = String . valueOf ( startElement . getAttributeByName ( QName . valueOf ( "msd" ) ) . getValue ( ) ) ;
lemma = String . valueOf ( startElement . getAttributeByName ( QName . valueOf ( "lemma" ) ) . getValue ( ) ) ;
}
// A start element named exactly "c" raises the punctuation flag.
// This is a plain `if`, so the `else if` that follows chains off this test.
// NOTE(review): the only code visible in this method that reads and resets
// inPunctuation is commented out — verify the flag is consumed elsewhere,
// otherwise it stays true after the first "c" element.
if ( qName . equals ( "c" ) ) {
inPunctuation = true ;
}
// taxonomy node
else if ( qName . equalsIgnoreCase ( "catRef" ) ) {
// there are some term nodes at the beginning that are of no interest to us
@ -526,6 +542,14 @@ public class XML_processing {
sentence . add ( new Word ( word , lemma , msd , currentFiletaxonomyLong ) ) ;
inWord = false ;
}
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
//// String punctuation = characters.getData();
// String punctuation = ",";
// sentence.get(sentence.size()-1).setWord(sentence.get(sentence.size()-1).getWord() + punctuation);
// sentence.get(sentence.size()-1).setLemma(sentence.get(sentence.size()-1).getLemma() + punctuation);
// sentence.get(sentence.size()-1).setMsd(sentence.get(sentence.size()-1).getMsd() + punctuation);
// inPunctuation = false;
// }
break ;
case XMLStreamConstants . END_ELEMENT :
@ -604,6 +628,7 @@ public class XML_processing {
/**
 * Opening of {@code readXMLGos}; only the local state declarations are
 * visible in this part of the file.
 * NOTE(review): presumably parses the XML file at {@code path} and records
 * results through {@code stats}; the meaning of the boolean return value is
 * not visible here — confirm against the full method body.
 *
 * @param path  string handed in by the caller (presumably the XML file location — confirm)
 * @param stats statistics object; its corpus is queried for the orth mode below
 */
@SuppressWarnings ( "Duplicates" )
public static boolean readXMLGos ( String path , StatisticsNew stats ) {
boolean inWord = false ;
// NOTE(review): no use of this flag is visible in this chunk — confirm it is read.
boolean inPunctuation = false ;
boolean inOrthDiv = false ;
// Snapshot of the corpus' orth-mode setting, read once up front.
boolean computeForOrth = stats . getCorpus ( ) . isGosOrthMode ( ) ;
ArrayList < String > currentFiletaxonomy = new ArrayList < > ( ) ;