Reimplement handling of other punctuation marks (e.g. `,`, `/`, `*`, `(`, `)`) in n-grams.
This commit is contained in:
@@ -260,6 +260,12 @@ public class XML_processing {
|
||||
} else if (qName.equals("c3")) {
|
||||
String c3Content = eventReader.nextEvent().asCharacters().getData();
|
||||
|
||||
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
|
||||
stavek.size() > 0){
|
||||
stavek.add(new Word(c3Content, c3Content, "/"));
|
||||
|
||||
}
|
||||
|
||||
if (c3Content.equals(".") && includeThisBlock) {
|
||||
// add sentence to corpus
|
||||
corpus.add(new Sentence(stavek, null));
|
||||
@@ -277,9 +283,6 @@ public class XML_processing {
|
||||
corpus.clear();
|
||||
}
|
||||
}
|
||||
else if(includeThisBlock){
|
||||
inPunctuation = true;
|
||||
}
|
||||
} else if (headTags.contains(qName)) {
|
||||
String tagContent = eventReader.nextEvent().asCharacters().getData();
|
||||
headBlock.put(qName, tagContent);
|
||||
@@ -296,17 +299,7 @@ public class XML_processing {
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
in_word = false;
|
||||
} else if(inPunctuation){
|
||||
String punctuation = ",";
|
||||
|
||||
if (stavek.size() > 0){
|
||||
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
|
||||
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
|
||||
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
|
||||
}
|
||||
|
||||
inPunctuation = false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
@@ -548,13 +541,16 @@ public class XML_processing {
|
||||
inWord = false;
|
||||
}
|
||||
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
||||
// String punctuation = characters.getData();
|
||||
String punctuation = ",";
|
||||
String punctuation = characters.getData();
|
||||
sentence.add(new Word(punctuation, punctuation, "/"));
|
||||
inPunctuation = false;
|
||||
|
||||
sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
|
||||
sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
|
||||
sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
|
||||
inPunctuation = false;
|
||||
// String punctuation = ",";
|
||||
//
|
||||
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
|
||||
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
|
||||
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
|
||||
// inPunctuation = false;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user