Reimplementation of other signs (,/*() etc.) in ngrams.

This commit is contained in:
2018-08-28 11:41:19 +02:00
parent a8d147de52
commit 1c00f1a283
9 changed files with 203 additions and 91 deletions

View File

@@ -260,6 +260,12 @@ public class XML_processing {
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
stavek.add(new Word(c3Content, c3Content, "/"));
}
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
@@ -277,9 +283,6 @@ public class XML_processing {
corpus.clear();
}
}
else if(includeThisBlock){
inPunctuation = true;
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
@@ -296,17 +299,7 @@ public class XML_processing {
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
} else if(inPunctuation){
String punctuation = ",";
if (stavek.size() > 0){
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
}
inPunctuation = false;
}
}
break;
case XMLStreamConstants.END_ELEMENT:
@@ -548,13 +541,16 @@ public class XML_processing {
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
// String punctuation = characters.getData();
String punctuation = ",";
String punctuation = characters.getData();
sentence.add(new Word(punctuation, punctuation, "/"));
inPunctuation = false;
sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
inPunctuation = false;
// String punctuation = ",";
//
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
// inPunctuation = false;
}
break;