Added functionality for n-grams (comma separation), minimal occurrences etc.
This commit is contained in:
@@ -293,13 +293,17 @@ public class XML_processing {
|
||||
|
||||
// "word" node value
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
stavek.add(new Word(characters.getData(), lemma, msd, null));
|
||||
in_word = false;
|
||||
} else if(inPunctuation){
|
||||
String punctuation = ",";
|
||||
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
|
||||
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
|
||||
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
|
||||
|
||||
if (stavek.size() > 0){
|
||||
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
|
||||
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
|
||||
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
|
||||
}
|
||||
|
||||
inPunctuation = false;
|
||||
}
|
||||
break;
|
||||
@@ -652,6 +656,7 @@ public class XML_processing {
|
||||
boolean inOrthDiv = false;
|
||||
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
@@ -718,7 +723,10 @@ public class XML_processing {
|
||||
|
||||
if (tax != null) {
|
||||
// keep only taxonomy properties
|
||||
currentFiletaxonomy.add(String.valueOf(tax.getValue()));
|
||||
String currentFiletaxonomyElement = String.valueOf(tax.getValue());
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
Tax taxonomy = new Tax();
|
||||
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
||||
}
|
||||
} else if (qName.equalsIgnoreCase("div")) {
|
||||
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
|
||||
@@ -730,9 +738,9 @@ public class XML_processing {
|
||||
if (inWord) {
|
||||
Characters characters = event.asCharacters();
|
||||
if (gosType.equals("norm") && msd != null) {
|
||||
sentence.add(new Word(characters.getData(), lemma, msd));
|
||||
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
|
||||
} else {
|
||||
sentence.add(new Word(characters.getData()));
|
||||
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
inWord = false;
|
||||
|
||||
Reference in New Issue
Block a user