Added functionality for n-grams (comma separation), minimal occurances etc.

This commit is contained in:
2018-07-31 08:58:17 +02:00
parent 681eb4f949
commit 179f09c4bd
27 changed files with 405 additions and 4962 deletions

View File

@@ -293,13 +293,17 @@ public class XML_processing {
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
stavek.add(new Word(characters.getData(), lemma, msd, null));
in_word = false;
} else if(inPunctuation){
String punctuation = ",";
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
if (stavek.size() > 0){
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
}
inPunctuation = false;
}
break;
@@ -652,6 +656,7 @@ public class XML_processing {
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
@@ -718,7 +723,10 @@ public class XML_processing {
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()));
String currentFiletaxonomyElement = String.valueOf(tax.getValue());
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
@@ -730,9 +738,9 @@ public class XML_processing {
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd));
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
} else {
sentence.add(new Word(characters.getData()));
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
}
inWord = false;