Taxonomy refactored

This commit is contained in:
2018-11-26 13:41:35 +01:00
parent a7f3bdb925
commit 9efe3d529b
16 changed files with 1173 additions and 491 deletions

View File

@@ -536,8 +536,8 @@ public class XML_processing {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<Taxonomy> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
@@ -578,10 +578,10 @@ public class XML_processing {
if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;
@@ -637,7 +637,7 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomyLong);
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
@@ -645,7 +645,7 @@ public class XML_processing {
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// taxonomyMatch = true;
@@ -713,8 +713,8 @@ public class XML_processing {
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
@@ -757,10 +757,10 @@ public class XML_processing {
if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;
@@ -793,7 +793,7 @@ public class XML_processing {
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// and start a new one
@@ -820,7 +820,7 @@ public class XML_processing {
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
break;
@@ -848,8 +848,8 @@ public class XML_processing {
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
@@ -923,10 +923,10 @@ public class XML_processing {
if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue());
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
@@ -1010,7 +1010,7 @@ public class XML_processing {
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
wordIndex = 0;
@@ -1050,7 +1050,7 @@ public class XML_processing {
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
break;