|
|
|
@ -313,6 +313,17 @@ public class XML_processing {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (c3Content.equals(".") && includeThisBlock) {
|
|
|
|
|
if (stats.getFilter().getNgramValue() == 0){
|
|
|
|
|
int numSentenceParts = 0;
|
|
|
|
|
for(Word w : stavek){
|
|
|
|
|
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
|
|
|
|
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
|
|
|
|
}
|
|
|
|
|
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
|
|
|
|
|
} else if(stats.getFilter().getNgramValue() >= 1) {
|
|
|
|
|
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// add sentence to corpus
|
|
|
|
|
corpus.add(new Sentence(stavek, null));
|
|
|
|
|
// and start a new one
|
|
|
|
@ -637,8 +648,16 @@ public class XML_processing {
|
|
|
|
|
// parser reached end of the current sentence
|
|
|
|
|
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
|
|
|
|
// count all UniGramOccurrences in sentence for statistics
|
|
|
|
|
if (stats.getFilter().getNgramValue() == 0){
|
|
|
|
|
int numSentenceParts = 0;
|
|
|
|
|
for(Word w : sentence){
|
|
|
|
|
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
|
|
|
|
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
|
|
|
|
}
|
|
|
|
|
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
|
|
|
|
} else if(stats.getFilter().getNgramValue() >= 1) {
|
|
|
|
|
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
// add sentence to corpus if it passes filters
|
|
|
|
|
sentence = runFilters(sentence, stats.getFilter());
|
|
|
|
|
|
|
|
|
@ -713,6 +732,7 @@ public class XML_processing {
|
|
|
|
|
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
|
|
|
|
|
boolean inWord = false;
|
|
|
|
|
boolean inPunctuation = false;
|
|
|
|
|
boolean taxonomyMatch = true;
|
|
|
|
|
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
|
|
|
|
String lemma = "";
|
|
|
|
@ -759,9 +779,13 @@ public class XML_processing {
|
|
|
|
|
// keep only taxonomy properties
|
|
|
|
|
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
|
|
|
|
|
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
|
|
|
|
Tax taxonomy = new Tax();
|
|
|
|
|
// Tax taxonomy = new Tax();
|
|
|
|
|
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
|
|
|
|
}
|
|
|
|
|
} else if (qName.equals("bibl")) {
|
|
|
|
|
// a new bibl element starts: reset the taxonomy-match flag to true
|
|
|
|
|
taxonomyMatch = true;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
@ -789,10 +813,21 @@ public class XML_processing {
|
|
|
|
|
|
|
|
|
|
// parser reached end of the current sentence
|
|
|
|
|
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
|
|
|
|
if (stats.getFilter().getNgramValue() == 0){
|
|
|
|
|
int numSentenceParts = 0;
|
|
|
|
|
for(Word w : sentence){
|
|
|
|
|
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
|
|
|
|
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
|
|
|
|
}
|
|
|
|
|
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
|
|
|
|
} else if(stats.getFilter().getNgramValue() >= 1) {
|
|
|
|
|
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// add sentence to corpus if it passes filters
|
|
|
|
|
sentence = runFilters(sentence, stats.getFilter());
|
|
|
|
|
|
|
|
|
|
if (!ValidationUtil.isEmpty(sentence)) {
|
|
|
|
|
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
|
|
|
|
|
corpus.add(new Sentence(sentence, currentFiletaxonomy));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -821,6 +856,19 @@ public class XML_processing {
|
|
|
|
|
|
|
|
|
|
currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
// currentFiletaxonomyLong = new ArrayList<>();
|
|
|
|
|
} else if (endElement.getName().getLocalPart().equals("bibl")) {
|
|
|
|
|
// before proceeding to read this file, make sure that taxonomy filters are a match
|
|
|
|
|
|
|
|
|
|
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
|
|
|
|
|
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
|
|
|
|
|
|
|
|
|
|
if (currentFiletaxonomy.isEmpty()) {
|
|
|
|
|
// taxonomies don't match: flag the mismatch rather than returning
|
|
|
|
|
// return false;
|
|
|
|
|
taxonomyMatch = false;
|
|
|
|
|
// System.out.println("TEST");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
@ -925,7 +973,7 @@ public class XML_processing {
|
|
|
|
|
// keep only taxonomy properties
|
|
|
|
|
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
|
|
|
|
|
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
|
|
|
|
Tax taxonomy = new Tax();
|
|
|
|
|
// Tax taxonomy = new Tax();
|
|
|
|
|
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
|
|
|
|
}
|
|
|
|
|
} else if (qName.equalsIgnoreCase("div")) {
|
|
|
|
@ -1007,6 +1055,17 @@ public class XML_processing {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
|
|
|
|
|
if (stats.getFilter().getNgramValue() == 0){
|
|
|
|
|
int numSentenceParts = 0;
|
|
|
|
|
for(Word w : sentence){
|
|
|
|
|
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
|
|
|
|
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
|
|
|
|
}
|
|
|
|
|
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
|
|
|
|
} else if(stats.getFilter().getNgramValue() >= 1) {
|
|
|
|
|
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// add sentence to corpus if it passes filters
|
|
|
|
|
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
|
|
|
|
|
sentence = runFilters(sentence, stats.getFilter());
|
|
|
|
@ -1040,7 +1099,7 @@ public class XML_processing {
|
|
|
|
|
// disregard this entry if taxonomies don't match
|
|
|
|
|
includeFile = !currentFiletaxonomy.isEmpty();
|
|
|
|
|
|
|
|
|
|
currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
// currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|