Added fixes to the ssj500k functionality; fixed the prefix/suffix bug and several other bugs.

This commit is contained in:
2018-12-01 10:50:11 +01:00
parent 9efe3d529b
commit ca83cb023b
14 changed files with 530 additions and 162 deletions

View File

@@ -313,6 +313,17 @@ public class XML_processing {
}
if (c3Content.equals(".") && includeThisBlock) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
@@ -637,8 +648,16 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
@@ -713,6 +732,7 @@ public class XML_processing {
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -759,10 +779,14 @@ public class XML_processing {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
} else if (qName.equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
taxonomyMatch = true;
}
break;
case XMLStreamConstants.CHARACTERS:
@@ -789,10 +813,21 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
@@ -821,7 +856,20 @@ public class XML_processing {
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
} else if (endElement.getName().getLocalPart().equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}
break;
}
@@ -925,7 +973,7 @@ public class XML_processing {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
@@ -1007,6 +1055,17 @@ public class XML_processing {
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
@@ -1040,7 +1099,7 @@ public class XML_processing {
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomy = new ArrayList<>();
}
}