Added fixes for ssj500k functionality; fixed the prefix/suffix bug and some other bugs.
This commit is contained in:
@@ -313,6 +313,17 @@ public class XML_processing {
|
||||
}
|
||||
|
||||
if (c3Content.equals(".") && includeThisBlock) {
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : stavek){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
|
||||
}
|
||||
|
||||
// add sentence to corpus
|
||||
corpus.add(new Sentence(stavek, null));
|
||||
// and start a new one
|
||||
@@ -637,8 +648,16 @@ public class XML_processing {
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// count all UniGramOccurrences in sentence for statistics
|
||||
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
||||
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : sentence){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
||||
}
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
@@ -713,6 +732,7 @@ public class XML_processing {
|
||||
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
boolean inPunctuation = false;
|
||||
boolean taxonomyMatch = true;
|
||||
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
|
||||
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
@@ -759,10 +779,14 @@ public class XML_processing {
|
||||
// keep only taxonomy properties
|
||||
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
Tax taxonomy = new Tax();
|
||||
// Tax taxonomy = new Tax();
|
||||
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
||||
}
|
||||
}
|
||||
} else if (qName.equals("bibl")) {
|
||||
// before proceeding to read this file, make sure that taxonomy filters are a match
|
||||
taxonomyMatch = true;
|
||||
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
@@ -789,10 +813,21 @@ public class XML_processing {
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : sentence){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
||||
}
|
||||
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomy));
|
||||
}
|
||||
|
||||
@@ -821,7 +856,20 @@ public class XML_processing {
|
||||
|
||||
currentFiletaxonomy = new ArrayList<>();
|
||||
// currentFiletaxonomyLong = new ArrayList<>();
|
||||
}
|
||||
} else if (endElement.getName().getLocalPart().equals("bibl")) {
|
||||
// before proceeding to read this file, make sure that taxonomy filters are a match
|
||||
|
||||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
|
||||
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
|
||||
|
||||
if (currentFiletaxonomy.isEmpty()) {
|
||||
// taxonomies don't match so stop
|
||||
// return false;
|
||||
taxonomyMatch = false;
|
||||
// System.out.println("TEST");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
@@ -925,7 +973,7 @@ public class XML_processing {
|
||||
// keep only taxonomy properties
|
||||
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
Tax taxonomy = new Tax();
|
||||
// Tax taxonomy = new Tax();
|
||||
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
||||
}
|
||||
} else if (qName.equalsIgnoreCase("div")) {
|
||||
@@ -1007,6 +1055,17 @@ public class XML_processing {
|
||||
|
||||
|
||||
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : sentence){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
|
||||
}
|
||||
|
||||
// add sentence to corpus if it passes filters
|
||||
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
@@ -1040,7 +1099,7 @@ public class XML_processing {
|
||||
// disregard this entry if taxonomies don't match
|
||||
includeFile = !currentFiletaxonomy.isEmpty();
|
||||
|
||||
currentFiletaxonomy = new ArrayList<>();
|
||||
// currentFiletaxonomy = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user