Solar grand fix.

This commit is contained in:
2019-01-11 13:00:42 +01:00
parent 9ee5ab9afc
commit 2d7d5169cc
17 changed files with 262 additions and 95 deletions

View File

@@ -306,7 +306,8 @@ public class XML_processing {
List<Sentence> corpus = new ArrayList<>();
// used for filter
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
// Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Set<String> headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
@@ -356,36 +357,36 @@ public class XML_processing {
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3")) {
if (qName.equals("w3") || qName.equals("w1") || qName.equals("w")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3")) {
} else if (qName.equals("c3") || qName.equals("c1") || qName.equals("c")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0) {
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
if (c3Content.equals(".") && includeThisBlock) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
} else if ((qName.equals("st1") && startElement.getAttributeByName(QName.valueOf("tip")).getValue().equals("0")) || qName.equals("s")) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
if(includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
stavek = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
@@ -398,11 +399,28 @@ public class XML_processing {
corpus.clear();
}
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
stavek = new ArrayList<>();
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
} else { // if (headTags.contains(qName)) {
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(qName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(headTag, tagContent);
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
// resultFilters.get(headTag).add(tagContent);
}
}
break;
@@ -429,6 +447,7 @@ public class XML_processing {
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
stavek = new ArrayList<>();
}
// backup
@@ -470,6 +489,9 @@ public class XML_processing {
//noinspection unchecked
for (String value : valueObject) {
pass = validateHeadBlockEntry(readHeadBlock, key, value);
if (pass){
break;
}
}
}
@@ -635,9 +657,22 @@ public class XML_processing {
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
// solar
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
} else if (!parseTaxonomy) {
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(elementName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(headTag).add(tagContent);
}
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {