Solar grand fix.
This commit is contained in:
@@ -306,7 +306,8 @@ public class XML_processing {
|
||||
List<Sentence> corpus = new ArrayList<>();
|
||||
|
||||
// used for filter
|
||||
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
|
||||
// Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
|
||||
Set<String> headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
|
||||
Map<String, String> headBlock = null;
|
||||
boolean includeThisBlock = false;
|
||||
|
||||
@@ -356,36 +357,36 @@ public class XML_processing {
|
||||
String qName = startElement.getName().getLocalPart();
|
||||
|
||||
// "word" node
|
||||
if (qName.equals("w3")) {
|
||||
if (qName.equals("w3") || qName.equals("w1") || qName.equals("w")) {
|
||||
in_word = true;
|
||||
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
} else if (qName.equals("c3")) {
|
||||
} else if (qName.equals("c3") || qName.equals("c1") || qName.equals("c")) {
|
||||
String c3Content = eventReader.nextEvent().asCharacters().getData();
|
||||
|
||||
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
|
||||
stavek.size() > 0){
|
||||
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
|
||||
stavek.size() > 0) {
|
||||
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
|
||||
|
||||
}
|
||||
|
||||
if (c3Content.equals(".") && includeThisBlock) {
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : stavek){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
|
||||
} else if ((qName.equals("st1") && startElement.getAttributeByName(QName.valueOf("tip")).getValue().equals("0")) || qName.equals("s")) {
|
||||
if (stats.getFilter().getNgramValue() == 0){
|
||||
int numSentenceParts = 0;
|
||||
for(Word w : stavek){
|
||||
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
|
||||
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
|
||||
}
|
||||
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
|
||||
} else if(stats.getFilter().getNgramValue() >= 1) {
|
||||
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
|
||||
}
|
||||
|
||||
if(includeThisBlock) {
|
||||
// add sentence to corpus
|
||||
corpus.add(new Sentence(stavek, null));
|
||||
// and start a new one
|
||||
stavek = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
@@ -398,11 +399,28 @@ public class XML_processing {
|
||||
corpus.clear();
|
||||
}
|
||||
}
|
||||
} else if (headTags.contains(qName)) {
|
||||
String tagContent = eventReader.nextEvent().asCharacters().getData();
|
||||
headBlock.put(qName, tagContent);
|
||||
stavek = new ArrayList<>();
|
||||
} else if (qName.equals("head")) {
|
||||
headBlock = new HashMap<>();
|
||||
} else { // if (headTags.contains(qName)) {
|
||||
boolean inHeadTags = false;
|
||||
String headTag = "";
|
||||
for (String tag : headTags){
|
||||
if(I18N.getDefaultLocaleItem(tag).equals(qName)){
|
||||
inHeadTags = true;
|
||||
headTag = tag;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(inHeadTags) {
|
||||
String tagContent = eventReader.nextEvent().asCharacters().getData();
|
||||
headBlock.put(headTag, tagContent);
|
||||
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
// resultFilters.get(headTag).add(tagContent);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -429,6 +447,7 @@ public class XML_processing {
|
||||
} else if (qNameEnd.equals("body")) {
|
||||
// new block, reset filter status
|
||||
includeThisBlock = false;
|
||||
stavek = new ArrayList<>();
|
||||
}
|
||||
|
||||
// backup
|
||||
@@ -470,6 +489,9 @@ public class XML_processing {
|
||||
//noinspection unchecked
|
||||
for (String value : valueObject) {
|
||||
pass = validateHeadBlockEntry(readHeadBlock, key, value);
|
||||
if (pass){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -635,9 +657,22 @@ public class XML_processing {
|
||||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
resultFilters.get(elementName).add(tagContent);
|
||||
// solar
|
||||
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
} else if (!parseTaxonomy) {
|
||||
boolean inHeadTags = false;
|
||||
String headTag = "";
|
||||
for (String tag : headTags){
|
||||
if(I18N.getDefaultLocaleItem(tag).equals(elementName)){
|
||||
inHeadTags = true;
|
||||
headTag = tag;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(inHeadTags) {
|
||||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
resultFilters.get(headTag).add(tagContent);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
|
||||
Reference in New Issue
Block a user