Added new ssj500k reading option. Fixed GOS taxonomy

This commit is contained in:
2018-09-03 13:31:41 +02:00
parent 426a9ccc46
commit 1d9e9b7ed6
9 changed files with 280 additions and 40 deletions

View File

@@ -52,7 +52,9 @@ public class XML_processing {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
}
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
readXMLSSJ500K(path, stats);
}
}
/**
@@ -91,6 +93,50 @@ public class XML_processing {
return "";
}
/**
* Reads and returns the value of a passed header attribute or an empty string.
* E.g. body base attribute, for discerning the corpus' type of ssj500k.
* Notice: returns only the value of the first occurrence of a given tag name.
*/
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);
if (att.containsKey("base")) {
return att.get("base").substring(0, att.get("base").length() - 12);
}
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
@@ -403,7 +449,9 @@ public class XML_processing {
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else {
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
@@ -437,7 +485,13 @@ public class XML_processing {
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
@@ -646,6 +700,138 @@ public class XML_processing {
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
else if (qName.equals("pc")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("term")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String var = endElement.getName().getLocalPart();
String debug = "";
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
@@ -853,6 +1039,9 @@ public class XML_processing {
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}
break;
@@ -914,7 +1103,7 @@ public class XML_processing {
return atts;
}
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);