|
|
|
@ -52,7 +52,9 @@ public class XML_processing {
|
|
|
|
|
readXMLGos(path, stats);
|
|
|
|
|
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
|
|
|
|
|
readXMLSolar(path, stats);
|
|
|
|
|
}
|
|
|
|
|
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
|
|
|
|
|
readXMLSSJ500K(path, stats);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
@ -91,6 +93,50 @@ public class XML_processing {
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reads and returns the value of a passed header attribute or an empty string.
|
|
|
|
|
* E.g. body base attribute, for discerning the corpus' type of ssj500k.
|
|
|
|
|
* Notice: returns only the value of the first occurrence of a given tag name.
|
|
|
|
|
*/
|
|
|
|
|
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
|
|
|
|
|
XMLInputFactory factory = XMLInputFactory.newInstance();
|
|
|
|
|
XMLEventReader eventReader = null;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
|
|
|
|
while (eventReader.hasNext()) {
|
|
|
|
|
XMLEvent xmlEvent = eventReader.nextEvent();
|
|
|
|
|
if (xmlEvent.isStartElement()) {
|
|
|
|
|
StartElement startElement = xmlEvent.asStartElement();
|
|
|
|
|
String var = startElement.getName().getLocalPart();
|
|
|
|
|
|
|
|
|
|
if (var.equalsIgnoreCase(tag)) {
|
|
|
|
|
HashMap<String, String> att = extractAttributes(startElement);
|
|
|
|
|
|
|
|
|
|
if (att.containsKey("base")) {
|
|
|
|
|
return att.get("base").substring(0, att.get("base").length() - 12);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return eventReader.nextEvent().asCharacters().getData();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} catch (FileNotFoundException | XMLStreamException e) {
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
} finally {
|
|
|
|
|
if (eventReader != null) {
|
|
|
|
|
try {
|
|
|
|
|
eventReader.close();
|
|
|
|
|
} catch (XMLStreamException e) {
|
|
|
|
|
logger.error("closing stream", e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
|
|
|
|
|
ForkJoinPool pool = new ForkJoinPool();
|
|
|
|
|
|
|
|
|
@ -403,7 +449,9 @@ public class XML_processing {
|
|
|
|
|
|
|
|
|
|
// init results now to avoid null pointers
|
|
|
|
|
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
|
|
|
|
|
} else {
|
|
|
|
|
} else if (corpusType == CorpusType.SSJ500K) {
|
|
|
|
|
headTagName = "bibl";
|
|
|
|
|
} else {
|
|
|
|
|
headTagName = "teiHeader";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -437,7 +485,13 @@ public class XML_processing {
|
|
|
|
|
.replace("#", "");
|
|
|
|
|
|
|
|
|
|
resultTaxonomy.add(tax);
|
|
|
|
|
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
|
|
|
|
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
|
|
|
|
|
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
|
|
|
|
|
.getValue()
|
|
|
|
|
.replace("#", "");
|
|
|
|
|
|
|
|
|
|
resultTaxonomy.add(tax);
|
|
|
|
|
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
|
|
|
|
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
|
|
|
|
resultFilters.get(elementName).add(tagContent);
|
|
|
|
|
}
|
|
|
|
@ -646,6 +700,138 @@ public class XML_processing {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@SuppressWarnings("Duplicates")
|
|
|
|
|
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
|
|
|
|
|
boolean inWord = false;
|
|
|
|
|
boolean inPunctuation = false;
|
|
|
|
|
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
|
|
|
|
String lemma = "";
|
|
|
|
|
String msd = "";
|
|
|
|
|
|
|
|
|
|
List<Word> sentence = new ArrayList<>();
|
|
|
|
|
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
|
|
|
|
|
String sentenceDelimiter = "s";
|
|
|
|
|
|
|
|
|
|
XMLEventReader eventReader = null;
|
|
|
|
|
try {
|
|
|
|
|
XMLInputFactory factory = XMLInputFactory.newInstance();
|
|
|
|
|
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
|
|
|
|
|
|
|
|
|
while (eventReader.hasNext()) {
|
|
|
|
|
XMLEvent event = eventReader.nextEvent();
|
|
|
|
|
|
|
|
|
|
switch (event.getEventType()) {
|
|
|
|
|
case XMLStreamConstants.START_ELEMENT:
|
|
|
|
|
StartElement startElement = event.asStartElement();
|
|
|
|
|
String qName = startElement.getName().getLocalPart();
|
|
|
|
|
|
|
|
|
|
// "word" node
|
|
|
|
|
if (qName.equals("w")) {
|
|
|
|
|
inWord = true;
|
|
|
|
|
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
|
|
|
|
|
System.out.println("MSD written incorrectly");
|
|
|
|
|
}
|
|
|
|
|
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
|
|
|
|
|
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
else if (qName.equals("pc")){
|
|
|
|
|
inPunctuation = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// taxonomy node
|
|
|
|
|
else if (qName.equalsIgnoreCase("term")) {
|
|
|
|
|
// there are some term nodes at the beginning that are of no interest to us
|
|
|
|
|
// they differ by not having the attribute "ref", so test will equal null
|
|
|
|
|
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
|
|
|
|
|
|
|
|
|
|
if (tax != null) {
|
|
|
|
|
// keep only taxonomy properties
|
|
|
|
|
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
|
|
|
|
|
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
|
|
|
|
Tax taxonomy = new Tax();
|
|
|
|
|
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case XMLStreamConstants.CHARACTERS:
|
|
|
|
|
Characters characters = event.asCharacters();
|
|
|
|
|
|
|
|
|
|
// "word" node value
|
|
|
|
|
if (inWord) {
|
|
|
|
|
String word = characters.getData();
|
|
|
|
|
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
|
|
|
|
|
inWord = false;
|
|
|
|
|
}
|
|
|
|
|
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
|
|
|
|
String punctuation = characters.getData();
|
|
|
|
|
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
|
|
|
|
|
inPunctuation = false;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case XMLStreamConstants.END_ELEMENT:
|
|
|
|
|
EndElement endElement = event.asEndElement();
|
|
|
|
|
|
|
|
|
|
String var = endElement.getName().getLocalPart();
|
|
|
|
|
String debug = "";
|
|
|
|
|
|
|
|
|
|
// parser reached end of the current sentence
|
|
|
|
|
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
|
|
|
|
// add sentence to corpus if it passes filters
|
|
|
|
|
sentence = runFilters(sentence, stats.getFilter());
|
|
|
|
|
|
|
|
|
|
if (!ValidationUtil.isEmpty(sentence)) {
|
|
|
|
|
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// and start a new one
|
|
|
|
|
sentence = new ArrayList<>();
|
|
|
|
|
|
|
|
|
|
/* Invoke Fork-Join when we reach maximum limit of
|
|
|
|
|
* sentences (because we can't read everything to
|
|
|
|
|
* memory) or we reach the end of the file.
|
|
|
|
|
*/
|
|
|
|
|
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
|
|
|
|
fj(corpus, stats);
|
|
|
|
|
// empty the current corpus, since we don't need the data anymore
|
|
|
|
|
corpus.clear();
|
|
|
|
|
|
|
|
|
|
// TODO: if (stats.isUseDB()) {
|
|
|
|
|
// stats.storeTmpResultsToDB();
|
|
|
|
|
// }
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// fallback
|
|
|
|
|
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
|
|
|
|
|
// join corpus and stats
|
|
|
|
|
fj(corpus, stats);
|
|
|
|
|
corpus.clear();
|
|
|
|
|
|
|
|
|
|
currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
currentFiletaxonomyLong = new ArrayList<>();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} catch (FileNotFoundException | XMLStreamException e) {
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
} finally {
|
|
|
|
|
if (eventReader != null) {
|
|
|
|
|
try {
|
|
|
|
|
eventReader.close();
|
|
|
|
|
} catch (XMLStreamException e) {
|
|
|
|
|
logger.error("closing stream", e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@SuppressWarnings("Duplicates")
|
|
|
|
|
public static boolean readXMLGos(String path, StatisticsNew stats) {
|
|
|
|
|
boolean inWord = false;
|
|
|
|
@ -853,6 +1039,9 @@ public class XML_processing {
|
|
|
|
|
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
|
|
|
|
fj(corpus, stats);
|
|
|
|
|
corpus.clear();
|
|
|
|
|
|
|
|
|
|
currentFiletaxonomy = new ArrayList<>();
|
|
|
|
|
currentFiletaxonomyLong = new ArrayList<>();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
@ -914,7 +1103,7 @@ public class XML_processing {
|
|
|
|
|
return atts;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
|
|
|
|
|
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
|
|
|
|
|
List<String> wString = new ArrayList<>();
|
|
|
|
|
if (f.getWordParts().contains(CalculateFor.WORD))
|
|
|
|
|
wString.add(word);
|
|
|
|
|