Added new ssj500k reading option. Fixed GOS taxonomy
This commit is contained in:
@@ -52,7 +52,9 @@ public class XML_processing {
|
||||
readXMLGos(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
|
||||
readXMLSolar(path, stats);
|
||||
}
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
|
||||
readXMLSSJ500K(path, stats);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -91,6 +93,50 @@ public class XML_processing {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads and returns the value of a passed header attribute or an empty string.
|
||||
* E.g. body base attribute, for discerning the corpus' type of ssj500k.
|
||||
* Notice: returns only the value of the first occurrence of a given tag name.
|
||||
*/
|
||||
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader eventReader = null;
|
||||
|
||||
try {
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = eventReader.nextEvent();
|
||||
if (xmlEvent.isStartElement()) {
|
||||
StartElement startElement = xmlEvent.asStartElement();
|
||||
String var = startElement.getName().getLocalPart();
|
||||
|
||||
if (var.equalsIgnoreCase(tag)) {
|
||||
HashMap<String, String> att = extractAttributes(startElement);
|
||||
|
||||
if (att.containsKey("base")) {
|
||||
return att.get("base").substring(0, att.get("base").length() - 12);
|
||||
}
|
||||
|
||||
|
||||
|
||||
return eventReader.nextEvent().asCharacters().getData();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ForkJoinPool pool = new ForkJoinPool();
|
||||
|
||||
@@ -403,7 +449,9 @@ public class XML_processing {
|
||||
|
||||
// init results now to avoid null pointers
|
||||
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
|
||||
} else {
|
||||
} else if (corpusType == CorpusType.SSJ500K) {
|
||||
headTagName = "bibl";
|
||||
} else {
|
||||
headTagName = "teiHeader";
|
||||
}
|
||||
|
||||
@@ -437,7 +485,13 @@ public class XML_processing {
|
||||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
|
||||
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
|
||||
.getValue()
|
||||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
resultFilters.get(elementName).add(tagContent);
|
||||
}
|
||||
@@ -646,6 +700,138 @@ public class XML_processing {
|
||||
return true;
|
||||
}
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
boolean inPunctuation = false;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
List<Word> sentence = new ArrayList<>();
|
||||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
|
||||
String sentenceDelimiter = "s";
|
||||
|
||||
XMLEventReader eventReader = null;
|
||||
try {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent event = eventReader.nextEvent();
|
||||
|
||||
switch (event.getEventType()) {
|
||||
case XMLStreamConstants.START_ELEMENT:
|
||||
StartElement startElement = event.asStartElement();
|
||||
String qName = startElement.getName().getLocalPart();
|
||||
|
||||
// "word" node
|
||||
if (qName.equals("w")) {
|
||||
inWord = true;
|
||||
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
|
||||
System.out.println("MSD written incorrectly");
|
||||
}
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
}
|
||||
|
||||
else if (qName.equals("pc")){
|
||||
inPunctuation = true;
|
||||
}
|
||||
|
||||
// taxonomy node
|
||||
else if (qName.equalsIgnoreCase("term")) {
|
||||
// there are some term nodes at the beginning that are of no interest to us
|
||||
// they differ by not having the attribute "ref", so test will equal null
|
||||
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
|
||||
|
||||
if (tax != null) {
|
||||
// keep only taxonomy properties
|
||||
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
Tax taxonomy = new Tax();
|
||||
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
Characters characters = event.asCharacters();
|
||||
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
String word = characters.getData();
|
||||
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
|
||||
inWord = false;
|
||||
}
|
||||
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
||||
String punctuation = characters.getData();
|
||||
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
|
||||
inPunctuation = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
EndElement endElement = event.asEndElement();
|
||||
|
||||
String var = endElement.getName().getLocalPart();
|
||||
String debug = "";
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need the data anymore
|
||||
corpus.clear();
|
||||
|
||||
// TODO: if (stats.isUseDB()) {
|
||||
// stats.storeTmpResultsToDB();
|
||||
// }
|
||||
}
|
||||
}
|
||||
// fallback
|
||||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
|
||||
// join corpus and stats
|
||||
fj(corpus, stats);
|
||||
corpus.clear();
|
||||
|
||||
currentFiletaxonomy = new ArrayList<>();
|
||||
currentFiletaxonomyLong = new ArrayList<>();
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLGos(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
@@ -853,6 +1039,9 @@ public class XML_processing {
|
||||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
||||
fj(corpus, stats);
|
||||
corpus.clear();
|
||||
|
||||
currentFiletaxonomy = new ArrayList<>();
|
||||
currentFiletaxonomyLong = new ArrayList<>();
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -914,7 +1103,7 @@ public class XML_processing {
|
||||
return atts;
|
||||
}
|
||||
|
||||
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
|
||||
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
|
||||
List<String> wString = new ArrayList<>();
|
||||
if (f.getWordParts().contains(CalculateFor.WORD))
|
||||
wString.add(word);
|
||||
|
||||
Reference in New Issue
Block a user