package alg;

import static data.Enums.solar.SolarFilters.*;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;

import org.apache.logging.log4j.LogManager;

import data.*;
import gui.ValidationUtil;

public class XML_processing {
    public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);

    // public static void processCorpus(Statistics stats) {
    //     // we can preset the list's size, so there won't be a need to resize it
    //     List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
    //
    //     int i = 0;
    //     for (File f : Settings.corpus) {
    //         i++;
    //         readXML(f.toString(), stats);
    //     }
    // }

    // public static void readXML(String path, Statistics stats) {
    //     if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
    //         readXMLGigafida(path, stats);
    //     } else if (stats.getCorpusType() == CorpusType.GOS) {
    //         readXMLGos(path, stats);
    //     } else if (stats.getCorpusType() == CorpusType.SOLAR) {
    //         readXMLSolar(path, stats);
    //     }
    // }

    public static void readXML(String path, StatisticsNew stats) {
        if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
                || stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
            readXMLGigafida(path, stats);
        } else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
            readXMLGos(path, stats);
        } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
            readXMLSolar(path, stats);
        }
    }
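
    // Illustrative sketch (not part of the original pipeline): shows how the
    // dispatcher above is typically driven over a list of corpus files, in the
    // spirit of the commented-out processCorpus() above. The method name and
    // the List<java.io.File> parameter are assumptions made for this example.
    public static void readCorpusFiles(List<java.io.File> corpusFiles, StatisticsNew stats) {
        for (java.io.File f : corpusFiles) {
            // each file is parsed with the reader that matches the corpus type
            readXML(f.getAbsolutePath(), stats);
        }
    }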

    /**
     * Reads and returns the value of a passed header tag or an empty string.
     * E.g. title tag, for discerning the corpus' type.
     * Notice: returns only the value of the first occurrence of a given tag name.
     */
    public static String readXMLHeaderTag(String path, String tag) {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLEventReader eventReader = null;

        try {
            eventReader = factory.createXMLEventReader(new FileInputStream(path));
            while (eventReader.hasNext()) {
                XMLEvent xmlEvent = eventReader.nextEvent();
                if (xmlEvent.isStartElement()) {
                    StartElement startElement = xmlEvent.asStartElement();
                    String var = startElement.getName().getLocalPart();

                    if (var.equalsIgnoreCase(tag)) {
                        return eventReader.nextEvent().asCharacters().getData();
                    }
                }
            }
        } catch (FileNotFoundException | XMLStreamException e) {
            e.printStackTrace();
        } finally {
            if (eventReader != null) {
                try {
                    eventReader.close();
                } catch (XMLStreamException e) {
                    logger.error("closing stream", e);
                }
            }
        }
        return "";
    }
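
    // Minimal usage sketch for readXMLHeaderTag(); the file path and the tag
    // name below are illustrative assumptions, not values used elsewhere in
    // this project.
    private static void printCorpusTitle() {
        String title = readXMLHeaderTag("/path/to/corpus.xml", "title");

        if (title.isEmpty()) {
            logger.info("no title tag found in header");
        } else {
            logger.info("corpus title: " + title);
        }
    }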

    private static void fj(List<Sentence> corpus, StatisticsNew stats) {
        ForkJoinPool pool = new ForkJoinPool();

        if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
            alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
            pool.invoke(wc);
        } else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
            alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
            pool.invoke(wc);
        } else {
            // TODO:
            // alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
            // pool.invoke(wc);
        }
    }

    // public static void readXMLGos(String path, Statistics stats) {
    //     boolean in_word = false;
    //     String taksonomija = "";
    //     String lemma = "";
    //     String msd = "";
    //     String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
    //
    //     List<Word> stavek = new ArrayList<>();
    //     List<Sentence> corpus = new ArrayList<>();
    //     String sentenceDelimiter = "seg";
    //     String taxonomyPrefix = "gos.";
    //
    //     try {
    //         XMLInputFactory factory = XMLInputFactory.newInstance();
    //         XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
    //
    //         while (eventReader.hasNext()) {
    //             XMLEvent event = eventReader.nextEvent();
    //
    //             switch (event.getEventType()) {
    //                 case XMLStreamConstants.START_ELEMENT:
    //
    //                     StartElement startElement = event.asStartElement();
    //                     String qName = startElement.getName().getLocalPart();
    //
    //                     // "word" node
    //                     if (qName.equals("w")) {
    //                         in_word = true;
    //
    //                         if (type.equals("norm")) {
    //                             // make sure we're looking at <w lemma...> and not <w type...>
    //                             Iterator var = startElement.getAttributes();
    //                             ArrayList<Object> attributes = new ArrayList<>();
    //                             while (var.hasNext()) {
    //                                 attributes.add(var.next());
    //                             }
    //
    //                             if (attributes.contains("msd")) {
    //                                 msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
    //                             } else {
    //                                 msd = null;
    //                             }
    //
    //                             if (attributes.contains("lemma")) {
    //                                 lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
    //                             }
    //                         }
    //                     }
    //                     // taxonomy node
    //                     else if (qName.equalsIgnoreCase("catRef")) {
    //                         // there are some term nodes at the beginning that are of no interest to us
    //                         // they differ by not having the attribute "ref", so test will equal null
    //                         Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
    //
    //                         if (test != null) {
    //                             // keep only taxonomy properties
    //                             taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
    //                         }
    //                     } else if (qName.equalsIgnoreCase("div")) {
    //                         type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
    //                     }
    //                     break;
    //
    //                 case XMLStreamConstants.CHARACTERS:
    //                     Characters characters = event.asCharacters();
    //
    //                     // "word" node value
    //                     if (in_word) {
    //                         if (type.equals("norm") && msd != null) {
    //                             stavek.add(new Word(characters.getData(), lemma, msd));
    //                         } else {
    //                             stavek.add(new Word(characters.getData()));
    //                         }
    //
    //                         in_word = false;
    //                     }
    //                     break;
    //
    //                 case XMLStreamConstants.END_ELEMENT:
    //                     EndElement endElement = event.asEndElement();
    //
    //                     // parser reached end of the current sentence
    //                     if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
    //                         // add sentence to corpus
    //                         corpus.add(new Sentence(stavek, taksonomija, type));
    //                         // and start a new one
    //                         stavek = new ArrayList<>();
    //
    //                         /* Invoke Fork-Join when we reach maximum limit of
    //                          * sentences (because we can't read everything to
    //                          * memory) or we reach the end of the file.
    //                          */
    //                         if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
    //                             fj(corpus, stats);
    //                             // empty the current corpus, since we don't need
    //                             // the data anymore
    //                             corpus.clear();
    //                         }
    //                     }
    //
    //                     // backup
    //                     if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
    //                         fj(corpus, stats);
    //                         corpus.clear();
    //                     }
    //
    //                     break;
    //             }
    //         }
    //     } catch (FileNotFoundException | XMLStreamException e) {
    //         e.printStackTrace();
    //     }
    // }

    @SuppressWarnings("unused")
    public static void readXMLSolar(String path, StatisticsNew stats) {
        boolean in_word = false;
        boolean inPunctuation = false;
        String lemma = "";
        String msd = "";

        List<Word> stavek = new ArrayList<>();
        List<Sentence> corpus = new ArrayList<>();

        // used for filter
        Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
        Map<String, String> headBlock = null;
        boolean includeThisBlock = false;

        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));

            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();

                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:

                        StartElement startElement = event.asStartElement();
                        // System.out.println(String.format("%s", startElement.toString()));
                        String qName = startElement.getName().getLocalPart();

                        // "word" node
                        if (qName.equals("w3")) {
                            in_word = true;

                            msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                            lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                        } else if (qName.equals("c3")) {
                            String c3Content = eventReader.nextEvent().asCharacters().getData();

                            if (c3Content.equals(".") && includeThisBlock) {
                                // add sentence to corpus
                                corpus.add(new Sentence(stavek));
                                // and start a new one
                                stavek = new ArrayList<>();

                                /* Invoke Fork-Join when we reach maximum limit of
                                 * sentences (because we can't read everything to
                                 * memory) or we reach the end of the file.
                                 */
                                if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                    fj(corpus, stats);
                                    // empty the current corpus, since we don't need
                                    // the data anymore
                                    corpus.clear();
                                }
                            } else if (includeThisBlock) {
                                inPunctuation = true;
                            }
                        } else if (headTags.contains(qName)) {
                            String tagContent = eventReader.nextEvent().asCharacters().getData();
                            headBlock.put(qName, tagContent);
                        } else if (qName.equals("head")) {
                            headBlock = new HashMap<>();
                        }

                        break;

                    case XMLStreamConstants.CHARACTERS:
                        Characters characters = event.asCharacters();

                        // "word" node value
                        if (in_word) {
                            stavek.add(new Word(characters.getData(), lemma, msd));
                            in_word = false;
                        } else if (inPunctuation) {
                            String punctuation = ",";
                            stavek.get(stavek.size() - 1).setWord(stavek.get(stavek.size() - 1).getWord() + punctuation);
                            stavek.get(stavek.size() - 1).setLemma(stavek.get(stavek.size() - 1).getLemma() + punctuation);
                            stavek.get(stavek.size() - 1).setMsd(stavek.get(stavek.size() - 1).getMsd() + punctuation);
                            inPunctuation = false;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        EndElement endElement = event.asEndElement();
                        String qNameEnd = endElement.getName().getLocalPart();

                        if (qNameEnd.equals("head")) {
                            // validate and set boolean
                            if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
                                includeThisBlock = true;
                            }
                        } else if (qNameEnd.equals("body")) {
                            // new block, reset filter status
                            includeThisBlock = false;
                        }

                        // backup
                        if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
                            fj(corpus, stats);
                            corpus.clear();
                        }

                        break;
                }
            }
        } catch (FileNotFoundException | XMLStreamException e) {
            e.printStackTrace();
        }
    }

    /**
     * @param readHeadBlock block of tags read from the corpus
     * @param userSetFilter tags with values set by the user
     *
     * @return true if the read head block satisfies every user-set filter, false otherwise
     */
    private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
        boolean pass = true;

        if (userSetFilter == null) {
            return true;
        }

        for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
            String key = filterEntry.getKey();
            HashSet<String> valueObject = filterEntry.getValue();

            // if (valueObject instanceof String) {
            //     pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
            // } else
            if (valueObject != null && !valueObject.isEmpty()) {
                // the head block passes this filter if it matches any of the allowed values
                pass = false;
                for (String value : valueObject) {
                    if (validateHeadBlockEntry(readHeadBlock, key, value)) {
                        pass = true;
                        break;
                    }
                }
            }

            if (!pass) {
                // current head block does not include one of the set filters - not likely, but an edge case anyway
                return false;
            }
        }

        // if it gets to this point, it passed all the filters
        return true;
    }

    private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
        if (!readHeadBlock.containsKey(userSetKey)) {
            // current head block does not include one of the set filters - not likely, but an edge case anyway
            return false;
        } else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
            // different values -> doesn't pass the filter
            return false;
        }

        return true;
    }
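
    // Hedged example of the head-block validation above: builds a hypothetical
    // SOLAR filter map (keys mirror the head tags read in readXMLSolar) and a
    // sample head block, then checks the block against the filters. All values
    // are invented for illustration only.
    private static boolean demoValidateHeadBlock() {
        HashMap<String, HashSet<String>> userSetFilter = new HashMap<>();
        userSetFilter.put("regija", new HashSet<>(Collections.singletonList("Osrednjeslovenska")));
        userSetFilter.put("leto", new HashSet<>(Arrays.asList("2009", "2010")));

        Map<String, String> headBlock = new HashMap<>();
        headBlock.put("regija", "Osrednjeslovenska");
        headBlock.put("leto", "2010");

        // passes: the block matches an allowed value for every filtered key
        return validateHeadBlock(headBlock, userSetFilter);
    }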

    /**
     * Parses XML headers for information about the corpus taxonomy (if the corpus type supports one)
     * or about the filters used by SOLAR.
     *
     * @param filepath      path to the XML file to read
     * @param corpusIsSplit is the corpus split into multiple XML files, or are all entries grouped into one large XML file
     * @param corpusType    type of the corpus being read
     */
    public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
        boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
        // solar
        Set<String> headTags = null;
        HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
        // taxonomy corpora
        HashSet<String> resultTaxonomy = new HashSet<>();

        String headTagName;

        if (corpusType == CorpusType.SOLAR) {
            headTagName = "head";
            // used for filter
            headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));

            // init results now to avoid null pointers
            headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
        } else {
            headTagName = "teiHeader";
        }

        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLEventReader xmlEventReader = null;
        try {
            xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
            boolean insideHeader = false;

            while (xmlEventReader.hasNext()) {
                XMLEvent xmlEvent = xmlEventReader.nextEvent();

                if (xmlEvent.isStartElement()) {
                    StartElement startElement = xmlEvent.asStartElement();
                    String elementName = startElement.getName().getLocalPart();

                    if (elementName.equalsIgnoreCase(headTagName)) {
                        // if the corpus is split into files, we skip bodies
                        // this toggle is true when we're inside a header (next block of code executes)
                        // and false when we're not (skip reading unnecessary attributes)
                        insideHeader = true;
                    }

                    if (insideHeader) {
                        if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
                            HashMap<String, String> atts = extractAttributes(startElement);
                            String debug = "";

                            String tax = startElement.getAttributeByName(QName.valueOf("target"))
                                    .getValue()
                                    .replace("#", "");

                            resultTaxonomy.add(tax);
                        } else if (!parseTaxonomy && headTags.contains(elementName)) {
                            String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
                            resultFilters.get(elementName).add(tagContent);
                        }
                    }
                } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                    // if the corpus is split into multiple files, each with only one header block per file
                    // that means we should stop after we reach the end of the header
                    return parseTaxonomy ? resultTaxonomy : resultFilters;
                } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                    // whole corpus in one file, so we have to continue reading in order to find all header blocks
                    insideHeader = false;
                }
            }
        } catch (XMLStreamException e) {
            logger.error("Streaming error", e);
            return parseTaxonomy ? resultTaxonomy : resultFilters;
        } catch (FileNotFoundException e) {
            logger.error("File not found", e);
            return parseTaxonomy ? resultTaxonomy : resultFilters;
            // TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
        } finally {
            if (xmlEventReader != null) {
                try {
                    xmlEventReader.close();
                } catch (XMLStreamException e) {
                    logger.error("closing stream", e);
                }
            }
        }
        return parseTaxonomy ? resultTaxonomy : resultFilters;
    }
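
    // Illustrative caller for readXmlHeaderTaxonomyAndFilters(); the returned
    // Object is either a taxonomy set or a SOLAR filter map, mirroring the
    // parseTaxonomy branches above. The method name and its parameters are
    // assumptions made for this sketch.
    @SuppressWarnings("unchecked")
    private static void logHeaderInfo(String filepath, CorpusType corpusType) {
        Object result = readXmlHeaderTaxonomyAndFilters(filepath, false, corpusType);

        if (result instanceof HashSet) {
            // taxonomy corpora return a set of taxonomy ids
            HashSet<String> taxonomy = (HashSet<String>) result;
            logger.info("taxonomy entries found: " + taxonomy.size());
        } else if (result instanceof HashMap) {
            // SOLAR returns a map of filter name -> observed values
            HashMap<String, HashSet<String>> filters = (HashMap<String, HashSet<String>>) result;
            logger.info("filter keys found: " + filters.keySet());
        }
    }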

    private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
        return event.asEndElement()
                .getName()
                .getLocalPart()
                .equalsIgnoreCase(headerTag);
    }

    @SuppressWarnings("Duplicates")
    public static boolean readXMLGigafida(String path, StatisticsNew stats) {
        boolean inWord = false;
        boolean inPunctuation = false;
        ArrayList<String> currentFiletaxonomy = new ArrayList<>();
        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
        String lemma = "";
        String msd = "";

        List<Word> sentence = new ArrayList<>();
        List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
        String sentenceDelimiter = "s";

        XMLEventReader eventReader = null;
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            eventReader = factory.createXMLEventReader(new FileInputStream(path));

            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();

                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:
                        StartElement startElement = event.asStartElement();
                        String qName = startElement.getName().getLocalPart();

                        // "word" node
                        if (qName.equals("w")) {
                            inWord = true;

                            msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                            lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                        }

                        if (qName.equals("c")) {
                            inPunctuation = true;
                        }

                        // taxonomy node
                        else if (qName.equalsIgnoreCase("catRef")) {
                            // there are some term nodes at the beginning that are of no interest to us
                            // they differ by not having the attribute "ref", so test will equal null
                            Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

                            if (tax != null) {
                                // keep only taxonomy properties
                                String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
                                currentFiletaxonomy.add(currentFiletaxonomyElement);
                                Tax taxonomy = new Tax();
                                currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
                            }
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        Characters characters = event.asCharacters();

                        // "word" node value
                        if (inWord) {
                            String word = characters.getData();
                            sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
                            inWord = false;
                        }
                        if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
                            // String punctuation = characters.getData();
                            String punctuation = ",";

                            sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
                            sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
                            sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
                            inPunctuation = false;
                        }
                        break;

                        // if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
                        //     String actualPunctuation = characters.getData();
                        //     if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("..."))
                        //         break;
                        //     String punctuation = ",";
                        //     int skip_number = 0;
                        //     if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){
                        //         skip_number = stats.getFilter().getSkipValue();
                        //     }
                        //     for(int i = 1; i < skip_number + 2; i ++){
                        //         if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) {
                        //             sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation);
                        //             sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation);
                        //             sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation);
                        //         }
                        //     }
                        //     inPunctuation = false;
                        // }

                    case XMLStreamConstants.END_ELEMENT:
                        EndElement endElement = event.asEndElement();

                        String var = endElement.getName().getLocalPart();
                        String debug = "";

                        // parser reached end of the current sentence
                        if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
                            // add sentence to corpus if it passes filters
                            sentence = runFilters(sentence, stats.getFilter());

                            if (!ValidationUtil.isEmpty(sentence)) {
                                corpus.add(new Sentence(sentence));
                            }

                            // and start a new one
                            sentence = new ArrayList<>();

                            /* Invoke Fork-Join when we reach maximum limit of
                             * sentences (because we can't read everything to
                             * memory) or we reach the end of the file.
                             */
                            if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                fj(corpus, stats);
                                // empty the current corpus, since we don't need the data anymore
                                corpus.clear();

                                // TODO: if (stats.isUseDB()) {
                                //     stats.storeTmpResultsToDB();
                                // }
                            }
                        } else if (endElement.getName().getLocalPart().equals("teiHeader")) {
                            // before proceeding to read this file, make sure that taxonomy filters are a match

                            if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
                                currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

                                if (currentFiletaxonomy.isEmpty()) {
                                    // taxonomies don't match so stop
                                    return false;
                                }
                            }
                        }

                        // fallback
                        else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
                            // join corpus and stats
                            fj(corpus, stats);
                            corpus.clear();

                            // TODO: if (stats.isUseDB()) {
                            //     stats.storeTmpResultsToDB();
                            // }
                        }

                        break;
                }
            }
        } catch (FileNotFoundException | XMLStreamException e) {
            e.printStackTrace();
        } finally {
            if (eventReader != null) {
                try {
                    eventReader.close();
                } catch (XMLStreamException e) {
                    logger.error("closing stream", e);
                }
            }
        }

        return true;
    }

    @SuppressWarnings("Duplicates")
    public static boolean readXMLGos(String path, StatisticsNew stats) {
        boolean inWord = false;
        boolean inPunctuation = false;
        boolean inOrthDiv = false;
        boolean computeForOrth = stats.getCorpus().isGosOrthMode();
        ArrayList<String> currentFiletaxonomy = new ArrayList<>();
        String lemma = "";
        String msd = "";

        List<Word> sentence = new ArrayList<>();
        List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
        String sentenceDelimiter = "seg";

        String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm

        XMLEventReader eventReader = null;

        boolean includeFile = true;

        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            eventReader = factory.createXMLEventReader(new FileInputStream(path));

            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));

                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:
                        StartElement startElement = event.asStartElement();
                        String qName = startElement.getName().getLocalPart();

                        if (qName.equals("div")) {
                            HashMap<String, String> atts = extractAttributes(startElement);

                            if (atts.containsKey("type")) {
                                inOrthDiv = atts.get("type").equals("orth");
                            }
                        }

                        // "word" node
                        if (qName.equals("w")) {
                            // check that it's not a type
                            HashMap<String, String> atts = extractAttributes(startElement);

                            if (!atts.containsKey("type")) {
                                inWord = true;

                                if (atts.containsKey("msd")) {
                                    msd = atts.get("msd");
                                }
                                if (atts.containsKey("lemma")) {
                                    lemma = atts.get("lemma");
                                }
                                //
                                // if (!inOrthDiv) {
                                //     msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                                //     lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                                // }
                            }

                            // }
                        }
                        // taxonomy node
                        else if (qName.equalsIgnoreCase("catRef")) {
                            // there are some term nodes at the beginning that are of no interest to us
                            // they differ by not having the attribute "ref", so test will equal null
                            Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

                            if (tax != null) {
                                // keep only taxonomy properties
                                currentFiletaxonomy.add(String.valueOf(tax.getValue()));
                            }
                        } else if (qName.equalsIgnoreCase("div")) {
                            gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // "word" node value
                        if (inWord) {
                            Characters characters = event.asCharacters();
                            if (gosType.equals("norm") && msd != null) {
                                sentence.add(new Word(characters.getData(), lemma, msd));
                            } else {
                                sentence.add(new Word(characters.getData()));
                            }

                            inWord = false;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        EndElement endElement = event.asEndElement();

                        // parser reached end of the current sentence
                        if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
                            // add sentence to corpus if it passes filters
                            boolean saveSentence = computeForOrth == inOrthDiv;

                            if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
                                sentence = runFilters(sentence, stats.getFilter());

                                // runFilters() can return null (e.g. when the sentence is shorter
                                // than the ngram value), so check again before adding
                                if (!ValidationUtil.isEmpty(sentence)) {
                                    corpus.add(new Sentence(sentence));
                                }
                            }

                            // and start a new one
                            sentence = new ArrayList<>();

                            /* Invoke Fork-Join when we reach maximum limit of
                             * sentences (because we can't read everything to
                             * memory) or we reach the end of the file.
                             */
                            if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                fj(corpus, stats);
                                // empty the current corpus, since we don't need
                                // the data anymore
                                corpus.clear();
                            }
                        } else if (endElement.getName().getLocalPart().equals("teiHeader")) {
                            // before proceeding to read this file, make sure that taxonomy filters are a match
                            if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
                                currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

                                // disregard this entry if taxonomies don't match
                                includeFile = !currentFiletaxonomy.isEmpty();

                                currentFiletaxonomy = new ArrayList<>();
                            }
                        }

                        // backup
                        else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
                            fj(corpus, stats);
                            corpus.clear();
                        }

                        break;
                }
            }
        } catch (FileNotFoundException | XMLStreamException e) {
            e.printStackTrace();
        } finally {
            if (eventReader != null) {
                try {
                    eventReader.close();
                } catch (XMLStreamException e) {
                    logger.error("closing stream", e);
                } catch (Exception e) {
                    logger.error("general error", e);
                }
            }
        }

        return true;
    }

    /**
     * Runs the sentence through some filters, so we don't do unnecessary calculations.
     * Filters:
     * <ol>
     * <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. a 3-gram over a single-word sentence)</li>
     * <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
     * </ol>
     *
     * @return null if the sentence fails filter 1, otherwise the sentence with some words possibly removed (filter 2)
     */
    private static List<Word> runFilters(List<Word> sentence, Filter filter) {
        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            // ngram level: if not 0, must be less than or equal to the number of words in this sentence
            if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
                return null;
            }

            // if we're calculating values for letters, omit words that are shorter than the string length
            if (filter.getNgramValue() == 0) {
                sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
                        || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
            }
        }

        return sentence;
    }
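
    // Sketch of how runFilters() is meant to be used at a sentence boundary,
    // mirroring the calls in readXMLGigafida() and readXMLGos(). The helper
    // name is an assumption; the Filter instance is obtained from
    // stats.getFilter() as elsewhere in this class.
    private static void addSentenceIfItPassesFilters(List<Word> sentence, List<Sentence> corpus, StatisticsNew stats) {
        List<Word> filtered = runFilters(sentence, stats.getFilter());

        // runFilters() may return null when the sentence is shorter than the ngram value
        if (!ValidationUtil.isEmpty(filtered)) {
            corpus.add(new Sentence(filtered));
        }
    }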

    private static HashMap<String, String> extractAttributes(StartElement se) {
        Iterator attributesIt = se.getAttributes();
        HashMap<String, String> atts = new HashMap<>();

        while (attributesIt.hasNext()) {
            Attribute a = (Attribute) attributesIt.next();
            atts.put(a.getName().getLocalPart(), a.getValue());
        }

        return atts;
    }
}