// list/src/main/java/alg/XML_processing.java
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.property.ReadOnlyDoubleProperty;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import javafx.concurrent.Task;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
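/**
* Stream-based (StAX) readers for the supported corpus formats (Gigafida, ccKres, GOS, SOLAR,
* ssj500k, Gigafida 2.0 and VERT), plus helpers for reading header taxonomies and filters.
* Sentences are buffered and periodically handed off to the Fork/Join calculators, so the whole
* corpus never has to be held in memory.
*/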
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// progress tracking functionality
private static final ReadOnlyDoubleWrapper progress = new ReadOnlyDoubleWrapper();
public static boolean isCancelled = false;
public static Date startTime = new Date();
public static boolean isCollocability = false;
public static InvalidationListener progressBarListener;
public double getProgress() {
return progressProperty().get();
}
public ReadOnlyDoubleProperty progressProperty() {
return progress;
}
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
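/**
* Dispatches parsing to the reader matching the corpus type configured in {@code stats}.
* A minimal usage sketch (the path below is illustrative):
* <pre>{@code
* // stats must be configured elsewhere with corpus type, filter and taxonomy
* boolean ok = XML_processing.readXML("/data/corpus/file01.xml", stats);
* }</pre>
*
* @return true when the file was processed, false for an unsupported corpus type or a cancelled run
*/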
public static boolean readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
return readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
return readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
return readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
return readXMLSSJ500K(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
return readVERT(path, stats);
}
// task.updateProgress(fileNum, size);
return false;
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* Used e.g. on the title tag to discern the corpus type.
* Note: only the value of the first occurrence of the tag is returned.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
/**
* Reads and returns the value of the given header attribute, or an empty string if it is not found.
* Used e.g. on the body "base" attribute to discern the corpus type of ssj500k.
* Note: only the value of the first occurrence of the tag is returned.
*/
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);
if (att.containsKey("base")) {
return att.get("base").substring(0, att.get("base").length() - 12);
}
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
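/**
* Hands the buffered sentences off to a Fork/Join computation, chosen by the filter's
* analysis level (n-gram counting at string level, word-level statistics otherwise).
*/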
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
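/**
* Reads a SOLAR corpus file: words are collected into sentences, head blocks are matched
* against the user-set solar filters, and buffered sentences are flushed to the Fork/Join
* calculators whenever {@code Settings.CORPUS_SENTENCE_LIMIT} is reached.
*
* @return false if the run was cancelled, true otherwise
*/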
@SuppressWarnings("unused")
public static boolean readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
boolean inPunctuation = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
// Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Set<String> headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
// System.out.println(String.format("%s", startElement.toString()));
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3") || qName.equals("w1") || qName.equals("w")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3") || qName.equals("c1") || qName.equals("c")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if (stats.getFilter().getNotePunctuations() &&
stavek.size() > 0) {
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
} else if ((qName.equals("st1") && startElement.getAttributeByName(QName.valueOf("tip")).getValue().equals("0")) || qName.equals("s")) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
if(includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
stavek = new ArrayList<>();
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
} else { // possibly one of the localized head filter tags
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(qName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(headTag, tagContent);
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
// resultFilters.get(headTag).add(tagContent);
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
stavek = new ArrayList<>();
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
return true;
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the read head block satisfies all user-set filters, false otherwise
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> valueObject = filterEntry.getValue();
// if (valueObject instanceof String) {
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
// } else
if (valueObject != null) {
//noinspection unchecked
for (String value : valueObject) {
pass = validateHeadBlockEntry(readHeadBlock, key, value);
if (pass){
break;
}
}
}
if (!pass) {
// the current head block failed one of the set filters
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
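/**
* @return true if the read head block contains the user-set key with exactly the user-set value
*/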
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.containsKey(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses VERT file headers for taxonomy information (medium, type and proofread attributes).
*
* @param filepath path to the .vert file
* @param corpusIsSplit is corpus split into multiple files, or are all entries grouped into one large file (currently unused)
* @param corpusType (currently unused)
*/
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
LineIterator it = null;
try {
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
try {
boolean insideHeader = false;
while (it.hasNext()) {
String line = it.nextLine();
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
// split over "\" "
String[] split = line.split("\" ");
// String mediumId = "";
// String typeId = "";
// String proofreadId = "";
boolean idsPresent = false;
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium_id")) {
// mediumId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("type_id")) {
// typeId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("proofread_id")) {
// proofreadId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
}
}
if (!idsPresent){
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium")) {
// mediumId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("type")) {
// typeId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("proofread")) {
// proofreadId = attribute[1];
resultTaxonomy.add(attribute[1]);
}
}
}
}
}
} finally {
LineIterator.closeQuietly(it);
}
} catch (IOException e) {
e.printStackTrace();
}
resultTaxonomy.remove("-");
return resultTaxonomy;
}
/**
* Parses XML headers for information about the corpus taxonomy (if supported) or filters (solar).
*
* @param filepath path to the corpus file
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
* @param corpusType type of the corpus, which determines which header tag marks a header block
*
* @return a set of taxonomy ids for taxonomy corpora, or a map of solar filter values otherwise
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
if (tax.indexOf(':') >= 0) {
tax = tax.split(":")[1];
}
resultTaxonomy.add(tax);
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
// solar
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
} else if (!parseTaxonomy) {
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(elementName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(headTag).add(tagContent);
}
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
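/** @return true if the given end element closes the header block named by {@code headerTag} */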
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
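/**
* Reads a Gigafida or ccKres corpus file. Taxonomy information from the TEI header decides
* whether the file's sentences are included, according to the selected set operation
* (union or intersection of taxonomies).
*
* @return true on success (a streaming error is rethrown as a RuntimeException)
*/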
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<Taxonomy> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
if (qName.equals("c")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// some term nodes at the beginning are of no interest to us
// they differ by not having the "target" attribute, so tax will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
if (stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
// String punctuation = ",";
//
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
// inPunctuation = false;
}
break;
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
// String actualPunctuation = characters.getData();
// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("..."))
// break;
// String punctuation = ",";
// int skip_number = 0;
// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){
// skip_number = stats.getFilter().getSkipValue();
// }
// for(int i = 1; i < skip_number + 2; i ++){
// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) {
// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation);
// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation);
// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation);
// }
// }
// inPunctuation = false;
// }
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// taxonomyMatch = true;
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
throw new RuntimeException("XMLStreamException | FileNotFoundException", e);
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
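/**
* Reads an ssj500k or Gigafida 2.0 corpus file. A first pass counts XML events so that
* progress can be reported; the second pass collects sentences, taking MSD values from the
* "ana" attribute (expected to carry an "msd:" or "mte:" prefix).
*
* @return false if the run was cancelled, true otherwise
*/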
@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
if (!(String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:") ||
String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("mte:"))){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
else if (qName.equals("pc")){
inPunctuation = true;
}
// taxonomy node
else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("term")) {
// some term nodes at the beginning are of no interest to us
// they differ by not having the "ref" attribute, so tax will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
// get value from attribute target
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null && !tax.getValue().equals("dedup:nodup")) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).split(":")[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
// if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
// HashMap<String, String> atts = extractAttributes(startElement);
// String debug = "";
//
// String tax = startElement.getAttributeByName(QName.valueOf("target"))
// .getValue()
// .replace("#", "");
//
// if (tax.indexOf(':') >= 0) {
// tax = tax.split(":")[1];
// }
// resultTaxonomy.add(tax);
// } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
// String tax = startElement.getAttributeByName(QName.valueOf("ref"))
// .getValue()
// .replace("#", "");
//
// resultTaxonomy.add(tax);
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
// resultFilters.get(elementName).add(tagContent);
// }
} else if (qName.equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
taxonomyMatch = true;
} else if (qName.equals("text")){
taxonomyMatch = true;
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
// if (word.equals("Banovec")){
// System.out.println("Test");
// }
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNotePunctuations() && inPunctuation) {
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
} else if (endElement.getName().getLocalPart().equals("text")){
taxonomyMatch = false;
}
break;
}
}
if (corpus.size() > 0) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
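/**
* Reads a GOS corpus file. The orthographic ("orth") division is read first and cached in
* {@code GOSCorpusHM}; the normalized ("norm") division is then merged into the cached words
* (lemma, MSD and normalized form) before sentences are handed to the calculators.
*
* @return false if the run was cancelled, true otherwise
*/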
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
String GOSCorpusHMKey = "";
String sentenceDelimiter = "seg";
int wordIndex = 0;
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
// created hashmap to combine words with normalized words
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// only <w> elements without a "type" attribute are regular words
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
//
// if (!inOrthDiv) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
inSeparatedWord = true;
}
// }
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
} else if (qName.equalsIgnoreCase("seg")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("id")) {
if (inOrthDiv) {
GOSCorpusHMKey = atts.get("id") + ".norm";
} else {
GOSCorpusHMKey = atts.get("id");
}
} else {
System.out.println("No attribute \"id\"");
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
// System.out.println(wordIndex);
// }
// if the algorithm is in the orthographic part, add a new word to the sentence
if (inOrthDiv){
Characters characters = event.asCharacters();
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
// if the algorithm is in the normalized part, find the orthographic word and attach the extra info to it
} else {
Characters characters = event.asCharacters();
// System.out.println(wordIndex);
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
currentWord.setMsd(msd, stats.getFilter().getWordParts());
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
wordIndex += 1;
// when one orthographic word is split into several normalized words, duplicate the entry
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
"", "", "", stats.getFilter()));
}
} //else {
// System.out.println("Error");
// }
}
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
if (endElement.getName().getLocalPart().equals("w")) {
if (inWord){
inWord = false;
} else if(inSeparatedWord) {
// when there are no separated words left, delete the last additional duplicate
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
inSeparatedWord = false;
}
}
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (inOrthDiv){
// add sentence to corpus
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
} else {
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
sentence = runFilters(sentence, stats.getFilter());
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
wordIndex = 0;
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
// start a new sentence
sentence = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
// if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
// currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
//
// // disregard this entry if taxonomies don't match
// includeFile = !currentFiletaxonomy.isEmpty();
//
//// currentFiletaxonomy = new ArrayList<>();
// }
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
includeFile = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
includeFile = false;
} else {
includeFile = true;
}
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
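/**
* Reads a VERT (vertical) corpus file. The accompanying "regi" registry file is consulted
* first to find the word, lempos and tag column indices and the corpus language; sentences
* are then read line by line between {@code <s>} and {@code </s>} markers.
*
* @return false if the run was cancelled, true otherwise
*/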
@SuppressWarnings("Duplicates")
public static boolean readVERT(String path, StatisticsNew stats) {
// taxonomy corpora
// HashSet<String> resultTaxonomy = new HashSet<>();
// derive the path of the accompanying "regi" registry file that describes the VERT columns
String regiPath = path.substring(0, path.length() - 4) + "regi";
LineIterator regiIt;
int wordIndex = -1;
int lemmaIndex = -1;
int msdIndex = -1;
boolean slovene = false;
try {
// read regi file
regiIt = FileUtils.lineIterator(new File(regiPath), "UTF-8");
try {
boolean insideHeader = false;
int attributeIndex = 0;
while (regiIt.hasNext()) {
String line = regiIt.nextLine();
if (line.length() >= 9 && line.substring(0, 9).equals("ATTRIBUTE")) {
// split over spaces
String[] split = line.split(" ");
if (split[1].equals("word") && wordIndex == -1){
wordIndex = attributeIndex;
} else if (split[1].equals("lempos") && lemmaIndex == -1){
lemmaIndex = attributeIndex;
} else if (split[1].equals("tag") && msdIndex == -1){
msdIndex = attributeIndex;
}
attributeIndex ++;
if (wordIndex >= 0 && lemmaIndex >= 0 && msdIndex >= 0){
break;
}
} else if (line.length() >= 8 && line.substring(0, 8).equals("LANGUAGE")) {
String[] split = line.split(" ");
if (split[1].equals("\"Slovenian\"")){
slovene = true;
}
}
}
} finally {
LineIterator.closeQuietly(regiIt);
}
} catch (IOException e) {
throw new RuntimeException("IOException", e);
}
int numLines = 0;
// get number of lines
try (FileReader input = new FileReader(path);
LineNumberReader count = new LineNumberReader(input)
)
{
while (count.skip(Long.MAX_VALUE) > 0)
{
// Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file
}
numLines = count.getLineNumber() + 1; // +1 because line index starts at 0
} catch (IOException e) {
e.printStackTrace();
}
LineIterator it;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
boolean inParagraph = false;
boolean inSentence = false;
boolean taxonomyMatch = true;
int lineNum = 0;
int numSentences = 0;
int numSentencesLimit = 1000;
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
try {
it = FileUtils.lineIterator(new File(path), "UTF-8");
try {
boolean insideHeader = false;
while (it.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
String line = it.nextLine();
// beginning tags
// taxonomy
if (stats.getCorpus().getTaxonomy().size() > 0 && line.length() > 4 && line.substring(1, 5).equals("text")) {
String[] split = line.split("\" ");
currentFiletaxonomy = new ArrayList<>();
boolean medium = false;
boolean type = false;
boolean proofread = false;
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
medium = true;
} else if (attribute[0].equals("type_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
type = true;
} else if (attribute[0].equals("proofread_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
proofread = true;
}
if (attribute[0].equals("medium") && !attribute[1].equals("-") && !medium) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
} else if (attribute[0].equals("type") && !attribute[1].equals("-") && !type) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
} else if (attribute[0].equals("proofread") && !attribute[1].equals("-") && !attribute[1].equals("-\">") && !proofread) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
taxonomyMatch = true;
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
}
// else if((line.length() >= 3 && line.substring(0, 2).equals("<p") && line.substring(line.length() - 1, line.length()).equals(">")) ||
// (line.length() >= 3 && line.substring(0, 3).equals("<ab") && line.substring(line.length() - 1, line.length()).equals(">"))){
// inParagraph = true;
// } else if((line.length() == 4 && line.equals("</p>")) || (line.length() == 5 && line.equals("</ab>"))){
// inParagraph = false;
// }
else if(line.length() >= 3 && line.substring(0, 2).equals("<s") && line.substring(line.length() - 1, line.length()).equals(">")){
inSentence = true;
} else if(line.length() == 4 && line.equals("</s>")){
inSentence = false;
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
if (numSentences == numSentencesLimit) {
fj(corpus, stats);
corpus.clear();
numSentences = 0;
} else {
numSentences ++;
}
// and start a new one
sentence = new ArrayList<>();
// corpus.add(new Sentence(sentence, currentFiletaxonomy));
} else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence){
// } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence && inParagraph){
String[] split = line.split("\t");
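// lempos values carry a trailing "-<pos>" suffix that is stripped to obtain the bare lemma;
// "-u" (Slovene) and "-z" (non-Slovene) suffixes and non-alphabetic tokens are treated as
// punctuation-like and are only kept when punctuation counting is enabled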
if(slovene) {
if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) &&
!split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u")) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) {
Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u") ||
stats.getFilter().getNotePunctuations()) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
}
} else {
if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) &&
!split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z")) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) {
Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z") ||
stats.getFilter().getNotePunctuations()) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
}
}
}
}
if (corpus.size() > 0) {
fj(corpus, stats);
corpus.clear();
}
} finally {
LineIterator.closeQuietly(it);
}
} catch (IOException e) {
e.printStackTrace();
}
// resultTaxonomy.remove("-");
return true;
}
/**
* Runs the sentence through some filters, so that unnecessary calculations are avoided.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return Empty sentence (if fails 1.) or a sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
return new ArrayList<>();
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
}
}
return sentence;
}
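/** Collects all attributes of the given start element into a name -> value map. */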
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
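/**
* Builds a {@code Word} holding only the parts requested by the filter (word, lemma, MSD,
* normalized word), using the Word1..Word4 implementations to keep memory usage low.
* Returns null when the filter requests no word parts.
* An illustrative call (all argument values below are made up; the filter is assumed to come from elsewhere):
* <pre>{@code
* Word w = createWord("hiše", "hiša", "Sozer", "hiše", stats.getFilter());
* }</pre>
*/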
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);
if (f.getWordParts().contains(CalculateFor.LEMMA))
wString.add(lemma);
if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
wString.add(msd);
if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
wString.add(normalizedWord);
// find appropriate strings and put them in word
Word w;
switch (f.getWordParts().size()) {
case 1:
w = new Word1(wString.get(0));
break;
case 2:
w = new Word2(wString.get(0), wString.get(1));
break;
case 3:
w = new Word3(wString.get(0), wString.get(1), wString.get(2));
break;
case 4:
w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
break;
default:
w = null;
}
return w;
}
}