package alg; import static data.Enums.solar.SolarFilters.*; import java.io.*; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ForkJoinPool; import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.*; import gui.I18N; import javafx.beans.InvalidationListener; import javafx.beans.property.ReadOnlyDoubleProperty; import javafx.beans.property.ReadOnlyDoubleWrapper; import javafx.concurrent.Task; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.logging.log4j.LogManager; import data.*; import gui.ValidationUtil; public class XML_processing { public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class); // progress tracking functionality private static final ReadOnlyDoubleWrapper progress = new ReadOnlyDoubleWrapper(); public static boolean isCancelled = false; public static Date startTime = new Date(); public static boolean isCollocability = false; public static InvalidationListener progressBarListener; public double getProgress() { return progressProperty().get(); } public ReadOnlyDoubleProperty progressProperty() { return progress ; } // public static void processCorpus(Statistics stats) { // // we can preset the list's size, so there won't be a need to resize it // List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // // int i = 0; // for (File f : Settings.corpus) { // i++; // readXML(f.toString(), stats); // } // } // public static void readXML(String path, Statistics stats) { // if (stats.getCorpusType() == CorpusType.GIGAFIDA) { // readXMLGigafida(path, stats); // } else if (stats.getCorpusType() == CorpusType.GOS) { // readXMLGos(path, stats); // } else if (stats.getCorpusType() == CorpusType.SOLAR) { // readXMLSolar(path, stats); // } // } public static boolean readXML(String path, StatisticsNew stats) { if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA || stats.getCorpus().getCorpusType() == CorpusType.CCKRES) { return readXMLGigafida(path, stats); } else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) { return readXMLGos(path, stats); } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) { return readXMLSolar(path, stats); } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K || stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) { return readXMLSSJ500K(path, stats); } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) { return readVERT(path, stats); } // task.updateProgress(fileNum, size); return false; } /** * Reads and returns the value of a passed header tag or an empty string. * E.g. title tag, for discerning the corpus' type. * Notice: returns only the value of the first occurrence of a given tag name. 
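 *
 * <p>Hypothetical usage sketch (the file path and the {@code title} tag below are illustrations only,
 * not values taken from this project's configuration):
 * <pre>{@code
 * String title = XML_processing.readXMLHeaderTag("/path/to/corpus-file.xml", "title");
 * }</pre>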
*/ public static String readXMLHeaderTag(String path, String tag) { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = null; try { eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { XMLEvent xmlEvent = eventReader.nextEvent(); if (xmlEvent.isStartElement()) { StartElement startElement = xmlEvent.asStartElement(); String var = startElement.getName().getLocalPart(); if (var.equalsIgnoreCase(tag)) { return eventReader.nextEvent().asCharacters().getData(); } } } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } finally { if (eventReader != null) { try { eventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } } } return ""; } /** * Reads and returns the value of a passed header attribute or an empty string. * E.g. body base attribute, for discerning the corpus' type of ssj500k. * Notice: returns only the value of the first occurrence of a given tag name. */ public static String readXMLHeaderAttribute(String path, String tag, String attribute) { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = null; try { eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { XMLEvent xmlEvent = eventReader.nextEvent(); if (xmlEvent.isStartElement()) { StartElement startElement = xmlEvent.asStartElement(); String var = startElement.getName().getLocalPart(); if (var.equalsIgnoreCase(tag)) { HashMap att = extractAttributes(startElement); if (att.containsKey("base")) { return att.get("base").substring(0, att.get("base").length() - 12); } return eventReader.nextEvent().asCharacters().getData(); } } } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } finally { if (eventReader != null) { try { eventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } } } return ""; } private static void fj(List corpus, StatisticsNew stats) { ForkJoinPool pool = new ForkJoinPool(); if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) { alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats); pool.invoke(wc); } else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) { alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats); pool.invoke(wc); } else { // TODO: // alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats); // pool.invoke(wc); } } // public static void readXMLGos(String path, Statistics stats) { // boolean in_word = false; // String taksonomija = ""; // String lemma = ""; // String msd = ""; // String type = stats.isGosOrthMode() ? 
"orth" : "norm"; // orth & norm // // List stavek = new ArrayList<>(); // List corpus = new ArrayList<>(); // String sentenceDelimiter = "seg"; // String taxonomyPrefix = "gos."; // // try { // XMLInputFactory factory = XMLInputFactory.newInstance(); // XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); // // while (eventReader.hasNext()) { // XMLEvent event = eventReader.nextEvent(); // // switch (event.getEventType()) { // case XMLStreamConstants.START_ELEMENT: // // StartElement startElement = event.asStartElement(); // String qName = startElement.getName().getLocalPart(); // // // "word" node // if (qName.equals("w")) { // in_word = true; // // if (type.equals("norm")) { // // make sure we're looking at and not // Iterator var = startElement.getAttributes(); // ArrayList attributes = new ArrayList<>(); // while (var.hasNext()) { // attributes.add(var.next()); // } // // if (attributes.contains("msd")) { // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); // } else { // msd = null; // } // // if (attributes.contains("lemma")) { // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); // } // } // } // // taxonomy node // else if (qName.equalsIgnoreCase("catRef")) { // // there are some term nodes at the beginning that are of no interest to us // // they differ by not having the attribute "ref", so test will equal null // Attribute test = startElement.getAttributeByName(QName.valueOf("target")); // // if (test != null) { // // keep only taxonomy properties // taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, ""); // } // } else if (qName.equalsIgnoreCase("div")) { // type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); // // } // break; // // case XMLStreamConstants.CHARACTERS: // Characters characters = event.asCharacters(); // // // "word" node value // if (in_word) { // if (type.equals("norm") && msd != null) { // stavek.add(new Word(characters.getData(), lemma, msd)); // } else { // stavek.add(new Word(characters.getData())); // } // // in_word = false; // } // break; // // case XMLStreamConstants.END_ELEMENT: // EndElement endElement = event.asEndElement(); // // // parser reached end of the current sentence // if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { // // add sentence to corpus // corpus.add(new Sentence(stavek, taksonomija, type)); // // and start a new one // stavek = new ArrayList<>(); // // /* Invoke Fork-Join when we reach maximum limit of // * sentences (because we can't read everything to // * memory) or we reach the end of the file. 
// */ // if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { // fj(corpus, stats); // // empty the current corpus, since we don't need // // the data anymore // corpus.clear(); // } // } // // // backup // if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { // fj(corpus, stats); // corpus.clear(); // } // // break; // } // } // } catch (FileNotFoundException | XMLStreamException e) { // e.printStackTrace(); // } // } @SuppressWarnings("unused") public static boolean readXMLSolar(String path, StatisticsNew stats) { boolean in_word = false; boolean inPunctuation = false; String lemma = ""; String msd = ""; List stavek = new ArrayList<>(); List corpus = new ArrayList<>(); // used for filter // Set headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto")); Set headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO)); Map headBlock = null; boolean includeThisBlock = false; int numLines = 0; int lineNum = 0; progress.set(0.0); if(!isCollocability) { startTime = new Date(); } // get number of lines try { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { eventReader.next(); numLines ++; // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file } } catch (IOException e) { e.printStackTrace(); } catch (XMLStreamException e) { e.printStackTrace(); } try { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { int percentage = (int) (lineNum * 100.0 / numLines); if(progress.get() < percentage) { progress.set(percentage); } if(isCancelled) { return false; } lineNum ++; XMLEvent event = eventReader.nextEvent(); switch (event.getEventType()) { case XMLStreamConstants.START_ELEMENT: StartElement startElement = event.asStartElement(); // System.out.println(String.format("%s", startElement.toString())); String qName = startElement.getName().getLocalPart(); // "word" node if (qName.equals("w3") || qName.equals("w1") || qName.equals("w")) { in_word = true; msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); } else if (qName.equals("c3") || qName.equals("c1") || qName.equals("c")) { String c3Content = eventReader.nextEvent().asCharacters().getData(); if (stats.getFilter().getNotePunctuations() && stavek.size() > 0) { stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter())); } } else if ((qName.equals("st1") && startElement.getAttributeByName(QName.valueOf("tip")).getValue().equals("0")) || qName.equals("s")) { if (stats.getFilter().getNgramValue() == 0){ int numSentenceParts = 0; for(Word w : stavek){ int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; } stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>()); } else if(stats.getFilter().getNgramValue() >= 1) { stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>()); } if(includeThisBlock) { // add sentence to corpus corpus.add(new Sentence(stavek, null)); // and start a new one /* Invoke Fork-Join when we reach maximum limit of * sentences (because we can't read everything to * memory) or we reach the end of the file. 
*/ if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { fj(corpus, stats); // empty the current corpus, since we don't need // the data anymore corpus.clear(); } } stavek = new ArrayList<>(); } else if (qName.equals("head")) { headBlock = new HashMap<>(); } else { // if (headTags.contains(qName)) { boolean inHeadTags = false; String headTag = ""; for (String tag : headTags){ if(I18N.getDefaultLocaleItem(tag).equals(qName)){ inHeadTags = true; headTag = tag; break; } } if(inHeadTags) { String tagContent = eventReader.nextEvent().asCharacters().getData(); headBlock.put(headTag, tagContent); // String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); // resultFilters.get(headTag).add(tagContent); } } break; case XMLStreamConstants.CHARACTERS: Characters characters = event.asCharacters(); // "word" node value if (in_word) { stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter())); in_word = false; } break; case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); String qNameEnd = endElement.getName().getLocalPart(); if (qNameEnd.equals("head")) { // validate and set boolean if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) { includeThisBlock = true; } } else if (qNameEnd.equals("body")) { // new block, reset filter status includeThisBlock = false; stavek = new ArrayList<>(); } // backup if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) { fj(corpus, stats); corpus.clear(); } break; } } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } return true; } /** * @param readHeadBlock block of tags read from the corpus * @param userSetFilter tags with values set by the user * * @return */ private static boolean validateHeadBlock(Map readHeadBlock, HashMap> userSetFilter) { boolean pass = true; if (userSetFilter == null) { return true; } for (Map.Entry> filterEntry : userSetFilter.entrySet()) { String key = filterEntry.getKey(); HashSet valueObject = filterEntry.getValue(); // if (valueObject instanceof String) { // pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject); // } else if (valueObject != null) { //noinspection unchecked for (String value : valueObject) { pass = validateHeadBlockEntry(readHeadBlock, key, value); if (pass){ break; } } } if (!pass) { // current head block does not include one of the set filters - not likely, but an edge case anyway return false; } } // if it gets to this point, it passed all the filters return true; } private static boolean validateHeadBlockEntry(Map readHeadBlock, String userSetKey, String userSetValue) { if (!readHeadBlock.keySet().contains(userSetKey)) { // current head block does not include one of the set filters - not likely, but an edge case anyway return false; } else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) { // different values -> doesn't pass the filter return false; } return true; } /** * Parses XML headers for information about its taxonomy (if supported) or filters (solar) * * @param filepath * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file * @param corpusType */ public static HashSet readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { // taxonomy corpora HashSet resultTaxonomy = new HashSet<>(); LineIterator it = null; try { it = FileUtils.lineIterator(new File(filepath), "UTF-8"); try { boolean insideHeader = false; while (it.hasNext()) { String 
line = it.nextLine(); if (line.length() > 4 && line.substring(1, 5).equals("text")) { // split over "\" " String[] split = line.split("\" "); // String mediumId = ""; // String typeId = ""; // String proofreadId = ""; boolean idsPresent = false; for (String el : split) { String[] attribute = el.split("=\""); if (attribute[0].equals("medium_id")) { // mediumId = attribute[1]; idsPresent = true; resultTaxonomy.add(attribute[1]); } else if (attribute[0].equals("type_id")) { // typeId = attribute[1]; idsPresent = true; resultTaxonomy.add(attribute[1]); } else if (attribute[0].equals("proofread_id")) { // proofreadId = attribute[1]; idsPresent = true; resultTaxonomy.add(attribute[1]); } } if (!idsPresent){ for (String el : split) { String[] attribute = el.split("=\""); if (attribute[0].equals("medium")) { // mediumId = attribute[1]; resultTaxonomy.add(attribute[1]); } else if (attribute[0].equals("type")) { // typeId = attribute[1]; resultTaxonomy.add(attribute[1]); } else if (attribute[0].equals("proofread")) { // proofreadId = attribute[1]; resultTaxonomy.add(attribute[1]); } } } } } } finally { LineIterator.closeQuietly(it); } } catch (IOException e) { e.printStackTrace(); } resultTaxonomy.remove("-"); return resultTaxonomy; } /** * Parses XML headers for information about its taxonomy (if supported) or filters (solar) * * @param filepath * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file * @param corpusType */ public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType); // solar Set headTags = null; HashMap> resultFilters = new HashMap<>(); // taxonomy corpora HashSet resultTaxonomy = new HashSet<>(); String headTagName; if (corpusType == CorpusType.SOLAR) { headTagName = "head"; // used for filter headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO)); // init results now to avoid null pointers headTags.forEach(f -> resultFilters.put(f, new HashSet<>())); } else if (corpusType == CorpusType.SSJ500K) { headTagName = "bibl"; } else { headTagName = "teiHeader"; } XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = null; try { xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath)); boolean insideHeader = false; while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (xmlEvent.isStartElement()) { StartElement startElement = xmlEvent.asStartElement(); String elementName = startElement.getName().getLocalPart(); if (elementName.equalsIgnoreCase(headTagName)) { // if the corpus is split into files, we skip bodies // this toggle is true when we're inside a header (next block of code executes) // and false when we're not (skip reading unnecessary attributes) insideHeader = true; } if (insideHeader) { if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { HashMap atts = extractAttributes(startElement); String debug = ""; String tax = startElement.getAttributeByName(QName.valueOf("target")) .getValue() .replace("#", ""); if (tax.indexOf(':') >= 0) { tax = tax.split(":")[1]; } resultTaxonomy.add(tax); } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) { String tax = startElement.getAttributeByName(QName.valueOf("ref")) .getValue() .replace("#", ""); resultTaxonomy.add(tax); // solar // } else if (!parseTaxonomy && headTags.contains(elementName)) { } else if 
(!parseTaxonomy) { boolean inHeadTags = false; String headTag = ""; for (String tag : headTags){ if(I18N.getDefaultLocaleItem(tag).equals(elementName)){ inHeadTags = true; headTag = tag; break; } } if(inHeadTags) { String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); resultFilters.get(headTag).add(tagContent); } } } } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { // if the corpus is split into multiple files, each with only one header block per file // that means we should stop after we reach the end of the header return parseTaxonomy ? resultTaxonomy : resultFilters; } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { // whole corpus in one file, so we have to continue reading in order to find all header blocks insideHeader = false; } } } catch (XMLStreamException e) { logger.error("Streaming error", e); return parseTaxonomy ? resultTaxonomy : resultFilters; } catch (FileNotFoundException e) { logger.error("File not found", e); return parseTaxonomy ? resultTaxonomy : resultFilters; // TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user } finally { if (xmlEventReader != null) { try { xmlEventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } } } return parseTaxonomy ? resultTaxonomy : resultFilters; } private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) { return event.asEndElement() .getName() .getLocalPart() .equalsIgnoreCase(headerTag); } @SuppressWarnings("Duplicates") public static boolean readXMLGigafida(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; boolean taxonomyMatch = true; ArrayList currentFiletaxonomy = new ArrayList<>(); // ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; String msd = ""; List sentence = new ArrayList<>(); List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it String sentenceDelimiter = "s"; XMLEventReader eventReader = null; try { XMLInputFactory factory = XMLInputFactory.newInstance(); eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); switch (event.getEventType()) { case XMLStreamConstants.START_ELEMENT: StartElement startElement = event.asStartElement(); String qName = startElement.getName().getLocalPart(); // "word" node if (qName.equals("w")) { inWord = true; msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); } if (qName.equals("c")){ inPunctuation = true; } // taxonomy node else if (qName.equalsIgnoreCase("catRef")) { // there are some term nodes at the beginning that are of no interest to us // they differ by not having the attribute "ref", so test will equal null Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); if (tax != null) { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } } break; case XMLStreamConstants.CHARACTERS: Characters characters = 
event.asCharacters(); // "word" node value if (inWord) { String word = characters.getData(); sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); inWord = false; } // if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { if (stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { String punctuation = characters.getData(); sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); inPunctuation = false; // String punctuation = ","; // // sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation); // sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation); // sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation); // inPunctuation = false; } break; // if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { // String actualPunctuation = characters.getData(); // if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("...")) // break; // String punctuation = ","; // int skip_number = 0; // if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){ // skip_number = stats.getFilter().getSkipValue(); // } // for(int i = 1; i < skip_number + 2; i ++){ // if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) { // sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation); // sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation); // sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation); // } // } // inPunctuation = false; // } case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); String var = endElement.getName().getLocalPart(); String debug = ""; // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { // count all UniGramOccurrences in sentence for statistics if (stats.getFilter().getNgramValue() == 0){ int numSentenceParts = 0; for(Word w : sentence){ int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; } stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); } else if(stats.getFilter().getNgramValue() >= 1) { stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); } // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { corpus.add(new Sentence(sentence, currentFiletaxonomy)); } // taxonomyMatch = true; // and start a new one sentence = new ArrayList<>(); /* Invoke Fork-Join when we reach maximum limit of * sentences (because we can't read everything to * memory) or we reach the end of the file. 
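 * (The ForkJoin pass in fj() aggregates its counts into the shared StatisticsNew object,
 * so the sentence list can be cleared right afterwards and memory stays bounded by
 * Settings.CORPUS_SENTENCE_LIMIT.)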
*/ if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { fj(corpus, stats); // empty the current corpus, since we don't need the data anymore corpus.clear(); // TODO: if (stats.isUseDB()) { // stats.storeTmpResultsToDB(); // } } } else if (endElement.getName().getLocalPart().equals("teiHeader")) { // before proceeding to read this file, make sure that taxonomy filters are a match if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) { // taxonomies don't match so stop // union (select words that match any of selected taxonomy // return false; taxonomyMatch = false; // } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){ // intersection (select only words that precisely match selected taxonomy taxonomyMatch = false; } } } // fallback else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { // join corpus and stats fj(corpus, stats); corpus.clear(); // TODO: if (stats.isUseDB()) { // stats.storeTmpResultsToDB(); // } } break; } } } catch (FileNotFoundException | XMLStreamException e) { throw new java.lang.RuntimeException("XMLStreamException | FileNotFoundException"); // e.printStackTrace(); } finally { if (eventReader != null) { try { eventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } } } return true; } @SuppressWarnings("Duplicates") public static boolean readXMLSSJ500K(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; boolean taxonomyMatch = true; ArrayList currentFiletaxonomy = new ArrayList<>(); // ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; String msd = ""; List sentence = new ArrayList<>(); List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it String sentenceDelimiter = "s"; int numLines = 0; int lineNum = 0; progress.set(0.0); if(!isCollocability) { startTime = new Date(); } // get number of lines try { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { eventReader.next(); numLines ++; // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file } } catch (IOException e) { e.printStackTrace(); } catch (XMLStreamException e) { e.printStackTrace(); } XMLEventReader eventReader = null; try { XMLInputFactory factory = XMLInputFactory.newInstance(); eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { int percentage = (int) (lineNum * 100.0 / numLines); if(progress.get() < percentage) { progress.set(percentage); } if(isCancelled) { return false; } lineNum ++; XMLEvent event = eventReader.nextEvent(); switch (event.getEventType()) { case XMLStreamConstants.START_ELEMENT: StartElement startElement = event.asStartElement(); String qName = startElement.getName().getLocalPart(); // "word" node if (qName.equals("w")) { inWord = true; if (!(String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:") || 
String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("mte:"))){ System.out.println("MSD written incorrectly"); } msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4); lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); } else if (qName.equals("pc")){ inPunctuation = true; } // taxonomy node else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("term")) { // there are some term nodes at the beginning that are of no interest to us // they differ by not having the attribute "ref", so test will equal null Attribute tax = startElement.getAttributeByName(QName.valueOf("ref")); if (tax != null) { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); // Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } } else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) { // get value from attribute target Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); if (tax != null && !tax.getValue().equals("dedup:nodup")) { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).split(":")[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); // Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } // if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { // HashMap atts = extractAttributes(startElement); // String debug = ""; // // String tax = startElement.getAttributeByName(QName.valueOf("target")) // .getValue() // .replace("#", ""); // // if (tax.indexOf(':') >= 0) { // tax = tax.split(":")[1]; // } // resultTaxonomy.add(tax); // } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) { // String tax = startElement.getAttributeByName(QName.valueOf("ref")) // .getValue() // .replace("#", ""); // // resultTaxonomy.add(tax); // } else if (!parseTaxonomy && headTags.contains(elementName)) { // String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); // resultFilters.get(elementName).add(tagContent); // } } else if (qName.equals("bibl")) { // before proceeding to read this file, make sure that taxonomy filters are a match taxonomyMatch = true; } else if (qName.equals("text")){ taxonomyMatch = true; } break; case XMLStreamConstants.CHARACTERS: Characters characters = event.asCharacters(); // "word" node value if (inWord) { String word = characters.getData(); // if (word.equals("Banovec")){ // System.out.println("Test"); // } sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); inWord = false; } if (stats.getFilter().getNotePunctuations() && inPunctuation) { // if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { String punctuation = characters.getData(); sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); inPunctuation = false; } break; case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); String var = endElement.getName().getLocalPart(); String debug = ""; // parser reached end of the current sentence if 
(endElement.getName().getLocalPart().equals(sentenceDelimiter)) { if (stats.getFilter().getNgramValue() == 0){ int numSentenceParts = 0; for(Word w : sentence){ int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; } stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); } else if(stats.getFilter().getNgramValue() >= 1) { stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); } // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { corpus.add(new Sentence(sentence, currentFiletaxonomy)); } // and start a new one sentence = new ArrayList<>(); /* Invoke Fork-Join when we reach maximum limit of * sentences (because we can't read everything to * memory) or we reach the end of the file. */ if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { fj(corpus, stats); // empty the current corpus, since we don't need the data anymore corpus.clear(); // TODO: if (stats.isUseDB()) { // stats.storeTmpResultsToDB(); // } } } // fallback else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") && stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) { // join corpus and stats fj(corpus, stats); corpus.clear(); currentFiletaxonomy = new ArrayList<>(); // currentFiletaxonomyLong = new ArrayList<>(); } else if (endElement.getName().getLocalPart().equals("bibl")) { // before proceeding to read this file, make sure that taxonomy filters are a match if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) { // taxonomies don't match so stop // union (select words that match any of selected taxonomy // return false; taxonomyMatch = false; // } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){ // intersection (select only words that precisely match selected taxonomy taxonomyMatch = false; } } } else if (endElement.getName().getLocalPart().equals("text")){ taxonomyMatch = false; } break; } } if (corpus.size() > 0) { fj(corpus, stats); // empty the current corpus, since we don't need the data anymore corpus.clear(); // TODO: if (stats.isUseDB()) { // stats.storeTmpResultsToDB(); // } } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } finally { if (eventReader != null) { try { eventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } } } return true; } @SuppressWarnings("Duplicates") public static boolean readXMLGos(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; boolean inOrthDiv = false; boolean computeForOrth = stats.getCorpus().isGosOrthMode(); boolean inSeparatedWord = false; ArrayList currentFiletaxonomy = new ArrayList<>(); // ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; String msd = ""; List sentence = new ArrayList<>(); List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it Map> GOSCorpusHM = new ConcurrentHashMap<>(); String GOSCorpusHMKey = ""; String sentenceDelimiter = "seg"; int wordIndex = 0; 
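// GOS transcriptions come in two aligned <div>s: an "orth" (orthographic transcription) part and a
// "norm" (normalized) part. Sentences read from the orth part are cached in GOSCorpusHM under their
// <seg> id with a ".norm" suffix, so that the matching normalized segment can later add
// lemma/msd/normalized-word information to the already created words; wordIndex tracks the current
// position inside the cached sentence during that second pass.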
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm int numLines = 0; int lineNum = 0; progress.set(0.0); if(!isCollocability) { startTime = new Date(); } // get number of lines try { XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); while (eventReader.hasNext()) { eventReader.next(); numLines ++; // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file } } catch (IOException e) { e.printStackTrace(); } catch (XMLStreamException e) { e.printStackTrace(); } XMLEventReader eventReader = null; boolean includeFile = true; try { XMLInputFactory factory = XMLInputFactory.newInstance(); eventReader = factory.createXMLEventReader(new FileInputStream(path)); // created hashmap to combine words with normalized words while (eventReader.hasNext()) { int percentage = (int) (lineNum * 100.0 / numLines); if(progress.get() < percentage) { progress.set(percentage); } if(isCancelled) { return false; } lineNum ++; XMLEvent event = eventReader.nextEvent(); // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", ""))); switch (event.getEventType()) { case XMLStreamConstants.START_ELEMENT: StartElement startElement = event.asStartElement(); String qName = startElement.getName().getLocalPart(); if (qName.equals("div")) { HashMap atts = extractAttributes(startElement); if (atts.keySet().contains("type")) { inOrthDiv = atts.get("type").equals("orth"); } } // "word" node if (qName.equals("w")) { // check that it's not a type HashMap atts = extractAttributes(startElement); if (!atts.containsKey("type")) { inWord = true; if (atts.containsKey("msd")) { msd = atts.get("msd"); } if (atts.containsKey("lemma")) { lemma = atts.get("lemma"); } // // if (!inOrthDiv) { // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); // } } else if (atts.containsKey("type") && atts.get("type").equals("separated")) { inSeparatedWord = true; } // } } // taxonomy node else if (qName.equalsIgnoreCase("catRef")) { // there are some term nodes at the beginning that are of no interest to us // they differ by not having the attribute "ref", so test will equal null Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); if (tax != null) { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()), stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); // Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } } else if (qName.equalsIgnoreCase("div")) { gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); } else if (qName.equalsIgnoreCase("seg")) { HashMap atts = extractAttributes(startElement); if (atts.keySet().contains("id")) { if (inOrthDiv) { GOSCorpusHMKey = atts.get("id") + ".norm"; } else { GOSCorpusHMKey = atts.get("id"); } } else { System.out.println("No attribute \"id\""); } } break; case XMLStreamConstants.CHARACTERS: // "word" node value if (inWord) { // if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){ // System.out.println(wordIndex); // } // if algorithm is in orthodox part add new word to sentence if (inOrthDiv){ // GOSCorpusHM.put(GOSCorpusHMKey, sentence); String word = ""; 
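// Orth pass: only the surface form is known at this point; lemma, msd and the normalized form
// are filled in later, when the matching "norm" segment is read (see the else branch below).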
Characters characters = event.asCharacters(); sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter())); // if algorithm is in normalized part find orthodox word and add other info to it } else { Characters characters = event.asCharacters(); // System.out.println(wordIndex); // System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex); if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) { Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex); currentWord.setLemma(lemma, stats.getFilter().getWordParts()); currentWord.setMsd(msd, stats.getFilter().getWordParts()); currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts()); wordIndex += 1; // when a word is separated from one to many we have to create these duplicates if (inSeparatedWord){ GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()), "", "", "", stats.getFilter())); } } //else { // System.out.println("Error"); // } } } break; case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); if (endElement.getName().getLocalPart().equals("w")) { if (inWord){ inWord = false; } else if(inSeparatedWord) { // when there are no separated words left we have to delete last aditional duplicate GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex); inSeparatedWord = false; } } // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { if (inOrthDiv){ // add sentence to corpus GOSCorpusHM.put(GOSCorpusHMKey, sentence); } else { sentence = GOSCorpusHM.remove(GOSCorpusHMKey); if (stats.getFilter().getNgramValue() == 0){ int numSentenceParts = 0; for(Word w : sentence){ int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; } stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); } else if(stats.getFilter().getNgramValue() >= 1) { stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); } // add sentence to corpus if it passes filters if (includeFile && !ValidationUtil.isEmpty(sentence)) { // for(Word w : sentence) { // if (w.getW1().equals("")) { // System.out.println("HERE!!!"); // } // } sentence = runFilters(sentence, stats.getFilter()); // for(Word w : sentence) { // if (w.getW1().equals("")) { // System.out.println("HERE!!!"); // } // } corpus.add(new Sentence(sentence, currentFiletaxonomy)); } wordIndex = 0; /* Invoke Fork-Join when we reach maximum limit of * sentences (because we can't read everything to * memory) or we reach the end of the file. 
*/ if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { fj(corpus, stats); // empty the current corpus, since we don't need // the data anymore corpus.clear(); } } // start a new sentence sentence = new ArrayList<>(); } else if (endElement.getName().getLocalPart().equals("teiHeader")) { // before proceeding to read this file, make sure that taxonomy filters are a match // if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { // currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection // // // disregard this entry if taxonomies don't match // includeFile = !currentFiletaxonomy.isEmpty(); // //// currentFiletaxonomy = new ArrayList<>(); // } if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) { // taxonomies don't match so stop // union (select words that match any of selected taxonomy // return false; includeFile = false; // } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){ // intersection (select only words that precisely match selected taxonomy includeFile = false; } else { includeFile = true; } } } // backup else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { fj(corpus, stats); corpus.clear(); currentFiletaxonomy = new ArrayList<>(); // currentFiletaxonomyLong = new ArrayList<>(); } break; } } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } finally { if (eventReader != null) { try { eventReader.close(); } catch (XMLStreamException e) { logger.error("closing stream", e); } catch (Exception e) { logger.error("general error", e); } } } return true; } @SuppressWarnings("Duplicates") public static boolean readVERT(String path, StatisticsNew stats) { // taxonomy corpora // HashSet resultTaxonomy = new HashSet<>(); // regi path String regiPath = path.substring(0, path.length()-4) + "regi"; LineIterator regiIt; int wordIndex = -1; int lemmaIndex = -1; int msdIndex = -1; boolean slovene = false; try { // read regi file regiIt = FileUtils.lineIterator(new File(regiPath), "UTF-8"); try { boolean insideHeader = false; int attributeIndex = 0; while (regiIt.hasNext()) { String line = regiIt.nextLine(); if (line.length() >= 9 && line.substring(0, 9).equals("ATTRIBUTE")) { // split over "\" " String[] split = line.split(" "); if (split[1].equals("word") && wordIndex == -1){ wordIndex = attributeIndex; } else if (split[1].equals("lempos") && lemmaIndex == -1){ lemmaIndex = attributeIndex; } else if (split[1].equals("tag") && msdIndex == -1){ msdIndex = attributeIndex; } attributeIndex ++; if (wordIndex >= 0 && lemmaIndex >= 0 && msdIndex >= 0){ break; } } else if (line.length() >= 8 && line.substring(0, 8).equals("LANGUAGE")) { String[] split = line.split(" "); if (split[1].equals("\"Slovenian\"")){ slovene = true; } } } } finally { LineIterator.closeQuietly(regiIt); } } catch (IOException e) { throw new java.lang.RuntimeException("IOException"); // e.printStackTrace(); } int numLines = 0; // get number of lines try (FileReader input = new FileReader(path); LineNumberReader count = new LineNumberReader(input) ) { while (count.skip(Long.MAX_VALUE) > 0) { // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file 
} numLines = count.getLineNumber() + 1; // +1 because line index starts at 0 } catch (IOException e) { e.printStackTrace(); } LineIterator it; ArrayList currentFiletaxonomy = new ArrayList<>(); boolean inParagraph = false; boolean inSentence = false; boolean taxonomyMatch = true; int lineNum = 0; int numSentences = 0; int numSentencesLimit = 1000; List sentence = new ArrayList<>(); List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); progress.set(0.0); if(!isCollocability) { startTime = new Date(); } try { it = FileUtils.lineIterator(new File(path), "UTF-8"); try { boolean insideHeader = false; while (it.hasNext()) { int percentage = (int) (lineNum * 100.0 / numLines); if(progress.get() < percentage) { progress.set(percentage); } if(isCancelled) { return false; } lineNum ++; String line = it.nextLine(); // beginning tags // taxonomy if (stats.getCorpus().getTaxonomy().size() > 0 && line.length() > 4 && line.substring(1, 5).equals("text")) { String[] split = line.split("\" "); currentFiletaxonomy = new ArrayList<>(); boolean medium = false; boolean type = false; boolean proofread = false; for (String el : split) { String[] attribute = el.split("=\""); boolean idsPresent = false; if (attribute[0].equals("medium_id") && !attribute[1].equals("-")) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); medium = true; } else if (attribute[0].equals("type_id") && !attribute[1].equals("-")) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); type = true; } else if (attribute[0].equals("proofread_id") && !attribute[1].equals("-")) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); proofread = true; } if (attribute[0].equals("medium") && !attribute[1].equals("-") && !medium) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); } else if (attribute[0].equals("type") && !attribute[1].equals("-") && !type) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); } else if (attribute[0].equals("proofread") && !attribute[1].equals("-") && !attribute[1].equals("-\">") && !proofread) { Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); } } taxonomyMatch = true; if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) { // taxonomies don't match so stop // union (select words that match any of selected taxonomy // return false; taxonomyMatch = false; // } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){ // intersection (select only words that precisely match selected taxonomy taxonomyMatch = false; } } } // else if((line.length() >= 3 && line.substring(0, 2).equals("")) || // (line.length() >= 3 && line.substring(0, 3).equals(""))){ // inParagraph = true; // } else if((line.length() == 4 && line.equals("
</p>")) || (line.length() == 5 && line.equals("</p> "))){ // inParagraph = false; // } else if(line.length() >= 3 && line.substring(0, 2).equals("<s")){ inSentence = true; } else if(line.length() == 4 && line.equals("</s>")){ inSentence = false; if (stats.getFilter().getNgramValue() == 0){ int numSentenceParts = 0; for(Word w : sentence){ int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; } stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); } else if(stats.getFilter().getNgramValue() >= 1) { stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); } sentence = runFilters(sentence, stats.getFilter()); if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { corpus.add(new Sentence(sentence, currentFiletaxonomy)); } if (numSentences == numSentencesLimit) { fj(corpus, stats); corpus.clear(); numSentences = 0; } else { numSentences ++; } // and start a new one sentence = new ArrayList<>(); // corpus.add(new Sentence(sentence, currentFiletaxonomy)); } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence){ // } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence && inParagraph){ String[] split = line.split("\t"); if(slovene) { if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u")) { Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) { Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u") || stats.getFilter().getNotePunctuations()) { Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } } else { if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z")) { Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) { Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2,
split[lemmaIndex].length()).equals("-z") || stats.getFilter().getNotePunctuations()) { Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); sentence.add(word); } } } } if (corpus.size() > 0) { fj(corpus, stats); corpus.clear(); } } finally { LineIterator.closeQuietly(it); } } catch (IOException e) { e.printStackTrace(); } // resultTaxonomy.remove("-"); return true; } /** * Runs the sentence through some filters, so we don't do calculations when unnecessary. * Filters: *
 * <ol>
 *     <li>Ngrams: omit sentences that are shorter than the ngram value (e.g. a 3-gram on a single-word sentence)</li>
 *     <li>Letter ngrams: omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
 * </ol>
 *
 * @return Empty sentence (if it fails filter 1) or a sentence with some words removed (filter 2)
 */ private static List<Word> runFilters(List<Word> sentence, Filter filter) { if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { // ngram level: if not 0, must be less than or equal to the number of words in this sentence. if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) { return new ArrayList<>(); } // if we're calculating values for letters, omit words that are shorter than the string length if (filter.getNgramValue() == 0) { sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength()) || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength())); } } return sentence; } private static HashMap<String, String> extractAttributes(StartElement se) { Iterator attributesIt = se.getAttributes(); HashMap<String, String> atts = new HashMap<>(); while (attributesIt.hasNext()) { Attribute a = (Attribute) attributesIt.next(); atts.put(a.getName().getLocalPart(), a.getValue()); } return atts; } public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){ List<String> wString = new ArrayList<>(); if (f.getWordParts().contains(CalculateFor.WORD)) wString.add(word); if (f.getWordParts().contains(CalculateFor.LEMMA)) wString.add(lemma); if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) wString.add(msd); if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD)) wString.add(normalizedWord); // find the appropriate strings and put them in a Word of the matching arity Word w; switch (f.getWordParts().size()) { case 1: w = new Word1(wString.get(0)); break; case 2: w = new Word2(wString.get(0), wString.get(1)); break; case 3: w = new Word3(wString.get(0), wString.get(1), wString.get(2)); break; case 4: w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3)); break; default: w = null; } return w; } }
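// ---------------------------------------------------------------------------------------------
// Hypothetical usage sketch (not part of the original code): driving readXML from a JavaFX Task
// so a progress bar can be bound to progressProperty(). The surrounding variables (files, stats,
// progressBar) are assumed to be prepared by the caller.
//
//     Task<Void> task = new Task<Void>() {
//         @Override
//         protected Void call() {
//             for (File f : files) {                          // files: List<File> of corpus files
//                 if (!XML_processing.readXML(f.toString(), stats)) {
//                     break;                                  // readXML returns false when cancelled
//                 }
//             }
//             return null;
//         }
//     };
//     progressBar.progressProperty().bind(new XML_processing().progressProperty());
//     new Thread(task).start();
// ---------------------------------------------------------------------------------------------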