package alg;

import static data.Enums.solar.SolarFilters.*;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;

import org.apache.logging.log4j.LogManager;

import data.*;
import gui.ValidationUtil;

public class XML_processing {
	public static final org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);

	// public static void processCorpus(Statistics stats) {
	// 	// we can preset the list's size, so there won't be a need to resize it
	// 	List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
	//
	// 	int i = 0;
	// 	for (File f : Settings.corpus) {
	// 		i++;
	// 		readXML(f.toString(), stats);
	// 	}
	// }

	// public static void readXML(String path, Statistics stats) {
	// 	if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
	// 		readXMLGigafida(path, stats);
	// 	} else if (stats.getCorpusType() == CorpusType.GOS) {
	// 		readXMLGos(path, stats);
	// 	} else if (stats.getCorpusType() == CorpusType.SOLAR) {
	// 		readXMLSolar(path, stats);
	// 	}
	// }

	public static void readXML(String path, StatisticsNew stats) {
		if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
				|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
			readXMLGigafida(path, stats);
		} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
			readXMLGos(path, stats);
		} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
			readXMLSolar(path, stats);
		}
	}

	/**
	 * Reads and returns the value of the given header tag, or an empty string if the tag isn't found.
	 * E.g. the title tag, for discerning the corpus type.
	 * Note: returns only the value of the first occurrence of the given tag name.
	 */
	public static String readXMLHeaderTag(String path, String tag) {
		XMLInputFactory factory = XMLInputFactory.newInstance();
		XMLEventReader eventReader = null;

		try {
			eventReader = factory.createXMLEventReader(new FileInputStream(path));
			while (eventReader.hasNext()) {
				XMLEvent xmlEvent = eventReader.nextEvent();
				if (xmlEvent.isStartElement()) {
					StartElement startElement = xmlEvent.asStartElement();
					String var = startElement.getName().getLocalPart();

					if (var.equalsIgnoreCase(tag)) {
						return eventReader.nextEvent().asCharacters().getData();
					}
				}
			}
		} catch (FileNotFoundException | XMLStreamException e) {
			e.printStackTrace();
		} finally {
			if (eventReader != null) {
				try {
					eventReader.close();
				} catch (XMLStreamException e) {
					logger.error("closing stream", e);
				}
			}
		}
		return "";
	}

	private static void fj(List<Sentence> corpus, StatisticsNew stats) {
		ForkJoinPool pool = new ForkJoinPool();

		if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
			alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
			pool.invoke(wc);
		} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
			alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
			pool.invoke(wc);
		} else {
			// TODO:
			// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
			// pool.invoke(wc);
		}
	}
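	/*
	 * Usage sketch for readXMLHeaderTag (illustrative only; the file path is a placeholder):
	 *
	 *   String title = XML_processing.readXMLHeaderTag("corpus/file.xml", "title");
	 *   if (!title.isEmpty()) {
	 *       // decide on the corpus type based on the title
	 *   }
	 */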
"orth" : "norm"; // orth & norm // // List stavek = new ArrayList<>(); // List corpus = new ArrayList<>(); // String sentenceDelimiter = "seg"; // String taxonomyPrefix = "gos."; // // try { // XMLInputFactory factory = XMLInputFactory.newInstance(); // XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); // // while (eventReader.hasNext()) { // XMLEvent event = eventReader.nextEvent(); // // switch (event.getEventType()) { // case XMLStreamConstants.START_ELEMENT: // // StartElement startElement = event.asStartElement(); // String qName = startElement.getName().getLocalPart(); // // // "word" node // if (qName.equals("w")) { // in_word = true; // // if (type.equals("norm")) { // // make sure we're looking at and not // Iterator var = startElement.getAttributes(); // ArrayList attributes = new ArrayList<>(); // while (var.hasNext()) { // attributes.add(var.next()); // } // // if (attributes.contains("msd")) { // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); // } else { // msd = null; // } // // if (attributes.contains("lemma")) { // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); // } // } // } // // taxonomy node // else if (qName.equalsIgnoreCase("catRef")) { // // there are some term nodes at the beginning that are of no interest to us // // they differ by not having the attribute "ref", so test will equal null // Attribute test = startElement.getAttributeByName(QName.valueOf("target")); // // if (test != null) { // // keep only taxonomy properties // taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, ""); // } // } else if (qName.equalsIgnoreCase("div")) { // type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); // // } // break; // // case XMLStreamConstants.CHARACTERS: // Characters characters = event.asCharacters(); // // // "word" node value // if (in_word) { // if (type.equals("norm") && msd != null) { // stavek.add(new Word(characters.getData(), lemma, msd)); // } else { // stavek.add(new Word(characters.getData())); // } // // in_word = false; // } // break; // // case XMLStreamConstants.END_ELEMENT: // EndElement endElement = event.asEndElement(); // // // parser reached end of the current sentence // if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { // // add sentence to corpus // corpus.add(new Sentence(stavek, taksonomija, type)); // // and start a new one // stavek = new ArrayList<>(); // // /* Invoke Fork-Join when we reach maximum limit of // * sentences (because we can't read everything to // * memory) or we reach the end of the file. 
	@SuppressWarnings("unused")
	public static void readXMLSolar(String path, StatisticsNew stats) {
		boolean in_word = false;
		String lemma = "";
		String msd = "";

		List<Word> stavek = new ArrayList<>();
		List<Sentence> corpus = new ArrayList<>();

		// used for the filter
		Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
		Map<String, String> headBlock = null;
		boolean includeThisBlock = false;

		try {
			XMLInputFactory factory = XMLInputFactory.newInstance();
			XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));

			while (eventReader.hasNext()) {
				XMLEvent event = eventReader.nextEvent();

				switch (event.getEventType()) {
					case XMLStreamConstants.START_ELEMENT:
						StartElement startElement = event.asStartElement();
						String qName = startElement.getName().getLocalPart();

						// "word" node
						if (qName.equals("w3")) {
							in_word = true;
							msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
							lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
						} else if (qName.equals("c3")) {
							String c3Content = eventReader.nextEvent().asCharacters().getData();

							if (c3Content.equals(".") && includeThisBlock) {
								// add the sentence to the corpus
								corpus.add(new Sentence(stavek));

								// and start a new one
								stavek = new ArrayList<>();

								/* Invoke Fork-Join when we reach the maximum limit of
								 * sentences (because we can't read everything into
								 * memory) or when we reach the end of the file.
								 */
								if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
									fj(corpus, stats);
									// empty the current corpus, since we don't need the data anymore
									corpus.clear();
								}
							}
						} else if (headTags.contains(qName)) {
							String tagContent = eventReader.nextEvent().asCharacters().getData();
							headBlock.put(qName, tagContent);
						} else if (qName.equals("head")) {
							// a new <head> block starts: reset the collected tags
							headBlock = new HashMap<>();
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						Characters characters = event.asCharacters();

						// "word" node value
						if (in_word) {
							stavek.add(new Word(characters.getData(), lemma, msd));
							in_word = false;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						EndElement endElement = event.asEndElement();
						String qNameEnd = endElement.getName().getLocalPart();

						if (qNameEnd.equals("head")) {
							// validate the collected head block and set the flag
							if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
								includeThisBlock = true;
							}
						} else if (qNameEnd.equals("body")) {
							// new block, reset the filter status
							includeThisBlock = false;
						}

						// backup: flush the remaining sentences at the end of the corpus
						if (qNameEnd.equalsIgnoreCase("korpus")) {
							fj(corpus, stats);
							corpus.clear();
						}
						break;
				}
			}
		} catch (FileNotFoundException | XMLStreamException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Checks whether a head block passes the user-set filters.
	 *
	 * @param readHeadBlock block of tags read from the corpus
	 * @param userSetFilter tags with values set by the user
	 *
	 * @return true if the block passes all set filters, false otherwise
	 */
	private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
		if (userSetFilter == null) {
			return true;
		}

		for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
			String key = filterEntry.getKey();
			HashSet<String> valueObject = filterEntry.getValue();

			if (valueObject != null) {
				// the block passes a filter if it matches any of the values set for that filter
				boolean pass = false;
				for (String value : valueObject) {
					if (validateHeadBlockEntry(readHeadBlock, key, value)) {
						pass = true;
						break;
					}
				}

				if (!pass) {
					// current head block does not match one of the set filters - not likely, but an edge case anyway
					return false;
				}
			}
		}

		// if it gets to this point, it passed all the filters
		return true;
	}

	private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
		if (!readHeadBlock.containsKey(userSetKey)) {
			// current head block does not include one of the set filters - not likely, but an edge case anyway
			return false;
		} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
			// different values -> doesn't pass the filter
			return false;
		}
		return true;
	}
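	/*
	 * Filter semantics, as a sketch (the filter values below are hypothetical):
	 * with userSetFilter = { "regija" -> {"celjska", "mariborska"}, "leto" -> {"2009"} },
	 * a head block passes only if its "regija" tag equals one of the two listed values
	 * AND its "leto" tag equals "2009".
	 */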
	/**
	 * Parses XML headers for information about the corpus taxonomy (if supported) or filters (Solar).
	 *
	 * @param filepath      path to the XML file
	 * @param corpusIsSplit true if the corpus is split into multiple XML files, false if all entries are grouped into one large XML file
	 * @param corpusType    type of the corpus being read
	 */
	public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
		boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
		// Solar
		Set<String> headTags = null;
		HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
		// taxonomy corpora
		HashSet<String> resultTaxonomy = new HashSet<>();

		String headTagName;

		if (corpusType == CorpusType.SOLAR) {
			headTagName = "head";
			// used for the filter
			headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));

			// init results now to avoid null pointers
			headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
		} else {
			headTagName = "teiHeader";
		}

		XMLInputFactory factory = XMLInputFactory.newInstance();
		XMLEventReader xmlEventReader = null;

		try {
			xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
			boolean insideHeader = false;

			while (xmlEventReader.hasNext()) {
				XMLEvent xmlEvent = xmlEventReader.nextEvent();

				if (xmlEvent.isStartElement()) {
					StartElement startElement = xmlEvent.asStartElement();
					String elementName = startElement.getName().getLocalPart();

					if (elementName.equalsIgnoreCase(headTagName)) {
						// if the corpus is split into files, we skip bodies;
						// this toggle is true while we're inside a header (the next block of code executes)
						// and false when we're not (skip reading unnecessary attributes)
						insideHeader = true;
					}

					if (insideHeader) {
						if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
							String tax = startElement.getAttributeByName(QName.valueOf("target"))
									.getValue()
									.replace("#", "");
							resultTaxonomy.add(tax);
						} else if (!parseTaxonomy && headTags != null && headTags.contains(elementName)) {
							String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
							resultFilters.get(elementName).add(tagContent);
						}
					}
				} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
					// if the corpus is split into multiple files, each file contains only one header block,
					// which means we can stop as soon as we reach the end of the header
					return parseTaxonomy ? resultTaxonomy : resultFilters;
				} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
					// whole corpus in one file, so we have to continue reading in order to find all header blocks
					insideHeader = false;
				}
			}
		} catch (XMLStreamException e) {
			logger.error("Streaming error", e);
			return parseTaxonomy ? resultTaxonomy : resultFilters;
		} catch (FileNotFoundException e) {
			logger.error("File not found", e);
			return parseTaxonomy ? resultTaxonomy : resultFilters;
			// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
		} finally {
			if (xmlEventReader != null) {
				try {
					xmlEventReader.close();
				} catch (XMLStreamException e) {
					logger.error("closing stream", e);
				}
			}
		}

		return parseTaxonomy ? resultTaxonomy : resultFilters;
	}
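	/*
	 * The return type depends on the corpus type, so callers have to cast the result.
	 * A usage sketch (illustrative; the file path is a placeholder):
	 *
	 *   Object result = readXmlHeaderTaxonomyAndFilters("corpus/file.xml", false, CorpusType.GIGAFIDA);
	 *   HashSet<String> taxonomy = (HashSet<String>) result;              // taxonomy corpora
	 *   // for CorpusType.SOLAR the result is a HashMap<String, HashSet<String>> of filters instead
	 */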
	private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
		return event.asEndElement()
				.getName()
				.getLocalPart()
				.equalsIgnoreCase(headerTag);
	}

	@SuppressWarnings("Duplicates")
	public static boolean readXMLGigafida(String path, StatisticsNew stats) {
		boolean inWord = false;
		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
		String lemma = "";
		String msd = "";

		List<Word> sentence = new ArrayList<>();
		// preset the list's size, so there won't be a need to resize it
		List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
		String sentenceDelimiter = "s";

		XMLEventReader eventReader = null;
		try {
			XMLInputFactory factory = XMLInputFactory.newInstance();
			eventReader = factory.createXMLEventReader(new FileInputStream(path));

			while (eventReader.hasNext()) {
				XMLEvent event = eventReader.nextEvent();

				switch (event.getEventType()) {
					case XMLStreamConstants.START_ELEMENT:
						StartElement startElement = event.asStartElement();
						String qName = startElement.getName().getLocalPart();

						// "word" node
						if (qName.equals("w")) {
							inWord = true;
							msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
							lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
						}
						// taxonomy node
						else if (qName.equalsIgnoreCase("catRef")) {
							// there are some term nodes at the beginning that are of no interest to us;
							// they differ by not having the attribute "target", so tax will equal null
							Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

							if (tax != null) {
								// keep only taxonomy properties
								String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
								currentFiletaxonomy.add(currentFiletaxonomyElement);
								Tax taxonomy = new Tax();
								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
							}
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						Characters characters = event.asCharacters();

						// "word" node value
						if (inWord) {
							String word = characters.getData();
							sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
							inWord = false;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						EndElement endElement = event.asEndElement();

						// parser reached the end of the current sentence
						if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
							// add the sentence to the corpus if it passes the filters
							sentence = runFilters(sentence, stats.getFilter());

							if (!ValidationUtil.isEmpty(sentence)) {
								corpus.add(new Sentence(sentence));
							}

							// and start a new one
							sentence = new ArrayList<>();

							/* Invoke Fork-Join when we reach the maximum limit of
							 * sentences (because we can't read everything into
							 * memory) or when we reach the end of the file.
							 */
							if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
								fj(corpus, stats);
								// empty the current corpus, since we don't need the data anymore
								corpus.clear();

								// TODO: if (stats.isUseDB()) {
								// 	stats.storeTmpResultsToDB();
								// }
							}
						} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
							// before proceeding to read this file, make sure that the taxonomy filters are a match
							if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
								currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

								if (currentFiletaxonomy.isEmpty()) {
									// taxonomies don't match, so stop
									return false;
								}
							}
						}
						// fallback: flush the remaining sentences at the end of the document
						else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
							// join corpus and stats
							fj(corpus, stats);
							corpus.clear();

							// TODO: if (stats.isUseDB()) {
							// 	stats.storeTmpResultsToDB();
							// }
						}
						break;
				}
			}
		} catch (FileNotFoundException | XMLStreamException e) {
			e.printStackTrace();
		} finally {
			if (eventReader != null) {
				try {
					eventReader.close();
				} catch (XMLStreamException e) {
					logger.error("closing stream", e);
				}
			}
		}

		return true;
	}
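	/*
	 * For reference, readXMLGigafida expects elements of roughly this shape
	 * (a sketch inferred from the attributes the parser reads, not a corpus excerpt):
	 *
	 *   <catRef target="#..."/>               <!-- taxonomy node, in the teiHeader -->
	 *   <s>
	 *     <w lemma="..." msd="...">word</w>   <!-- "s" is the sentence delimiter -->
	 *   </s>
	 */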
	@SuppressWarnings("Duplicates")
	public static boolean readXMLGos(String path, StatisticsNew stats) {
		boolean inWord = false;
		boolean inOrthDiv = false;
		boolean computeForOrth = stats.getCorpus().isGosOrthMode();
		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
		String lemma = "";
		String msd = "";

		List<Word> sentence = new ArrayList<>();
		// preset the list's size, so there won't be a need to resize it
		List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
		String sentenceDelimiter = "seg";
		String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm

		XMLEventReader eventReader = null;
		boolean includeFile = true;

		try {
			XMLInputFactory factory = XMLInputFactory.newInstance();
			eventReader = factory.createXMLEventReader(new FileInputStream(path));

			while (eventReader.hasNext()) {
				XMLEvent event = eventReader.nextEvent();

				switch (event.getEventType()) {
					case XMLStreamConstants.START_ELEMENT:
						StartElement startElement = event.asStartElement();
						String qName = startElement.getName().getLocalPart();

						if (qName.equals("div")) {
							HashMap<String, String> atts = extractAttributes(startElement);

							if (atts.containsKey("type")) {
								inOrthDiv = atts.get("type").equals("orth");
								gosType = atts.get("type");
							}
						}
						// "word" node
						else if (qName.equals("w")) {
							// make sure it's not a <w> node with a "type" attribute
							HashMap<String, String> atts = extractAttributes(startElement);

							if (!atts.containsKey("type")) {
								inWord = true;

								if (atts.containsKey("msd")) {
									msd = atts.get("msd");
								}
								if (atts.containsKey("lemma")) {
									lemma = atts.get("lemma");
								}
							}
						}
						// taxonomy node
						else if (qName.equalsIgnoreCase("catRef")) {
							// there are some term nodes at the beginning that are of no interest to us;
							// they differ by not having the attribute "target", so tax will equal null
							Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

							if (tax != null) {
								// keep only taxonomy properties
								currentFiletaxonomy.add(String.valueOf(tax.getValue()));
							}
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						// "word" node value
						if (inWord) {
							Characters characters = event.asCharacters();

							if (gosType.equals("norm") && msd != null) {
								sentence.add(new Word(characters.getData(), lemma, msd));
							} else {
								sentence.add(new Word(characters.getData()));
							}

							inWord = false;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						EndElement endElement = event.asEndElement();

						// parser reached the end of the current sentence
						if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
							// add the sentence to the corpus if it passes the filters
							boolean saveSentence = computeForOrth == inOrthDiv;

							if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
								sentence = runFilters(sentence, stats.getFilter());

								if (!ValidationUtil.isEmpty(sentence)) {
									corpus.add(new Sentence(sentence));
								}
							}

							// and start a new one
							sentence = new ArrayList<>();

							/* Invoke Fork-Join when we reach the maximum limit of
							 * sentences (because we can't read everything into
							 * memory) or when we reach the end of the file.
							 */
							if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
								fj(corpus, stats);
								// empty the current corpus, since we don't need the data anymore
								corpus.clear();
							}
						} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
							// before proceeding to read this file, make sure that the taxonomy filters are a match
							if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
								currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

								// disregard this file if the taxonomies don't match
								includeFile = !currentFiletaxonomy.isEmpty();

								currentFiletaxonomy = new ArrayList<>();
							}
						}
						// backup: flush the remaining sentences at the end of the document
						else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
							fj(corpus, stats);
							corpus.clear();
						}
						break;
				}
			}
		} catch (FileNotFoundException | XMLStreamException e) {
			e.printStackTrace();
		} finally {
			if (eventReader != null) {
				try {
					eventReader.close();
				} catch (XMLStreamException e) {
					logger.error("closing stream", e);
				} catch (Exception e) {
					logger.error("general error", e);
				}
			}
		}

		return true;
	}
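	/*
	 * GOS mode selection in a nutshell, as the parsing logic above suggests: segments
	 * appear both in an "orth" div (orthographic transcription) and in a "norm" div
	 * (normalized, with lemma/msd annotations). A sentence is kept only when the div
	 * it came from matches the requested mode, i.e. computeForOrth == inOrthDiv.
	 */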
	/**
	 * Runs the sentence through some filters, so we don't do calculations when they're unnecessary.
	 * Filters:
	 * <ol>
	 * <li>Ngrams: omit sentences that are shorter than the ngram value (e.g. a 3-gram on a single word sentence)</li>
	 * <li>Letter ngrams: omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
	 * </ol>
	 *
	 * @return null (if the sentence fails filter 1) or a sentence with some words removed (filter 2)
	 */
	private static List<Word> runFilters(List<Word> sentence, Filter filter) {
		if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
			// ngram level: if not 0, the ngram value must be less than or equal to the number of words in this sentence
			if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
				return null;
			}

			// if we're calculating values for letters, omit words that are shorter than the string length
			if (filter.getNgramValue() == 0) {
				sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
						|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
			}
		}

		return sentence;
	}

	private static HashMap<String, String> extractAttributes(StartElement se) {
		Iterator<?> attributesIt = se.getAttributes();
		HashMap<String, String> atts = new HashMap<>();

		while (attributesIt.hasNext()) {
			Attribute a = (Attribute) attributesIt.next();
			atts.put(a.getName().getLocalPart(), a.getValue());
		}

		return atts;
	}
}