// list/src/main/java/alg/XML_processing.java
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.property.ReadOnlyDoubleProperty;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import javafx.concurrent.Task;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
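/**
* Stream-based (StAX) readers for the supported corpus formats (Gigafida, ccKres, GOS, SOLAR,
* ssj500k, Gigafida 2.0 and VERT), plus helpers for reading header taxonomies and filters.
* Sentences are buffered and periodically handed off to the Fork/Join calculators, so the whole
* corpus never has to be held in memory.
*/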
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// progress tracking functionality
private static final ReadOnlyDoubleWrapper progress = new ReadOnlyDoubleWrapper();
public static boolean isCancelled = false;
public static Date startTime = new Date();
public static boolean isCollocability = false;
public static InvalidationListener progressBarListener;
public double getProgress() {
return progressProperty().get();
}
public ReadOnlyDoubleProperty progressProperty() {
return progress;
}
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
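/**
* Dispatches parsing to the reader matching the corpus type configured in {@code stats}.
* A minimal usage sketch (the path below is illustrative):
* <pre>{@code
* // stats must be configured elsewhere with corpus type, filter and taxonomy
* boolean ok = XML_processing.readXML("/data/corpus/file01.xml", stats);
* }</pre>
*
* @return true when the file was processed, false for an unsupported corpus type or a cancelled run
*/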
public static boolean readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
return readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
return readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
return readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
return readXMLSSJ500K(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
return readVERT(path, stats);
}
// task.updateProgress(fileNum, size);
return false;
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* Used e.g. on the title tag to discern the corpus type.
* Note: only the value of the first occurrence of the tag is returned.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
/**
* Reads and returns the value of the given header attribute, or an empty string if it is not found.
* Used e.g. on the body "base" attribute to discern the corpus type of ssj500k.
* Note: only the value of the first occurrence of the tag is returned.
*/
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);
if (att.containsKey("base")) {
return att.get("base").substring(0, att.get("base").length() - 12);
}
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
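/**
* Hands the buffered sentences off to a Fork/Join computation, chosen by the filter's
* analysis level (n-gram counting at string level, word-level statistics otherwise).
*/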
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
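/**
* Reads a SOLAR corpus file: words are collected into sentences, head blocks are matched
* against the user-set solar filters, and buffered sentences are flushed to the Fork/Join
* calculators whenever {@code Settings.CORPUS_SENTENCE_LIMIT} is reached.
*
* @return false if the run was cancelled, true otherwise
*/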
@SuppressWarnings("unused")
public static boolean readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
boolean inPunctuation = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
// Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Set<String> headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
// System.out.println(String.format("%s", startElement.toString()));
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3") || qName.equals("w1") || qName.equals("w")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3") || qName.equals("c1") || qName.equals("c")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if (stats.getFilter().getNotePunctuations() &&
stavek.size() > 0) {
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
} else if ((qName.equals("st1") && startElement.getAttributeByName(QName.valueOf("tip")).getValue().equals("0")) || qName.equals("s")) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
if(includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
stavek = new ArrayList<>();
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
} else { // possibly one of the localized head filter tags
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(qName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(headTag, tagContent);
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
// resultFilters.get(headTag).add(tagContent);
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
stavek = new ArrayList<>();
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
return true;
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the read head block satisfies all user-set filters, false otherwise
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> valueObject = filterEntry.getValue();
// if (valueObject instanceof String) {
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
// } else
if (valueObject != null) {
//noinspection unchecked
for (String value : valueObject) {
pass = validateHeadBlockEntry(readHeadBlock, key, value);
if (pass){
break;
}
}
}
if (!pass) {
// the current head block failed one of the set filters
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
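/**
* @return true if the read head block contains the user-set key with exactly the user-set value
*/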
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.containsKey(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses VERT file headers for taxonomy information (medium, type and proofread attributes).
*
* @param filepath path to the .vert file
* @param corpusIsSplit is corpus split into multiple files, or are all entries grouped into one large file (currently unused)
* @param corpusType (currently unused)
*/
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
LineIterator it = null;
try {
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
try {
boolean insideHeader = false;
while (it.hasNext()) {
String line = it.nextLine();
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
// split over "\" "
String[] split = line.split("\" ");
// String mediumId = "";
// String typeId = "";
// String proofreadId = "";
boolean idsPresent = false;
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium_id")) {
// mediumId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("type_id")) {
// typeId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("proofread_id")) {
// proofreadId = attribute[1];
idsPresent = true;
resultTaxonomy.add(attribute[1]);
}
}
if (!idsPresent){
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium")) {
// mediumId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("type")) {
// typeId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("proofread")) {
// proofreadId = attribute[1];
resultTaxonomy.add(attribute[1]);
}
}
}
}
}
} finally {
LineIterator.closeQuietly(it);
}
} catch (IOException e) {
e.printStackTrace();
}
resultTaxonomy.remove("-");
return resultTaxonomy;
}
/**
* Parses XML headers for information about the corpus taxonomy (if supported) or filters (solar).
*
* @param filepath path to the corpus file
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
* @param corpusType type of the corpus, which determines which header tag marks a header block
*
* @return a set of taxonomy ids for taxonomy corpora, or a map of solar filter values otherwise
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
if (tax.indexOf(':') >= 0) {
tax = tax.split(":")[1];
}
resultTaxonomy.add(tax);
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
// solar
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
} else if (!parseTaxonomy) {
boolean inHeadTags = false;
String headTag = "";
for (String tag : headTags){
if(I18N.getDefaultLocaleItem(tag).equals(elementName)){
inHeadTags = true;
headTag = tag;
break;
}
}
if(inHeadTags) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(headTag).add(tagContent);
}
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
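/** @return true if the given end element closes the header block named by {@code headerTag} */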
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
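/**
* Reads a Gigafida or ccKres corpus file. Taxonomy information from the TEI header decides
* whether the file's sentences are included, according to the selected set operation
* (union or intersection of taxonomies).
*
* @return true on success (a streaming error is rethrown as a RuntimeException)
*/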
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<Taxonomy> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
if (qName.equals("c")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// some term nodes at the beginning are of no interest to us
// they differ by not having the "target" attribute, so tax will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
if (stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
// String punctuation = ",";
//
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
// inPunctuation = false;
}
break;
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
// String actualPunctuation = characters.getData();
// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("..."))
// break;
// String punctuation = ",";
// int skip_number = 0;
// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){
// skip_number = stats.getFilter().getSkipValue();
// }
// for(int i = 1; i < skip_number + 2; i ++){
// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) {
// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation);
// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation);
// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation);
// }
// }
// inPunctuation = false;
// }
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// taxonomyMatch = true;
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
throw new RuntimeException("XMLStreamException | FileNotFoundException", e);
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
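/**
* Reads an ssj500k or Gigafida 2.0 corpus file. A first pass counts XML events so that
* progress can be reported; the second pass collects sentences, taking MSD values from the
* "ana" attribute (expected to carry an "msd:" or "mte:" prefix).
*
* @return false if the run was cancelled, true otherwise
*/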
@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
if (!(String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:") ||
String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("mte:"))){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
else if (qName.equals("pc")){
inPunctuation = true;
}
// taxonomy node
else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("term")) {
// some term nodes at the beginning are of no interest to us
// they differ by not having the "ref" attribute, so tax will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
// get value from attribute target
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null && !tax.getValue().equals("dedup:nodup")) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).split(":")[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
// if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
// HashMap<String, String> atts = extractAttributes(startElement);
// String debug = "";
//
// String tax = startElement.getAttributeByName(QName.valueOf("target"))
// .getValue()
// .replace("#", "");
//
// if (tax.indexOf(':') >= 0) {
// tax = tax.split(":")[1];
// }
// resultTaxonomy.add(tax);
// } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
// String tax = startElement.getAttributeByName(QName.valueOf("ref"))
// .getValue()
// .replace("#", "");
//
// resultTaxonomy.add(tax);
// } else if (!parseTaxonomy && headTags.contains(elementName)) {
// String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
// resultFilters.get(elementName).add(tagContent);
// }
} else if (qName.equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
taxonomyMatch = true;
} else if (qName.equals("text")){
taxonomyMatch = true;
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
// if (word.equals("Banovec")){
// System.out.println("Test");
// }
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNotePunctuations() && inPunctuation) {
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
} else if (endElement.getName().getLocalPart().equals("text")){
taxonomyMatch = false;
}
break;
}
}
if (corpus.size() > 0) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
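/**
* Reads a GOS corpus file. The orthographic ("orth") division is read first and cached in
* {@code GOSCorpusHM}; the normalized ("norm") division is then merged into the cached words
* (lemma, MSD and normalized form) before sentences are handed to the calculators.
*
* @return false if the run was cancelled, true otherwise
*/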
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
String GOSCorpusHMKey = "";
String sentenceDelimiter = "seg";
int wordIndex = 0;
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
int numLines = 0;
int lineNum = 0;
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
// count XML events up front so that progress can be reported as a percentage
// (numLines is effectively an event count, not a physical line count)
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
eventReader.next();
numLines++;
}
} catch (IOException | XMLStreamException e) {
e.printStackTrace();
}
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
// created hashmap to combine words with normalized words
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// only <w> elements without a "type" attribute are regular words
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
//
// if (!inOrthDiv) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
inSeparatedWord = true;
}
// }
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
} else if (qName.equalsIgnoreCase("seg")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("id")) {
if (inOrthDiv) {
GOSCorpusHMKey = atts.get("id") + ".norm";
} else {
GOSCorpusHMKey = atts.get("id");
}
} else {
System.out.println("No attribute \"id\"");
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
// System.out.println(wordIndex);
// }
// if the algorithm is in the orthographic part, add a new word to the sentence
if (inOrthDiv){
Characters characters = event.asCharacters();
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
// if the algorithm is in the normalized part, find the orthographic word and attach the extra info to it
} else {
Characters characters = event.asCharacters();
// System.out.println(wordIndex);
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
currentWord.setMsd(msd, stats.getFilter().getWordParts());
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
wordIndex += 1;
// when one orthographic word is split into several normalized words, duplicate the entry
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
"", "", "", stats.getFilter()));
}
} //else {
// System.out.println("Error");
// }
}
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
if (endElement.getName().getLocalPart().equals("w")) {
if (inWord){
inWord = false;
} else if(inSeparatedWord) {
// when there are no separated words left, delete the last additional duplicate
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
inSeparatedWord = false;
}
}
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (inOrthDiv){
// add sentence to corpus
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
} else {
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
sentence = runFilters(sentence, stats.getFilter());
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
wordIndex = 0;
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
// start a new sentence
sentence = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
// if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
// currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
//
// // disregard this entry if taxonomies don't match
// includeFile = !currentFiletaxonomy.isEmpty();
//
//// currentFiletaxonomy = new ArrayList<>();
// }
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
includeFile = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
includeFile = false;
} else {
includeFile = true;
}
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
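/**
* Reads a VERT (vertical) corpus file. The accompanying "regi" registry file is consulted
* first to find the word, lempos and tag column indices and the corpus language; sentences
* are then read line by line between {@code <s>} and {@code </s>} markers.
*
* @return false if the run was cancelled, true otherwise
*/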
@SuppressWarnings("Duplicates")
public static boolean readVERT(String path, StatisticsNew stats) {
// taxonomy corpora
// HashSet<String> resultTaxonomy = new HashSet<>();
// derive the path of the accompanying "regi" registry file that describes the VERT columns
String regiPath = path.substring(0, path.length() - 4) + "regi";
LineIterator regiIt;
int wordIndex = -1;
int lemmaIndex = -1;
int msdIndex = -1;
boolean slovene = false;
try {
// read regi file
regiIt = FileUtils.lineIterator(new File(regiPath), "UTF-8");
try {
boolean insideHeader = false;
int attributeIndex = 0;
while (regiIt.hasNext()) {
String line = regiIt.nextLine();
if (line.length() >= 9 && line.substring(0, 9).equals("ATTRIBUTE")) {
// split over spaces
String[] split = line.split(" ");
if (split[1].equals("word") && wordIndex == -1){
wordIndex = attributeIndex;
} else if (split[1].equals("lempos") && lemmaIndex == -1){
lemmaIndex = attributeIndex;
} else if (split[1].equals("tag") && msdIndex == -1){
msdIndex = attributeIndex;
}
attributeIndex ++;
if (wordIndex >= 0 && lemmaIndex >= 0 && msdIndex >= 0){
break;
}
} else if (line.length() >= 8 && line.substring(0, 8).equals("LANGUAGE")) {
String[] split = line.split(" ");
if (split[1].equals("\"Slovenian\"")){
slovene = true;
}
}
}
} finally {
LineIterator.closeQuietly(regiIt);
}
} catch (IOException e) {
throw new RuntimeException("IOException", e);
}
int numLines = 0;
// get number of lines
try (FileReader input = new FileReader(path);
LineNumberReader count = new LineNumberReader(input)
)
{
while (count.skip(Long.MAX_VALUE) > 0)
{
// Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file
}
numLines = count.getLineNumber() + 1; // +1 because line index starts at 0
} catch (IOException e) {
e.printStackTrace();
}
LineIterator it;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
boolean inParagraph = false;
boolean inSentence = false;
boolean taxonomyMatch = true;
int lineNum = 0;
int numSentences = 0;
int numSentencesLimit = 1000;
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
progress.set(0.0);
if(!isCollocability) {
startTime = new Date();
}
try {
it = FileUtils.lineIterator(new File(path), "UTF-8");
try {
boolean insideHeader = false;
while (it.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
if(progress.get() < percentage) {
progress.set(percentage);
}
if(isCancelled) {
return false;
}
lineNum ++;
String line = it.nextLine();
// beginning tags
// taxonomy
if (stats.getCorpus().getTaxonomy().size() > 0 && line.length() > 4 && line.substring(1, 5).equals("text")) {
String[] split = line.split("\" ");
currentFiletaxonomy = new ArrayList<>();
boolean medium = false;
boolean type = false;
boolean proofread = false;
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
medium = true;
} else if (attribute[0].equals("type_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
type = true;
} else if (attribute[0].equals("proofread_id") && !attribute[1].equals("-")) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
proofread = true;
}
if (attribute[0].equals("medium") && !attribute[1].equals("-") && !medium) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
} else if (attribute[0].equals("type") && !attribute[1].equals("-") && !type) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
} else if (attribute[0].equals("proofread") && !attribute[1].equals("-") && !attribute[1].equals("-\">") && !proofread) {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
taxonomyMatch = true;
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.UNION")) && currentFiletaxonomy.isEmpty()) {
// union: the file must match at least one of the selected taxonomies
taxonomyMatch = false;
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection: the file must match every selected taxonomy
taxonomyMatch = false;
}
}
}
// else if((line.length() >= 3 && line.substring(0, 2).equals("<p") && line.substring(line.length() - 1, line.length()).equals(">")) ||
// (line.length() >= 3 && line.substring(0, 3).equals("<ab") && line.substring(line.length() - 1, line.length()).equals(">"))){
// inParagraph = true;
// } else if((line.length() == 4 && line.equals("</p>")) || (line.length() == 5 && line.equals("</ab>"))){
// inParagraph = false;
// }
else if(line.length() >= 3 && line.substring(0, 2).equals("<s") && line.substring(line.length() - 1, line.length()).equals(">")){
inSentence = true;
} else if(line.length() == 4 && line.equals("</s>")){
inSentence = false;
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
if (numSentences == numSentencesLimit) {
fj(corpus, stats);
corpus.clear();
numSentences = 0;
} else {
numSentences ++;
}
// and start a new one
sentence = new ArrayList<>();
// corpus.add(new Sentence(sentence, currentFiletaxonomy));
} else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence){
// } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence && inParagraph){
String[] split = line.split("\t");
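// lempos values carry a trailing "-<pos>" suffix that is stripped to obtain the bare lemma;
// "-u" (Slovene) and "-z" (non-Slovene) suffixes and non-alphabetic tokens are treated as
// punctuation-like and are only kept when punctuation counting is enabled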
if(slovene) {
if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) &&
!split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u")) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) {
Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u") ||
stats.getFilter().getNotePunctuations()) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
}
} else {
if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) &&
!split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z")) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) {
Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
} else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z") ||
stats.getFilter().getNotePunctuations()) {
Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
sentence.add(word);
}
}
}
}
if (corpus.size() > 0) {
fj(corpus, stats);
corpus.clear();
}
} finally {
LineIterator.closeQuietly(it);
}
} catch (IOException e) {
e.printStackTrace();
}
// resultTaxonomy.remove("-");
return true;
}
/**
* Runs the sentence through some filters, so that unnecessary calculations are avoided.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return Empty sentence (if fails 1.) or a sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
return new ArrayList<>();
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
}
}
return sentence;
}
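/** Collects all attributes of the given start element into a name -> value map. */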
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
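/**
* Builds a {@code Word} holding only the parts requested by the filter (word, lemma, MSD,
* normalized word), using the Word1..Word4 implementations to keep memory usage low.
* Returns null when the filter requests no word parts.
* An illustrative call (all argument values below are made up; the filter is assumed to come from elsewhere):
* <pre>{@code
* Word w = createWord("hiše", "hiša", "Sozer", "hiše", stats.getFilter());
* }</pre>
*/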
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);
if (f.getWordParts().contains(CalculateFor.LEMMA))
wString.add(lemma);
if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
wString.add(msd);
if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
wString.add(normalizedWord);
// find appropriate strings and put them in word
Word w;
switch (f.getWordParts().size()) {
case 1:
w = new Word1(wString.get(0));
break;
case 2:
w = new Word2(wString.get(0), wString.get(1));
break;
case 3:
w = new Word3(wString.get(0), wString.get(1), wString.get(2));
break;
case 4:
w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
break;
default:
w = null;
}
return w;
}
}