list/src/main/java/alg/XML_processing.java

package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
public static void readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
readXMLSSJ500K(path, stats);
}
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* E.g. the title tag, used to discern the corpus type.
* Note: returns only the value of the first occurrence of the given tag name.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
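/*
 * Usage sketch (illustrative only; the path and tag name are assumptions, not taken
 * from the rest of the project):
 *
 *   String title = XML_processing.readXMLHeaderTag("/path/to/corpus.xml", "title");
 *   if (title.isEmpty()) {
 *       // the tag was not found or the file could not be read
 *   }
 */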
/**
* Reads and returns the value of the given header attribute, or an empty string if it is not found.
* E.g. the body's base attribute, used to discern the corpus type of ssj500k.
* Note: returns only the value for the first occurrence of the given tag name; as currently
* written the lookup is specific to the "base" attribute, and for other tags the element's
* text content is returned instead.
*/
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);
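// For ssj500k the "base" attribute holds a file name; stripping the last 12 characters
// (presumably a fixed suffix such as "-sl.body.xml") leaves the corpus name.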
if (att.containsKey("base")) {
return att.get("base").substring(0, att.get("base").length() - 12);
}
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
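/*
 * Usage sketch (the file name is an assumption): reading the body's "base" attribute
 * to recognise an ssj500k corpus:
 *
 *   String corpusName = XML_processing.readXMLHeaderAttribute(
 *           "/path/to/ssj500k-sl.body.xml", "body", "base");
 */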
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
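/*
 * Note on batching: the reader methods below accumulate sentences into a list and hand
 * them to fj() whenever the list reaches Settings.CORPUS_SENTENCE_LIMIT or the end of
 * the file is reached, so the whole corpus never has to be kept in memory at once.
 */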
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
boolean inPunctuation = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
// System.out.println(String.format("%s", startElement.toString()));
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
if (c3Content.equals(".") && includeThisBlock) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
stavek = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the head block passes every user-set filter (or if no filter is set), false otherwise
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> valueObject = filterEntry.getValue();
// if (valueObject instanceof String) {
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
// } else
if (valueObject != null && !valueObject.isEmpty()) {
// the block passes this filter if its value matches any of the user-selected values
boolean anyMatch = false;
for (String value : valueObject) {
if (validateHeadBlockEntry(readHeadBlock, key, value)) {
anyMatch = true;
break;
}
}
pass = anyMatch;
}
if (!pass) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
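/*
 * Example (values invented for illustration): with a user filter of
 *   { "razred" -> {"1", "2"}, "regija" -> {"osrednjeslovenska"} }
 * a head block with razred="1" and regija="osrednjeslovenska" passes, while a block
 * with razred="3" (or one missing either tag) does not.
 */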
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.keySet().contains(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses the header lines of a vertical (vert) file for taxonomy information,
* collecting the medium_id, type_id and proofread_id attributes of <text> elements.
*
* @param filepath path to the vert file
* @param corpusIsSplit is corpus split into multiple files, or are all entries grouped into one large file
* @param corpusType type of the corpus being read
* @return the set of taxonomy ids found in the header lines
*/
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
// boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
LineIterator it = null;
try {
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
try {
while (it.hasNext()) {
String line = it.nextLine();
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
// split over "\" "
String[] split = line.split("\" ");
// String mediumId = "";
// String typeId = "";
// String proofreadId = "";
for (String el : split) {
String[] attribute = el.split("=\"");
if (attribute[0].equals("medium_id")) {
// mediumId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("type_id")) {
// typeId = attribute[1];
resultTaxonomy.add(attribute[1]);
} else if (attribute[0].equals("proofread_id")) {
// proofreadId = attribute[1];
resultTaxonomy.add(attribute[1]);
}
}
}
}
} finally {
LineIterator.closeQuietly(it);
}
} catch (IOException e) {
e.printStackTrace();
}
resultTaxonomy.remove("-");
return resultTaxonomy;
}
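/*
 * Sketch of the expected input (attribute names as read above, values invented for
 * illustration): a vert file whose header lines look like
 *
 *   <text id="..." medium_id="tisk" type_id="casopis" proofread_id="da" ...>
 *
 * yields the taxonomy set {"tisk", "casopis", "da"}; "-" entries are removed at the end.
 */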
/**
* Parses XML headers for information about their taxonomy (if supported) or filters (solar).
*
* @param filepath path to the XML file
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
* @param corpusType type of the corpus being read
* @return a HashSet of taxonomy ids for taxonomy corpora, or a HashMap of filter values for solar
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
HashMap<String, String> atts = extractAttributes(startElement);
String debug = "";
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
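/*
 * Usage sketch (the file name is an assumption): for taxonomy corpora the method returns
 * a HashSet<String> of taxonomy ids, for SOLAR a HashMap<String, HashSet<String>> of
 * filter values, so the caller casts according to the corpus type:
 *
 *   Object result = readXmlHeaderTaxonomyAndFilters("/path/to/gigafida.xml", false, CorpusType.GIGAFIDA);
 *   @SuppressWarnings("unchecked")
 *   HashSet<String> taxonomy = (HashSet<String>) result;
 */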
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<Taxonomy> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
if (qName.equals("c")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// some nodes at the beginning are of no interest to us
// they differ by not having the attribute "target", so the check below will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
// String punctuation = ",";
//
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
// inPunctuation = false;
}
break;
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
// String actualPunctuation = characters.getData();
// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("..."))
// break;
// String punctuation = ",";
// int skip_number = 0;
// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){
// skip_number = stats.getFilter().getSkipValue();
// }
// for(int i = 1; i < skip_number + 2; i ++){
// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) {
// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation);
// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation);
// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation);
// }
// }
// inPunctuation = false;
// }
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String var = endElement.getName().getLocalPart();
String debug = "";
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// taxonomyMatch = true;
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
else if (qName.equals("pc")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("term")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equals("bibl")) {
// a new bibl block starts: reset the taxonomy match flag
taxonomyMatch = true;
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String var = endElement.getName().getLocalPart();
String debug = "";
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
String GOSCorpusHMKey = "";
String sentenceDelimiter = "seg";
int wordIndex = 0;
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
// GOSCorpusHM (created above) pairs each orthographic segment with its normalized counterpart
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// check that it's not a type
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
//
// if (!inOrthDiv) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
inSeparatedWord = true;
}
// }
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// some nodes at the beginning are of no interest to us
// they differ by not having the attribute "target", so the check below will equal null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
currentFiletaxonomy.add(currentFiletaxonomyElement);
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
} else if (qName.equalsIgnoreCase("seg")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("id")) {
if (inOrthDiv) {
GOSCorpusHMKey = atts.get("id") + ".norm";
} else {
GOSCorpusHMKey = atts.get("id");
}
} else {
System.out.println("No attribute \"id\"");
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
// System.out.println(wordIndex);
// }
// if the parser is in the orthographic (orth) division, add a new word to the sentence
if (inOrthDiv){
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
String word = "";
Characters characters = event.asCharacters();
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
// if the parser is in the normalized (norm) division, find the matching orthographic word and add the remaining info to it
} else {
Characters characters = event.asCharacters();
// System.out.println(wordIndex);
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
currentWord.setMsd(msd, stats.getFilter().getWordParts());
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
wordIndex += 1;
// when one orthographic word is split into several normalized words, insert a duplicate entry for the extra part
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
"", "", "", stats.getFilter()));
}
} //else {
// System.out.println("Error");
// }
}
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
if (endElement.getName().getLocalPart().equals("w")) {
if (inWord){
inWord = false;
} else if(inSeparatedWord) {
// when no separated words are left, delete the last additional duplicate
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
inSeparatedWord = false;
}
}
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (inOrthDiv){
// add sentence to corpus
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
} else {
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
sentence = runFilters(sentence, stats.getFilter());
// for(Word w : sentence) {
// if (w.getW1().equals("")) {
// System.out.println("HERE!!!");
// }
// }
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
wordIndex = 0;
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
// start a new sentence
sentence = new ArrayList<>();
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
// currentFiletaxonomy = new ArrayList<>();
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
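/*
 * Design note on the GOS reader above: the orthographic (orth) division is read first and
 * each segment's words are stored in GOSCorpusHM under the segment id with a ".norm"
 * suffix; while reading the normalized (norm) division the matching entry is looked up by
 * segment id, lemma/msd/normalized forms are attached, and the completed sentence is moved
 * into the corpus batch.
 */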
/**
* Runs the sentence through some filters, so we don't do unnecessary calculations.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. a 3-gram on a single-word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return an empty sentence (if it fails 1.) or the sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
return new ArrayList<>();
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
}
}
return sentence;
}
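/*
 * Example (numbers are illustrative): with ngramValue = 3 a two-word sentence comes back
 * as an empty list; with ngramValue = 0 and stringLength = 4, words (or lemmas, depending
 * on calculateFor) shorter than four characters are removed before counting.
 */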
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);
if (f.getWordParts().contains(CalculateFor.LEMMA))
wString.add(lemma);
if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
wString.add(msd);
if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
wString.add(normalizedWord);
// find appropriate strings and put them in word
Word w;
switch (f.getWordParts().size()) {
case 1:
w = new Word1(wString.get(0));
break;
case 2:
w = new Word2(wString.get(0), wString.get(1));
break;
case 3:
w = new Word3(wString.get(0), wString.get(1), wString.get(2));
break;
case 4:
w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
break;
default:
w = null;
}
return w;
}
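/*
 * Example (the filter contents are assumed): if the filter's word parts are [WORD, LEMMA],
 * createWord("hiše", "hiša", "Sozem", "hiše", filter) builds a Word2 that keeps only the
 * surface form and the lemma; the msd and the normalized word are simply not stored.
 * With all four parts selected a Word4 is built instead.
 */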
}