Project copied

2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
package alg;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
public class Common {
public static <K> void updateMap(Map<K, AtomicLong> map, K o) {
// insert a fresh counter if the key is not yet in the map
AtomicLong existing = map.putIfAbsent(o, new AtomicLong(1));
// otherwise increment the counter that is already there
if (existing != null)
existing.incrementAndGet();
}
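// Usage sketch (hypothetical call site; assumes a thread-safe map, since
// updateMap is invoked from parallel fork-join tasks):
//   Map<String, AtomicLong> counts = new ConcurrentHashMap<>();
//   Common.updateMap(counts, "beseda"); // counts.get("beseda").get() == 1
//   Common.updateMap(counts, "beseda"); // counts.get("beseda").get() == 2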
}

View File

@@ -0,0 +1,794 @@
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
public static void readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
}
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* E.g. the title tag, for discerning the corpus type.
* Note: only the value of the first occurrence of the given tag name is returned.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
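// Usage sketch (hypothetical path and tag; per the note above, only the
// first occurrence of the tag is read):
//   String title = readXMLHeaderTag("/corpora/gf.xml", "title");
//   boolean looksLikeGigafida = title.toLowerCase().contains("gigafida");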
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek));
// and start a new one
stavek = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
// guard against head tags that appear before the enclosing <head> element
if (headBlock != null) {
headBlock.put(qName, tagContent);
}
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the head block satisfies all user-set filters
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> allowedValues = filterEntry.getValue();
if (allowedValues != null && !allowedValues.isEmpty()) {
// the block's value must match at least one of the user-set values for this tag
pass = false;
for (String value : allowedValues) {
if (validateHeadBlockEntry(readHeadBlock, key, value)) {
pass = true;
break;
}
}
}
if (!pass) {
// current head block does not satisfy one of the set filters
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
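// Example (hypothetical tag values): a filter {predmet -> {slovenscina, matematika}}
// passes a head block containing predmet=matematika, while a head block that lacks
// the predmet tag, or carries a different value, is rejected.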
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.containsKey(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses XML headers for taxonomy information (if the corpus type supports it) or filters (Solar).
*
* @param filepath path to the XML file
* @param corpusIsSplit whether the corpus is split into multiple XML files, or all entries are grouped into one large XML file
* @param corpusType type of the corpus being read
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags != null && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
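// Usage sketch (hypothetical call site): the Object return value must be cast
// according to the corpus type that was passed in:
//   Object res = readXmlHeaderTaxonomyAndFilters(path, false, CorpusType.SOLAR);
//   @SuppressWarnings("unchecked")
//   HashMap<String, HashSet<String>> solarFilters = (HashMap<String, HashSet<String>>) res;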
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd));
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
return false;
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "seg";
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// check that it's not a type
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
}
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()));
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd));
} else {
sentence.add(new Word(characters.getData()));
}
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// run the sentence through the filters first; only add it if anything survives
boolean saveSentence = computeForOrth == inOrthDiv;
if (includeFile && saveSentence) {
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
}
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
currentFiletaxonomy = new ArrayList<>();
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
/**
* Runs the sentence through some filters, so we avoid unnecessary calculations.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. a 3-gram on a single-word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return an empty sentence (if it fails 1.) or a sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
// return an empty sentence rather than null, per the contract documented above
return new ArrayList<>();
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
}
}
return sentence;
}
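// Example: with ngramValue = 3, a 2-word sentence comes back empty (filter 1);
// with ngramValue = 0 and stringLength = 3, two-letter words are dropped from
// the sentence while the longer words are kept (filter 2).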
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator<?> attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
}

View File

@@ -0,0 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = -1260951004477299634L;
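// subproblems below this sentence count are computed directly instead of being split further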
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private Statistics stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, Statistics stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
if (stats.isTaxonomySet()) {
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
} else {
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
}
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
for (Sentence s : corpus) {
// disregard if wrong taxonomy
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
continue;
}
for (Word word : s.getWords()) {
// skip if the current word is not inflected (it has no msd)
if (word.getMsd() == null || word.getMsd().length() == 0) {
continue;
}
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
Common.updateMap(stats.result, entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
// skip words without an msd - charAt(0) below would fail on an empty string
// TODO: also check that the msd is of the correct type (create a set)
String msd = word.getMsd();
if (msd == null || msd.length() == 0) {
continue;
}
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
stats.updateResults(entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
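// Example of the masking loop above: for the msd "Ncmsn" the counted entries
// are "Nc---", "N-m--", "N--s-" and "N---n", i.e. one entry per inflectional attribute.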
}

View File

@@ -0,0 +1,131 @@
package alg.inflectedJOS;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import data.Enums.InflectedJosTypes;
import data.StatisticsNew;
import gui.ValidationUtil;
import util.Combinations;
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
public class WordFormation {
private static HashMap<String, Long> josTypeResult;
private static Object[][] tmpResults;
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
static {
indices = new HashMap<>();
for (int i = 4; i <= 8; i++) {
indices.put(i, Combinations.generateIndices(i));
}
}
public static void calculateStatistics(StatisticsNew stat) {
Map<String, AtomicLong> result = stat.getResult();
// 1. filter - keep only inflected types
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
// 2. for each inflected type get all possible subcombinations
for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
josTypeResult = new HashMap<>();
// filter out results for a single word type
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
.filter(x -> x.getKey().charAt(0) == josChar)
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
if (ValidationUtil.isEmpty(singleTypeResults)) {
continue;
}
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
// index combinations are precomputed only for msd lengths 4-8; skip anything else
HashSet<HashSet<Integer>> combos = indices.get(e.getKey().length());
if (combos == null) {
continue;
}
for (HashSet<Integer> indicesCombo : combos) {
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
}
}
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
}
stat.setResultCustom(tmpResults);
}
private static String mask(String word, HashSet<Integer> indicesCombo) {
StringBuilder sb = new StringBuilder();
sb.append(word.charAt(0));
for (int i = 1; i < word.length(); i++) {
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
}
return sb.toString();
}
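// Example: mask("Ncmsn", {2, 4}) keeps the leading category character plus
// indices 2 and 4, yielding "N.m.n".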
private static void updateResults(String s, Long nOfOccurrences) {
// if not in the map, add
Long r = josTypeResult.putIfAbsent(s, nOfOccurrences);
// else update
if (r != null) {
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurrences);
}
}
private static void resultsMapToArray(Long totalValue) {
double total = totalValue.doubleValue();
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
int i = 0;
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
josTypeResultArray[i][0] = e.getKey();
josTypeResultArray[i][1] = e.getValue();
josTypeResultArray[i][2] = e.getValue() / total;
i++;
}
if (tmpResults == null) {
tmpResults = josTypeResultArray;
} else {
int firstLength = tmpResults.length;
int secondLength = josTypeResultArray.length;
Object[][] tmp = new Object[firstLength + secondLength][3];
System.arraycopy(tmpResults, 0, tmp, 0, firstLength);
System.arraycopy(josTypeResultArray, 0, tmp, firstLength, secondLength);
tmpResults = tmp;
}
}
private static void printArray() {
for (int i = 0; i < tmpResults.length; i++) {
for (int j = 0; j < tmpResults[i].length; j++) {
System.out.print(tmpResults[i][j] + "\t");
}
System.out.println();
}
}
}

View File

@@ -0,0 +1,62 @@
package alg.ngram;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = 5074814035083362355L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private StatisticsNew stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
Ngrams.calculateForAll(subCorpus, stats);
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,204 @@
package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.CalculateFor;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
import gui.ValidationUtil;
public class Ngrams {
public final static Logger logger = LogManager.getLogger(Ngrams.class);
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
generateNgramLetterCandidates(corpus, stats);
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
generateSkipgramCandidates(corpus, stats);
} else {
generateNgramCandidates(corpus, stats);
}
}
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
continue;
}
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
// if msd regex is set and this candidate doesn't pass it, skip this iteration
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
continue;
}
stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
}
}
}
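// Example: for a sentence [w0 w1 w2] with ngramValue = 2, the sliding window
// above produces the candidates w0w1 and w1w2.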
/**
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
}
for (int i = 0; i < regex.size(); i++) {
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
return false;
}
}
return true;
}
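// Example (hypothetical patterns): for a 2-gram candidate, a regex list
// [N.*, V.*] requires the first word's msd to match N.* and the second's to match V.*.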
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
switch (calculateFor) {
case LEMMA:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
break;
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
break;
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.collect(Collectors.toList()));
break;
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.collect(Collectors.toList()));
break;
}
return StringUtils.join(candidate, " ");
}
/**
* Generates letter-ngram candidates and updates the results.
*
* @param corpus sentences to process
* @param stats statistics object that accumulates the results
*/
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
// skip this iteration if:
// - word doesn't contain a proper version (missing lemma for example)
// - msd regex is given but this word's msd doesn't match it, skip this iteration
// - given substring length is larger than the word length
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
// TODO: punctuation?
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
}
}
}
}
/**
* Extracts skipgram candidates and updates the results directly.
* For an ngram length of n and a skip value of k, candidates are built from
* words that are at most k positions apart at each step.
*/
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
ArrayList<Word> currentLoop;
int ngram = stats.getFilter().getNgramValue();
int skip = stats.getFilter().getSkipValue();
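// Worked example (assuming ngram = 2, skip = 1): for the sentence
// [w0 w1 w2 w3], the counted candidates are w0w1, w0w2, w1w2, w1w3 and w2w3.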
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
if (ngram == 2 && j < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
currentLoop.add(sentence.get(l));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
currentLoop.add(sentence.get(l));
currentLoop.add(sentence.get(m));
validateAndCountSkipgramCandidate(currentLoop, stats);
}
}
}
}
}
}
}
}
}
}
}
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
}
}
}

View File

@@ -0,0 +1,62 @@
package alg.word;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = 7711587510996456040L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private StatisticsNew stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
WordLevel.calculateForAll(subCorpus, stats);
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,167 @@
package alg.word;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import alg.Common;
import data.CalculateFor;
import data.Sentence;
import data.Statistics;
import data.Word;
class WordCount {
private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
// >= so that words exactly as long as the substring are counted as well
if (word.length() >= stats.getSubstringLength()) {
for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
String substring = word.substring(i, i + stats.getSubstringLength());
Common.updateMap(stats.result, substring);
}
}
}
}
}
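// Example: with a substring length of 3, a 6-character CVV form yields
// 4 substrings (indices 0-2, 1-3, 2-4, 3-5), each counted separately.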
private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
for (Word word : s.getWords()) {
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
for (Word word : s.getWords()) {
// null-check the msd, as in calculateForJosType above
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
boolean taxonomyIsSet = stats.isTaxonomySet();
boolean josTypeIsSet = stats.isJOSTypeSet();
// dispatch to a specialized method: each variant checks its conditions once
// instead of re-evaluating them for every word, which adds up on large corpora
if (taxonomyIsSet && josTypeIsSet) {
calculateForTaxonomyAndJosType(corpus, stats);
} else if (taxonomyIsSet) {
calculateForTaxonomy(corpus, stats);
} else if (josTypeIsSet) {
calculateForJosType(corpus, stats);
} else {
if (stats.isVcc()) {
calculateVCC(corpus, stats);
} else {
calculateNoFilter(corpus, stats);
}
}
}
}

View File

@@ -0,0 +1,112 @@
package alg.word;
import static data.Enums.WordLevelDefaultValues.*;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import data.Enums.WordLevelDefaultValues;
import data.Enums.WordLevelType;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
@SuppressWarnings("Duplicates")
public class WordLevel {
private static HashSet<String> suffixes;
private static int minSuffixLength;
private static int maxSuffixLength;
private static HashSet<String> prefixes;
private static int minPrefixLength;
private static int maxPrefixLength;
static {
suffixes = WordLevelDefaultValues.getSuffixes();
calculateSuffixesLengths();
prefixes = WordLevelDefaultValues.getPrefixes();
calculatePrefixesLengths();
}
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
calculateForSuffixes(word.getWord(), stats);
calculateForPrefixes(word.getWord(), stats);
}
}
}
private static void calculateForPrefixes(String word, StatisticsNew stats) {
// go from the longest prefix option to the shortest
for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength--) {
// skip lengths that would leave fewer characters than the preset minimum
if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
continue;
}
String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
if (prefixes.contains(extractedPrefix)) {
// save prefix and full word
stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
return;
}
}
}
public static void calculateForSuffixes(String word, StatisticsNew stats) {
// go from the longest suffix option to the shortest
for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength--) {
// skip lengths that would leave fewer characters than the preset minimum
if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
continue;
}
String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
if (suffixes.contains(extractedSuffix)) {
// save suffix and full word
stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
return;
}
}
}
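// Example (hypothetical affix sets): if suffixes contains both "ica" and "ca",
// the word "hisica" is recorded under "ica", since longer suffixes are tried first.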
// finds the shortest and longest suffix for quicker calculations
public static void calculateSuffixesLengths() {
minSuffixLength = -1;
maxSuffixLength = -1;
for (String suffix : suffixes) {
if (suffix.length() > maxSuffixLength) {
maxSuffixLength = suffix.length();
if (minSuffixLength < 0) {
minSuffixLength = maxSuffixLength;
}
} else if (suffix.length() < minSuffixLength) {
minSuffixLength = suffix.length();
}
}
}
// finds the shortest and longest prefix for quicker calculations
public static void calculatePrefixesLengths() {
minPrefixLength = -1;
maxPrefixLength = -1;
for (String prefix : prefixes) {
if (prefix.length() > maxPrefixLength) {
maxPrefixLength = prefix.length();
if (minPrefixLength < 0) {
minPrefixLength = maxPrefixLength;
}
} else if (prefix.length() < minPrefixLength) {
minPrefixLength = prefix.length();
}
}
}
}