Added some optimizations and new taxonomy names

2018-08-31 07:57:58 +02:00
parent 1c00f1a283
commit 426a9ccc46
21 changed files with 1345 additions and 1182 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -262,7 +262,7 @@ public class XML_processing {

                            if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
                                    stavek.size() > 0){
-                                stavek.add(new Word(c3Content, c3Content, "/"));
+                                stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));

                            }

@@ -297,7 +297,7 @@ public class XML_processing {

 						// "word" node value
 						if (in_word) {
-							stavek.add(new Word(characters.getData(), lemma, msd));
+							stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
 							in_word = false;
 						}
 						break;
@@ -537,12 +537,12 @@ public class XML_processing {
 						// "word" node value
 						if (inWord) {
 							String word = characters.getData();
-							sentence.add(new Word(word, lemma, msd));
+							sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
 							inWord = false;
 						}
 						if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
 						    String punctuation = characters.getData();
-							sentence.add(new Word(punctuation, punctuation, "/"));
+							sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
 							inPunctuation = false;

 //						    String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
 //								GOSCorpusHM.put(GOSCorpusHMKey, sentence);
 								String word = "";
 								Characters characters = event.asCharacters();
-								sentence.add(new Word(characters.getData(), "", ""));
+								sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
 							// if algorithm is in normalized part find orthodox word and add other info to it
 							} else {
 								Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
 //								System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
 								if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
 									Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
-									currentWord.setLemma(lemma);
-									currentWord.setMsd(msd);
-									currentWord.setNormalizedWord(characters.getData());
+									currentWord.setLemma(lemma, stats.getFilter().getWordParts());
+									currentWord.setMsd(msd, stats.getFilter().getWordParts());
+									currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());

 									wordIndex += 1;

                                    // when a word is separated from one to many we have to create these duplicates
                                    if (inSeparatedWord){
-                                        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
+                                        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
+                                                "", "", "", stats.getFilter()));
                                    }
 								} //else {
 //								    System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {

 			// if we're calculating values for letters, omit words that are shorter than string length
 			if (filter.getNgramValue() == 0) {
-				sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
-						|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
+				sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
+						|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
 			}
 		}

@@ -912,4 +913,38 @@ public class XML_processing {

 		return atts;
 	}
+
+	private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){ // builds a Word that stores only the parts listed in f.getWordParts()
+		List<String> wString = new ArrayList<>(); // selected parts, always collected in the order word, lemma, msd, normalizedWord
+		if (f.getWordParts().contains(CalculateFor.WORD))
+			wString.add(word);
+		if (f.getWordParts().contains(CalculateFor.LEMMA))
+			wString.add(lemma);
+		if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
+			wString.add(msd);
+		if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
+			wString.add(normalizedWord);
+
+		// wrap the collected strings in the Word1..Word4 variant whose arity matches the number of word parts
+		Word w;
+
+		switch (f.getWordParts().size()) { // NOTE(review): keyed on getWordParts().size(), not wString.size(); if word parts may hold values other than the four checked above, wString.get(i) below would be out of bounds - confirm
+			case 1:
+				w = new Word1(wString.get(0));
+				break;
+			case 2:
+				w = new Word2(wString.get(0), wString.get(1));
+				break;
+			case 3:
+				w = new Word3(wString.get(0), wString.get(1), wString.get(2));
+				break;
+			case 4:
+				w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
+				break;
+			default:
+				w = null; // NOTE(review): 0 or more than 4 word parts yields null; the callers in this class add the result to their sentence lists without a null check
+
+		}
+		return w;
+	}
 }
--- a/src/main/java/alg/inflectedJOS/ForkJoin.java
+++ b/src/main/java/alg/inflectedJOS/ForkJoin.java
@@ -1,67 +1,67 @@
-package alg.inflectedJOS;
-
-import java.util.List;
-import java.util.concurrent.RecursiveAction;
-
-import data.Sentence;
-import data.Statistics;
-
-public class ForkJoin extends RecursiveAction {
-	private static final long serialVersionUID = -1260951004477299634L;
-
-	private static final int ACCEPTABLE_SIZE = 1000;
-	private List<Sentence> corpus;
-	private Statistics stats;
-	private int start;
-	private int end;
-
-
-	/**
-	 * Constructor for subproblems.
-	 */
-	private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
-		this.corpus = corpus;
-		this.start = start;
-		this.end = end;
-		this.stats = stats;
-	}
-
-	/**
-	 * Default constructor for the initial problem
-	 */
-	public ForkJoin(List<Sentence> corpus, Statistics stats) {
-		this.corpus = corpus;
-		this.start = 0;
-		this.end = corpus.size();
-		this.stats = stats;
-	}
-
-	private void computeDirectly() {
-		List<Sentence> subCorpus = corpus.subList(start, end);
-
-		if (stats.isTaxonomySet()) {
-			InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
-		} else {
-			InflectedJOSCount.calculateForAll(subCorpus, stats, null);
-		}
-	}
-
-	@Override
-	protected void compute() {
-		int subCorpusSize = end - start;
-
-		if (subCorpusSize < ACCEPTABLE_SIZE) {
-			computeDirectly();
-		} else {
-			int mid = start + subCorpusSize / 2;
-			ForkJoin left = new ForkJoin(corpus, start, mid, stats);
-			ForkJoin right = new ForkJoin(corpus, mid, end, stats);
-
-			// fork (push to queue)-> compute -> join
-			left.fork();
-			right.fork();
-			left.join();
-			right.join();
-		}
-	}
-}
+//package alg.inflectedJOS;
+//
+//import java.util.List;
+//import java.util.concurrent.RecursiveAction;
+//
+//import data.Sentence;
+//import data.Statistics;
+//
+//public class ForkJoin extends RecursiveAction {
+//	private static final long serialVersionUID = -1260951004477299634L;
+//
+//	private static final int ACCEPTABLE_SIZE = 1000;
+//	private List<Sentence> corpus;
+//	private Statistics stats;
+//	private int start;
+//	private int end;
+//
+//
+//	/**
+//	 * Constructor for subproblems.
+//	 */
+//	private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
+//		this.corpus = corpus;
+//		this.start = start;
+//		this.end = end;
+//		this.stats = stats;
+//	}
+//
+//	/**
+//	 * Default constructor for the initial problem
+//	 */
+//	public ForkJoin(List<Sentence> corpus, Statistics stats) {
+//		this.corpus = corpus;
+//		this.start = 0;
+//		this.end = corpus.size();
+//		this.stats = stats;
+//	}
+//
+//	private void computeDirectly() {
+//		List<Sentence> subCorpus = corpus.subList(start, end);
+//
+//		if (stats.isTaxonomySet()) {
+//			InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
+//		} else {
+//			InflectedJOSCount.calculateForAll(subCorpus, stats, null);
+//		}
+//	}
+//
+//	@Override
+//	protected void compute() {
+//		int subCorpusSize = end - start;
+//
+//		if (subCorpusSize < ACCEPTABLE_SIZE) {
+//			computeDirectly();
+//		} else {
+//			int mid = start + subCorpusSize / 2;
+//			ForkJoin left = new ForkJoin(corpus, start, mid, stats);
+//			ForkJoin right = new ForkJoin(corpus, mid, end, stats);
+//
+//			// fork (push to queue)-> compute -> join
+//			left.fork();
+//			right.fork();
+//			left.join();
+//			right.join();
+//		}
+//	}
+//}
--- a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java
+++ b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java
@@ -1,170 +1,170 @@
-package alg.inflectedJOS;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.commons.lang3.StringUtils;
-
-import alg.Common;
-import data.Sentence;
-import data.Statistics;
-import data.StatisticsNew;
-import data.Word;
-
-public class InflectedJOSCount {
-
-	public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
-
-	// static {
-	// 	// calculate all possible combinations of indices we will substitute with a '-' for substring statistics
-	// 	indices = new HashMap<>();
-	// 	for (int i = 5; i <= 8; i++) {
-	// 		indices.put(i, calculateCombinations(i));
-	// 	}
-	// }
-	//
-	// private static List<Integer> calculateCombinations(int i) {
-	// 	int arr[] = {1, 2, 3, 4, 5};
-	// 	int r = 3;
-	// 	int n = arr.length;
-	// 	ArrayList<ArrayList<Integer>> result = new ArrayList<>();
-	//
-	// 	return printCombination(arr, n, r);
-	// }
-	//
-	// /* arr[]  ---> Input Array
-	// data[] ---> Temporary array to store current combination
-	// start & end ---> Staring and Ending indexes in arr[]
-	// index  ---> Current index in data[]
-	// r ---> Size of a combination to be printed */
-	// static void combinationUtil(int arr[], int data[], int start,
-	// 							int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
-	// 	// Current combination is ready to be printed, print it
-	// 	ArrayList<Integer> tmpResult = new ArrayList<>();
-	//
-	// 	if (index == r) {
-	// 		ArrayList<Integer> tmpResult = new ArrayList<>();
-	// 		for (int j = 0; j < r; j++)
-	// 			System.out.print(data[j] + " ");
-	// 		System.out.println("");
-	// 		return;
-	// 	}
-	//
-	// 	// replace index with all possible elements. The condition
-	// 	// "end-i+1 >= r-index" makes sure that including one element
-	// 	// at index will make a combination with remaining elements
-	// 	// at remaining positions
-	// 	for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
-	// 		data[index] = arr[i];
-	// 		combinationUtil(arr, data, i + 1, end, index + 1, r);
-	// 	}
-	// }
-	//
-	// // The main function that prints all combinations of size r
-	// // in arr[] of size n. This function mainly uses combinationUtil()
-	// static void printCombination(int arr[], int n, int r) {
-	// 	// A temporary array to store all combination one by one
-	// 	int data[] = new int[r];
-	//
-	// 	// Print all combination using temprary array 'data[]'
-	// 	combinationUtil(arr, data, 0, n - 1, 0, r);
-	// }
-
-	// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
-	// 	for (Sentence s : corpus) {
-	// 		// disregard if wrong taxonomy
-	// 		if (!(s.getTaxonomy().startsWith(taxonomy))) {
-	// 			continue;
-	// 		}
-	//
-	// 		calculateCommon(s, stats.result);
-	//
-	// 		for (Word word : s.getWords()) {
-	// 			// skip if current word is not inflected
-	// 			if (!(word.getMsd().length() > 0)) {
-	// 				continue;
-	// 			}
-	//
-	// 			String msd = word.getMsd();
-	//
-	// 			StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-	//
-	// 			for (int i = 1; i < msd.length(); i++) {
-	// 				entry.setCharAt(i, msd.charAt(i));
-	// 				Common.updateMap(stats.result, entry.toString());
-	// 				entry.setCharAt(i, '-');
-	// 			}
-	// 		}
-	// 	}
-	// }
-
-	// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
-	// 	for (Sentence s : corpus) {
-	// 		for (Word word : s.getWords()) {
-	// 			if (!(word.getMsd().length() > 0)) {
-	// 				continue;
-	// 			}
-	//
-	// 			String msd = word.getMsd();
-	//
-	// 			StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-	//
-	// 			for (int i = 1; i < msd.length(); i++) {
-	// 				entry.setCharAt(i, msd.charAt(i));
-	// 				Common.updateMap(stats.result, entry.toString());
-	// 				entry.setCharAt(i, '-');
-	// 			}
-	// 		}
-	// 	}
-	// }
-
-	static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
-		for (Sentence s : corpus) {
-			// disregard if wrong taxonomy
-//			if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
-//				continue;
+//package alg.inflectedJOS;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//
+//import org.apache.commons.lang3.StringUtils;
+//
+//import alg.Common;
+//import data.Sentence;
+//import data.Statistics;
+//import data.StatisticsNew;
+//import data.Word;
+//
+//public class InflectedJOSCount {
+//
+//	public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
+//
+//	// static {
+//	// 	// calculate all possible combinations of indices we will substitute with a '-' for substring statistics
+//	// 	indices = new HashMap<>();
+//	// 	for (int i = 5; i <= 8; i++) {
+//	// 		indices.put(i, calculateCombinations(i));
+//	// 	}
+//	// }
+//	//
+//	// private static List<Integer> calculateCombinations(int i) {
+//	// 	int arr[] = {1, 2, 3, 4, 5};
+//	// 	int r = 3;
+//	// 	int n = arr.length;
+//	// 	ArrayList<ArrayList<Integer>> result = new ArrayList<>();
+//	//
+//	// 	return printCombination(arr, n, r);
+//	// }
+//	//
+//	// /* arr[]  ---> Input Array
+//	// data[] ---> Temporary array to store current combination
+//	// start & end ---> Staring and Ending indexes in arr[]
+//	// index  ---> Current index in data[]
+//	// r ---> Size of a combination to be printed */
+//	// static void combinationUtil(int arr[], int data[], int start,
+//	// 							int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
+//	// 	// Current combination is ready to be printed, print it
+//	// 	ArrayList<Integer> tmpResult = new ArrayList<>();
+//	//
+//	// 	if (index == r) {
+//	// 		ArrayList<Integer> tmpResult = new ArrayList<>();
+//	// 		for (int j = 0; j < r; j++)
+//	// 			System.out.print(data[j] + " ");
+//	// 		System.out.println("");
+//	// 		return;
+//	// 	}
+//	//
+//	// 	// replace index with all possible elements. The condition
+//	// 	// "end-i+1 >= r-index" makes sure that including one element
+//	// 	// at index will make a combination with remaining elements
+//	// 	// at remaining positions
+//	// 	for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
+//	// 		data[index] = arr[i];
+//	// 		combinationUtil(arr, data, i + 1, end, index + 1, r);
+//	// 	}
+//	// }
+//	//
+//	// // The main function that prints all combinations of size r
+//	// // in arr[] of size n. This function mainly uses combinationUtil()
+//	// static void printCombination(int arr[], int n, int r) {
+//	// 	// A temporary array to store all combination one by one
+//	// 	int data[] = new int[r];
+//	//
+//	// 	// Print all combination using temprary array 'data[]'
+//	// 	combinationUtil(arr, data, 0, n - 1, 0, r);
+//	// }
+//
+//	// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//	// 	for (Sentence s : corpus) {
+//	// 		// disregard if wrong taxonomy
+//	// 		if (!(s.getTaxonomy().startsWith(taxonomy))) {
+//	// 			continue;
+//	// 		}
+//	//
+//	// 		calculateCommon(s, stats.result);
+//	//
+//	// 		for (Word word : s.getWords()) {
+//	// 			// skip if current word is not inflected
+//	// 			if (!(word.getMsd().length() > 0)) {
+//	// 				continue;
+//	// 			}
+//	//
+//	// 			String msd = word.getMsd();
+//	//
+//	// 			StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//	//
+//	// 			for (int i = 1; i < msd.length(); i++) {
+//	// 				entry.setCharAt(i, msd.charAt(i));
+//	// 				Common.updateMap(stats.result, entry.toString());
+//	// 				entry.setCharAt(i, '-');
+//	// 			}
+//	// 		}
+//	// 	}
+//	// }
+//
+//	// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
+//	// 	for (Sentence s : corpus) {
+//	// 		for (Word word : s.getWords()) {
+//	// 			if (!(word.getMsd().length() > 0)) {
+//	// 				continue;
+//	// 			}
+//	//
+//	// 			String msd = word.getMsd();
+//	//
+//	// 			StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//	//
+//	// 			for (int i = 1; i < msd.length(); i++) {
+//	// 				entry.setCharAt(i, msd.charAt(i));
+//	// 				Common.updateMap(stats.result, entry.toString());
+//	// 				entry.setCharAt(i, '-');
+//	// 			}
+//	// 		}
+//	// 	}
+//	// }
+//
+//	static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
+//		for (Sentence s : corpus) {
+//			// disregard if wrong taxonomy
+////			if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
+////				continue;
+////			}
+//
+//			for (Word word : s.getWords()) {
+//				// skip if current word is not inflected
+//				if (!(word.getMsd().length() > 0)) {
+//					continue;
+//				}
+//
+//				String msd = word.getMsd();
+//
+//				StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//				for (int i = 1; i < msd.length(); i++) {
+//					entry.setCharAt(i, msd.charAt(i));
+//					Common.updateMap(stats.result, entry.toString());
+//					entry.setCharAt(i, '-');
+//				}
 //			}
-
-			for (Word word : s.getWords()) {
-				// skip if current word is not inflected
-				if (!(word.getMsd().length() > 0)) {
-					continue;
-				}
-
-				String msd = word.getMsd();
-
-				StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-
-				for (int i = 1; i < msd.length(); i++) {
-					entry.setCharAt(i, msd.charAt(i));
-					Common.updateMap(stats.result, entry.toString());
-					entry.setCharAt(i, '-');
-				}
-			}
-		}
-	}
-
-	public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
-		for (Sentence s : corpus) {
-
-			for (Word word : s.getWords()) {
-				// skip if current word is not inflected
-				// // TODO: if has defined msd and is of correct type (create a set)
-				// if (!(word.getMsd().length() > 0)) {
-				// 	continue;
-				// }
-
-				String msd = word.getMsd();
-
-				StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
-
-				for (int i = 1; i < msd.length(); i++) {
-					entry.setCharAt(i, msd.charAt(i));
-					stats.updateResults(entry.toString());
-					entry.setCharAt(i, '-');
-				}
-			}
-		}
-	}
-}
+//		}
+//	}
+//
+//	public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
+//		for (Sentence s : corpus) {
+//
+//			for (Word word : s.getWords()) {
+//				// skip if current word is not inflected
+//				// // TODO: if has defined msd and is of correct type (create a set)
+//				// if (!(word.getMsd().length() > 0)) {
+//				// 	continue;
+//				// }
+//
+//				String msd = word.getMsd();
+//
+//				StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
+//
+//				for (int i = 1; i < msd.length(); i++) {
+//					entry.setCharAt(i, msd.charAt(i));
+//					stats.updateResults(entry.toString());
+//					entry.setCharAt(i, '-');
+//				}
+//			}
+//		}
+//	}
+//}
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -43,12 +43,12 @@ public class Ngrams {
 				List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());

 				// if msd regex is set and this candidate doesn't pass it, skip this iteration
-				if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
+				if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
 					continue;
 				}

 				// generate proper MultipleHMKeys depending on filter data
-				String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
+				String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

 				// if last letter is ',' erase it

@@ -67,14 +67,14 @@ public class Ngrams {
 						multipleKeys = new MultipleHMKeys1(key);
 						break;
 					case 1:
-						String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
+						String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
 //						if (stats.getFilter().getNotePunctuations())
 //							k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
 						multipleKeys = new MultipleHMKeys2(key, k1_2);
 						break;
 					case 2:
-						String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
-						String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
+						String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+						String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
 //						if (stats.getFilter().getNotePunctuations()) {
 //							k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
 //							k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -82,9 +82,9 @@ public class Ngrams {
 						multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
 						break;
 					case 3:
-						String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
-						String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
-						String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
+						String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+						String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+						String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
 //						if (stats.getFilter().getNotePunctuations()) {
 //							k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
 //							k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -93,10 +93,10 @@ public class Ngrams {
 						multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
 						break;
 					case 4:
-						String k4_2 = wordToString(ngramCandidate, otherKeys.get(0));
-						String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
-						String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
-						String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
+						String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+						String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+						String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+						String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
 //						if (stats.getFilter().getNotePunctuations()) {
 //							k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
 //							k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
@@ -137,7 +137,7 @@ public class Ngrams {
 	/**
 	 * Checks whether an ngram candidate passes specified regex filter.
 	 */
-	private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
+	private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
 		if (ngramCandidate.size() != regex.size()) {
 			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
 			return false;
@@ -145,7 +145,7 @@ public class Ngrams {

 		for (int i = 0; i < regex.size(); i++) {
 			//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-			if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) {
+			if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
 				return false;
 			}
 		}
@@ -153,33 +153,33 @@ public class Ngrams {
 		return true;
 	}

-	private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
+	private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor, ArrayList<CalculateFor> wordParts) {
 		ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());

 		switch (calculateFor) {
 			case LEMMA:
 				candidate.addAll(ngramCandidate
 						.stream()
-						.map(Word::getLemma)
+						.map(w -> w.getLemma(wordParts))
 						.collect(Collectors.toList()));
 				return StringUtils.join(candidate, " ");
 			case WORD:
 				candidate.addAll(ngramCandidate
 						.stream()
-						.map(Word::getWord)
+						.map(w -> w.getWord(wordParts))
 						.collect(Collectors.toList()));
 				return StringUtils.join(candidate, " ");
 			case MORPHOSYNTACTIC_SPECS:
 			case MORPHOSYNTACTIC_PROPERTY:
 				candidate.addAll(ngramCandidate
 						.stream()
-						.map(Word::getMsd)
+						.map(w -> w.getMsd(wordParts))
 						.collect(Collectors.toList()));
 				return StringUtils.join(candidate, " ");
 			case WORD_TYPE:
 				candidate.addAll(ngramCandidate
 						.stream()
-						.map(w -> Character.toString(w.getMsd().charAt(0)))
+						.map(w -> Character.toString(w.getMsd(wordParts).charAt(0)))
 						.collect(Collectors.toList()));
 //				candidate.addAll(ngramCandidate
 //						.stream()
@@ -190,7 +190,7 @@ public class Ngrams {
 			case NORMALIZED_WORD:
 				candidate.addAll(ngramCandidate
 						.stream()
-						.map(Word::getNormalizedWord)
+						.map(w -> w.getNormalizedWord(wordParts))
 						.collect(Collectors.toList()));
 				return StringUtils.join(candidate, " ");
 		}
@@ -208,14 +208,14 @@ public class Ngrams {
 		for (Sentence s : corpus) {
 			for (Word w : s.getWords()) {
 				List<String> taxonomy = s.getTaxonomy();
-				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
+				String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());

 				// skip this iteration if:
 				// - word doesn't contain a proper version (missing lemma for example)
 				// - msd regex is given but this word's msd doesn't match it, skip this iteration
 				// - given substring length is larger than the word length
 				if (ValidationUtil.isEmpty(word)
-						|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
+						|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
 						|| word.length() < stats.getFilter().getStringLength()) {
 					continue;
 				}
@@ -331,7 +331,7 @@ public class Ngrams {

 	private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
 		// count if no regex is set or if it is & candidate passes it
-		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
+		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
 //		    String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
 //            key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
 //			stats.updateTaxonomyResults(new MultipleHMKeys1(key),
@@ -340,7 +340,7 @@ public class Ngrams {

 			ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();

-			String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
+			String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts());

 			// if last letter is ',' erase it

@@ -359,14 +359,14 @@ public class Ngrams {
 					multipleKeys = new MultipleHMKeys1(key);
 					break;
 				case 1:
-					String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
+					String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
 //					if (stats.getFilter().getNotePunctuations())
 //						k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
 					multipleKeys = new MultipleHMKeys2(key, k1_2);
 					break;
 				case 2:
-					String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-					String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
+					String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+					String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
 //					if (stats.getFilter().getNotePunctuations()) {
 //						k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
 //						k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
@@ -374,9 +374,9 @@ public class Ngrams {
 					multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
 					break;
 				case 3:
-					String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-					String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-					String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
+					String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+					String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+					String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
 //					if (stats.getFilter().getNotePunctuations()) {
 //						k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
 //						k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
@@ -385,10 +385,10 @@ public class Ngrams {
 					multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
 					break;
 				case 4:
-					String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0));
-					String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
-					String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
-					String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
+					String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts());
+					String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts());
+					String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts());
+					String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts());
 //					if (stats.getFilter().getNotePunctuations()) {
 //						k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
 //						k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
--- a/src/main/java/alg/word/WordCount.java
+++ b/src/main/java/alg/word/WordCount.java
@@ -10,84 +10,84 @@ import data.Sentence;
 import data.Statistics;
 import data.Word;

-class WordCount {
-	private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
-		for (Sentence s : corpus) {
-			List<String> sentence = new ArrayList<>(s.getWords().size());
-
-			if (stats.getCf() == CalculateFor.LEMMA) {
-				sentence.addAll(s.getWords()
-						.stream()
-						.map(Word::getLemma)
-						.collect(Collectors.toList()));
-			} else if (stats.getCf() == CalculateFor.WORD) {
-				sentence.addAll(s.getWords()
-						.stream()
-						.map(Word::getWord)
-						.collect(Collectors.toList()));
-			}
-
-			for (String word : sentence) {
-				Common.updateMap(stats.result, word);
-			}
-		}
-	}
-
-	private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
-		for (Sentence s : corpus) {
-			List<String> sentence = new ArrayList<>(s.getWords().size());
-
-			if (stats.getCf() == CalculateFor.LEMMA) {
-				sentence.addAll(s.getWords()
-						.stream()
-						.map(Word::getCVVLemma)
-						.collect(Collectors.toList()));
-			} else if (stats.getCf() == CalculateFor.WORD) {
-				sentence.addAll(s.getWords()
-						.stream()
-						.map(Word::getCVVWord)
-						.collect(Collectors.toList()));
-			}
-
-			for (String word : sentence) {
-				if (word.length() > stats.getSubstringLength()) {
-					for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
-						String substring = word.substring(i, i + stats.getSubstringLength());
-						Common.updateMap(stats.result, substring);
-					}
-				}
-			}
-		}
-	}
-
-	private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
-		for (Sentence s : corpus) {
-			List<String> sentence = new ArrayList<>(s.getWords().size());
-			List<Word> filteredWords = new ArrayList<>();
-
-			for (Word word : s.getWords()) {
-				if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
-					filteredWords.add(word);
-				}
-			}
-
-			if (stats.getCf() == CalculateFor.LEMMA) {
-				sentence.addAll(filteredWords
-						.stream()
-						.map(Word::getLemma)
-						.collect(Collectors.toList()));
-			} else if (stats.getCf() == CalculateFor.WORD) {
-				sentence.addAll(filteredWords
-						.stream()
-						.map(Word::getWord)
-						.collect(Collectors.toList()));
-			}
-
-			for (String word : sentence) {
-				Common.updateMap(stats.result, word);
-			}
-		}
-	}
+//class WordCount {
+//	private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
+//		for (Sentence s : corpus) {
+//			List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//			if (stats.getCf() == CalculateFor.LEMMA) {
+//				sentence.addAll(s.getWords()
+//						.stream()
+//						.map(Word::getLemma)
+//						.collect(Collectors.toList()));
+//			} else if (stats.getCf() == CalculateFor.WORD) {
+//				sentence.addAll(s.getWords()
+//						.stream()
+//						.map(Word::getWord)
+//						.collect(Collectors.toList()));
+//			}
+//
+//			for (String word : sentence) {
+//				Common.updateMap(stats.result, word);
+//			}
+//		}
+//	}
+//
+//	private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
+//		for (Sentence s : corpus) {
+//			List<String> sentence = new ArrayList<>(s.getWords().size());
+//
+//			if (stats.getCf() == CalculateFor.LEMMA) {
+//				sentence.addAll(s.getWords()
+//						.stream()
+//						.map(Word::getCVVLemma)
+//						.collect(Collectors.toList()));
+//			} else if (stats.getCf() == CalculateFor.WORD) {
+//				sentence.addAll(s.getWords()
+//						.stream()
+//						.map(Word::getCVVWord)
+//						.collect(Collectors.toList()));
+//			}
+//
+//			for (String word : sentence) {
+//				if (word.length() > stats.getSubstringLength()) {
+//					for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
+//						String substring = word.substring(i, i + stats.getSubstringLength());
+//						Common.updateMap(stats.result, substring);
+//					}
+//				}
+//			}
+//		}
+//	}
+//
+//	private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
+//		for (Sentence s : corpus) {
+//			List<String> sentence = new ArrayList<>(s.getWords().size());
+//			List<Word> filteredWords = new ArrayList<>();
+//
+//			for (Word word : s.getWords()) {
+//				if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
+//					filteredWords.add(word);
+//				}
+//			}
+//
+//			if (stats.getCf() == CalculateFor.LEMMA) {
+//				sentence.addAll(filteredWords
+//						.stream()
+//						.map(Word::getLemma)
+//						.collect(Collectors.toList()));
+//			} else if (stats.getCf() == CalculateFor.WORD) {
+//				sentence.addAll(filteredWords
+//						.stream()
+//						.map(Word::getWord)
+//						.collect(Collectors.toList()));
+//			}
+//
+//			for (String word : sentence) {
+//				Common.updateMap(stats.result, word);
+//			}
+//		}
+//	}

 //	private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
 //		for (Sentence s : corpus) {
@@ -164,4 +164,4 @@ class WordCount {
 //			}
 //		}
 //	}
-}
+//}
--- a/src/main/java/alg/word/WordLevel.java
+++ b/src/main/java/alg/word/WordLevel.java
@@ -34,8 +34,8 @@ public class WordLevel {
 	public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
 		for (Sentence s : corpus) {
 			for (Word word : s.getWords()) {
-				calculateForSuffixes(word.getWord(), stats);
-				calculateForPrefixes(word.getWord(), stats);
+				calculateForSuffixes(word.getWord(stats.getFilter().getWordParts()), stats);
+				calculateForPrefixes(word.getWord(stats.getFilter().getWordParts()), stats);
 			}
 		}
 	}