Added new ssj500k reading option. Fixed GOS taxonomy

2018-09-03 13:31:41 +02:00 · 2018-09-03 13:31:41 +02:00 · 1d9e9b7ed6
commit 1d9e9b7ed6
parent 426a9ccc46
9 changed files with 280 additions and 40 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@ -52,7 +52,9 @@ public class XML_processing {
 			readXMLGos(path, stats);
 		} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
 			readXMLSolar(path, stats);
-		}
+		} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
+            readXMLSSJ500K(path, stats);
+        }
 	}

 	/**
@ -91,6 +93,50 @@ public class XML_processing {
 		return "";
 	}

+	/**
+	 * Reads the value of an attribute on the first element named {@code tag}
+	 * (compared case-insensitively), e.g. the {@code base} attribute of {@code body},
+	 * which is used for recognising an ssj500k corpus.
+	 * Notice: only the first occurrence of the given tag name is inspected.
+	 *
+	 * @param path      path of the XML file to read
+	 * @param tag       local name of the element to look for
+	 * @param attribute name of the attribute whose value is wanted
+	 * @return the attribute value (for {@code base} with its last 12 characters removed);
+	 *         if the element lacks the attribute, the text that directly follows the
+	 *         start tag; an empty string when nothing is found or the file cannot be read
+	 */
+	public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
+		// NOTE(review): the 12 trailing characters cut from "base" presumably are a file-name
+		// suffix such as "-sl.body.xml" - confirm against the corpus files.
+		final int baseSuffixLength = 12;
+		XMLInputFactory factory = XMLInputFactory.newInstance();
+
+		// XMLEventReader.close() does not close the underlying stream, so the stream is
+		// managed by try-with-resources and the reader is closed separately.
+		try (FileInputStream input = new FileInputStream(path)) {
+			XMLEventReader eventReader = factory.createXMLEventReader(input);
+			try {
+				while (eventReader.hasNext()) {
+					XMLEvent xmlEvent = eventReader.nextEvent();
+					if (!xmlEvent.isStartElement()) {
+						continue;
+					}
+
+					StartElement startElement = xmlEvent.asStartElement();
+					if (!startElement.getName().getLocalPart().equalsIgnoreCase(tag)) {
+						continue;
+					}
+
+					HashMap<String, String> att = extractAttributes(startElement);
+					String value = att.get(attribute);
+
+					if (value != null) {
+						// keep the historical trimming of "base", but never index past a short value
+						if ("base".equals(attribute) && value.length() >= baseSuffixLength) {
+							return value.substring(0, value.length() - baseSuffixLength);
+						}
+						return value;
+					}
+
+					// attribute is absent: fall back to the text right after the start tag, if any
+					if (!eventReader.hasNext()) {
+						return "";
+					}
+					XMLEvent next = eventReader.nextEvent();
+					return next.isCharacters() ? next.asCharacters().getData() : "";
+				}
+			} finally {
+				try {
+					eventReader.close();
+				} catch (XMLStreamException e) {
+					logger.error("closing stream", e);
+				}
+			}
+		} catch (java.io.IOException | XMLStreamException e) {
+			logger.error("reading header attribute", e);
+		}
+		return "";
+	}
+
+
 	private static void fj(List<Sentence> corpus, StatisticsNew stats) {
 		ForkJoinPool pool = new ForkJoinPool();

@ -403,7 +449,9 @@ public class XML_processing {

 			// init results now to avoid null pointers
 			headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
-		} else {
+		} else if (corpusType == CorpusType.SSJ500K) {
+            headTagName = "bibl";
+        } else {
 			headTagName = "teiHeader";
 		}

@ -437,7 +485,13 @@ public class XML_processing {
 									.replace("#", "");

 							resultTaxonomy.add(tax);
-						} else if (!parseTaxonomy && headTags.contains(elementName)) {
+						} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
+                            String tax = startElement.getAttributeByName(QName.valueOf("ref"))
+                                    .getValue()
+                                    .replace("#", "");
+
+                            resultTaxonomy.add(tax);
+                        } else if (!parseTaxonomy && headTags.contains(elementName)) {
 							String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
 							resultFilters.get(elementName).add(tagContent);
 						}
@ -646,6 +700,138 @@ public class XML_processing {
 		return true;
 	}

+    /**
+     * Reads an ssj500k corpus file event by event and hands the collected sentences to
+     * {@code fj} for counting.
+     * <p>
+     * {@code w} elements become words (lemma from the {@code lemma} attribute, MSD from the
+     * {@code ana} attribute without its first four characters, expected to be {@code msd:}).
+     * {@code pc} elements become punctuation tokens only when the filter asks for n-grams
+     * longer than one with punctuation noted and the sentence already holds a token.
+     * {@code term} elements carrying a {@code ref} attribute supply the taxonomy attached to
+     * every following sentence until the enclosing {@code div} ends.
+     *
+     * @param path  path of the corpus file
+     * @param stats statistics object that supplies the filter and receives the results
+     * @return always {@code true}; read errors are logged, not propagated
+     */
+    @SuppressWarnings("Duplicates")
+    public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
+        boolean inWord = false;
+        boolean inPunctuation = false;
+        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+        String lemma = "";
+        String msd = "";
+
+        List<Word> sentence = new ArrayList<>();
+        List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
+        String sentenceDelimiter = "s";
+
+        // XMLEventReader.close() leaves the underlying stream open, so the stream itself
+        // is closed by try-with-resources.
+        try (FileInputStream input = new FileInputStream(path)) {
+            XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(input);
+
+            try {
+                while (eventReader.hasNext()) {
+                    XMLEvent event = eventReader.nextEvent();
+
+                    switch (event.getEventType()) {
+                        case XMLStreamConstants.START_ELEMENT:
+                            StartElement startElement = event.asStartElement();
+                            String qName = startElement.getName().getLocalPart();
+
+                            // "word" node
+                            if (qName.equals("w")) {
+                                inWord = true;
+                                Attribute anaAttribute = startElement.getAttributeByName(QName.valueOf("ana"));
+                                Attribute lemmaAttribute = startElement.getAttributeByName(QName.valueOf("lemma"));
+                                String ana = anaAttribute == null ? "" : String.valueOf(anaAttribute.getValue());
+
+                                if (!ana.startsWith("msd:")) {
+                                    logger.warn("MSD written incorrectly");
+                                }
+                                // the first four characters are dropped even when the prefix is wrong;
+                                // a missing or too short value yields an empty MSD instead of a crash
+                                msd = ana.length() > 4 ? ana.substring(4) : "";
+                                lemma = lemmaAttribute == null ? "" : String.valueOf(lemmaAttribute.getValue());
+                            }
+
+                            else if (qName.equals("pc")) {
+                                inPunctuation = true;
+                            }
+
+                            // taxonomy node
+                            else if (qName.equalsIgnoreCase("term")) {
+                                // there are some term nodes at the beginning that are of no interest to us
+                                // they differ by not having the attribute "ref", so tax will equal null
+                                Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
+
+                                if (tax != null) {
+                                    // keep only taxonomy properties
+                                    String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+                                    Tax taxonomy = new Tax();
+                                    currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
+                                }
+                            }
+                            break;
+
+                        case XMLStreamConstants.CHARACTERS:
+                            Characters characters = event.asCharacters();
+
+                            // "word" node value
+                            if (inWord) {
+                                String word = characters.getData();
+                                sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
+                                inWord = false;
+                            }
+                            // punctuation node value; the flag is cleared even when the token is not
+                            // stored, otherwise the text of a later word would be added once more
+                            // as a punctuation token
+                            else if (inPunctuation) {
+                                if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && sentence.size() > 0) {
+                                    String punctuation = characters.getData();
+                                    sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
+                                }
+                                inPunctuation = false;
+                            }
+                            break;
+
+                        case XMLStreamConstants.END_ELEMENT:
+                            String endName = event.asEndElement().getName().getLocalPart();
+
+                            // an element without any text must not leave its flag set
+                            if (endName.equals("w")) {
+                                inWord = false;
+                            } else if (endName.equals("pc")) {
+                                inPunctuation = false;
+                            }
+                            // parser reached end of the current sentence
+                            else if (endName.equals(sentenceDelimiter)) {
+                                // add sentence to corpus if it passes filters
+                                sentence = runFilters(sentence, stats.getFilter());
+
+                                if (!ValidationUtil.isEmpty(sentence)) {
+                                    corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+                                }
+
+                                // and start a new one
+                                sentence = new ArrayList<>();
+
+                                /* Invoke Fork-Join when we reach maximum limit of
+                                 * sentences (because we can't read everything to
+                                 * memory) or we reach the end of the file.
+                                 */
+                                if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
+                                    fj(corpus, stats);
+                                    // empty the current corpus, since we don't need the data anymore
+                                    corpus.clear();
+
+                                    // TODO: if (stats.isUseDB()) {
+                                    // 	stats.storeTmpResultsToDB();
+                                    // }
+                                }
+                            }
+                            // fallback
+                            else if (endName.equalsIgnoreCase("div")) {
+                                // join corpus and stats
+                                fj(corpus, stats);
+                                corpus.clear();
+
+                                // taxonomy belongs to the div that just ended
+                                currentFiletaxonomyLong = new ArrayList<>();
+                            }
+
+                            break;
+                    }
+                }
+
+                // sentences that were not followed by a closing div would otherwise be lost
+                if (!corpus.isEmpty()) {
+                    fj(corpus, stats);
+                    corpus.clear();
+                }
+            } finally {
+                try {
+                    eventReader.close();
+                } catch (XMLStreamException e) {
+                    logger.error("closing stream", e);
+                }
+            }
+        } catch (java.io.IOException | XMLStreamException e) {
+            logger.error("reading ssj500k corpus file", e);
+        }
+
+        return true;
+    }
+
+
 	@SuppressWarnings("Duplicates")
 	public static boolean readXMLGos(String path, StatisticsNew stats) {
 		boolean inWord = false;
@ -853,6 +1039,9 @@ public class XML_processing {
 						else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
 							fj(corpus, stats);
 							corpus.clear();
+
+                            currentFiletaxonomy = new ArrayList<>();
+                            currentFiletaxonomyLong = new ArrayList<>();
 						}

 						break;
@ -914,7 +1103,7 @@ public class XML_processing {
 		return atts;
 	}

-	private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
+	public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
 		List<String> wString = new ArrayList<>();
 		if (f.getWordParts().contains(CalculateFor.WORD))
 			wString.add(word);
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;

 import gui.ValidationUtil;

+import static alg.XML_processing.createWord;
+
 public class Ngrams {
 	public final static Logger logger = LogManager.getLogger(Ngrams.class);

@ -138,16 +140,22 @@ public class Ngrams {
 	 * Checks whether an ngram candidate passes specified regex filter.
 	 */
 	private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
-		if (ngramCandidate.size() != regex.size()) {
-			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
-			return false;
-		}
+//		if (ngramCandidate.size() != regex.size()) {
+//			logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
+//			return false;
+//		}

-		for (int i = 0; i < regex.size(); i++) {
+		int j = 0;
+		for (int i = 0; i < ngramCandidate.size(); i++) {
+		    String msd = ngramCandidate.get(i).getMsd(wordParts);
+		    if (msd.equals("*")){
+		        continue;
+            }
 			//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-			if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
+			if (!msd.matches(regex.get(j).pattern() + ".*")) {
 				return false;
 			}
+			j ++;
 		}

 		return true;
@ -270,6 +278,7 @@ public class Ngrams {
 		ArrayList<Word> currentLoop;
 		int ngram = stats.getFilter().getNgramValue();
 		int skip = stats.getFilter().getSkipValue();
+		Word w = createWord("*", "*", "*", "*", stats.getFilter());

 		for (Sentence s : corpus) {
 			List<Word> sentence = s.getWords();
@ -283,7 +292,8 @@ public class Ngrams {
 					if (ngram == 2 && j < sentence.size()) {
 						currentLoop = new ArrayList<>();
 //						currentLoop.add(sentence.get(i));
-						currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
+						currentLoop.add(sentence.get(i));
+						fillSkipgrams(currentLoop, i, j, w);
 						currentLoop.add(sentence.get(j));

 						validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -291,8 +301,10 @@ public class Ngrams {
 						for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
 							if (ngram == 3 && k < sentence.size()) {
 								currentLoop = new ArrayList<>();
-								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-								currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
+								currentLoop.add(sentence.get(i));
+								fillSkipgrams(currentLoop, i, j, w);
+								currentLoop.add(sentence.get(j));
+								fillSkipgrams(currentLoop, j, k, w);
 								currentLoop.add(sentence.get(k));

 								validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -300,9 +312,12 @@ public class Ngrams {
 								for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
 									if (ngram == 4 && l < sentence.size()) {
 										currentLoop = new ArrayList<>();
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-										currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
+										currentLoop.add(sentence.get(i));
+										fillSkipgrams(currentLoop, i, j, w);
+										currentLoop.add(sentence.get(j));
+										fillSkipgrams(currentLoop, j, k, w);
+										currentLoop.add(sentence.get(k));
+										fillSkipgrams(currentLoop, k, l, w);
 										currentLoop.add(sentence.get(l));

 										validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -310,10 +325,14 @@ public class Ngrams {
 										for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
 											if (ngram == 5 && m < sentence.size()) {
 												currentLoop = new ArrayList<>();
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
-												currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
+												currentLoop.add(sentence.get(i));
+												fillSkipgrams(currentLoop, i, j, w);
+												currentLoop.add(sentence.get(j));
+												fillSkipgrams(currentLoop, j, k, w);
+												currentLoop.add(sentence.get(k));
+												fillSkipgrams(currentLoop, k, l, w);
+												currentLoop.add(sentence.get(l));
+												fillSkipgrams(currentLoop, l, m, w);
 												currentLoop.add(sentence.get(m));

 												validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -329,6 +348,12 @@ public class Ngrams {
 		}
 	}

+	/**
+	 * Appends the placeholder word {@code w} to {@code currentLoop} once for every
+	 * position that lies strictly between the indices {@code i} and {@code j}.
+	 * Nothing is appended when {@code j} is not greater than {@code i + 1}.
+	 */
+	private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
+		int position = i + 1;
+		while (position < j) {
+			currentLoop.add(w);
+			position++;
+		}
+	}
+
 	private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
 		// count if no regex is set or if it is & candidate passes it
 		if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
--- a/src/main/java/data/CorpusType.java
+++ b/src/main/java/data/CorpusType.java
@ -4,7 +4,8 @@ public enum CorpusType {
 	GIGAFIDA("Gigafida", "gigafida"),
 	CCKRES("ccKres ", "cckres"),
 	SOLAR("Šolar", "šolar"),
-	GOS("GOS", "gos");
+	GOS("GOS", "gos"),
+	SSJ500K("ssj500k", "ssj500k");


 	private final String name;
--- a/src/main/java/data/Filter.java
+++ b/src/main/java/data/Filter.java
@ -142,7 +142,7 @@ public class Filter {

 	public void setHasMsd(boolean hasMsd) {
 		filter.put(HAS_MSD, hasMsd);
-		if (hasMsd)
+		if (hasMsd && !((ArrayList<CalculateFor>) filter.get(MULTIPLE_KEYS)).contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
 			addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
 	}

--- a/src/main/java/data/Tax.java
+++ b/src/main/java/data/Tax.java
@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
 public class Tax {
 	private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
 	private static LinkedHashMap<String, String> GOS_TAXONOMY;
-	private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
+	private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));

 	static {
 		// GIGAFIDA ----------------------------
@ -77,6 +77,12 @@ public class Tax {
 		GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
 		GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
 		GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");
+
+		GOS_TAXONOMY.put("gos.K", "gos.K - kanal");
+		GOS_TAXONOMY.put("gos.K.O", "gos.K.O - kanal-osebni stik");
+		GOS_TAXONOMY.put("gos.K.P", "gos.K.P - kanal-telefon");
+		GOS_TAXONOMY.put("gos.K.R", "gos.K.R - kanal-radio");
+		GOS_TAXONOMY.put("gos.K.T", "gos.K.T - kanal-televizija");
 	}

 	/**
@ -98,7 +104,7 @@ public class Tax {
 	public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
 		LinkedHashMap<String, String> tax = new LinkedHashMap<>();

-		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
+		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
 			tax = GIGAFIDA_TAXONOMY;
 		} else if (corpusType == CorpusType.GOS) {
 			tax = GOS_TAXONOMY;
--- a/src/main/java/gui/CorpusTab.java
+++ b/src/main/java/gui/CorpusTab.java
@ -244,7 +244,7 @@ public class CorpusTab {

 		logger.info("reading header data for ", corpusType.toString());

-		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
+		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
 			boolean corpusIsSplit = corpusFiles.size() > 1;

 			final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@ -429,6 +429,7 @@ public class CorpusTab {
 		// read first file only, maybe later do all, if toll on resources is acceptable
 		File f = corpusFiles.iterator().next();
 		String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
+		String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
 		String test = CCKRES.getNameLowerCase();
 		String debug = "";

@ -442,6 +443,8 @@ public class CorpusTab {
 			corpusType = CCKRES;
 		} else if (title.contains(GOS.getNameLowerCase())) {
 			corpusType = GOS;
+		} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
+			corpusType = SSJ500K;
 		}

 		if (corpusType == null) {
--- a/src/main/java/gui/OneWordAnalysisTab.java
+++ b/src/main/java/gui/OneWordAnalysisTab.java
@ -415,7 +415,6 @@ public class OneWordAnalysisTab {
        Filter filter = new Filter();
        filter.setNgramValue(1);
        filter.setCalculateFor(calculateFor);
-        filter.setMsd(msd);
        filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        filter.setDisplayTaxonomy(displayTaxonomy);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
@ -424,6 +423,9 @@ public class OneWordAnalysisTab {
        filter.setSolarFilters(solarFiltersMap);
        filter.setStringLength(1);
        filter.setMultipleKeys(alsoVisualize);
+
+        // setMsd must be behind alsoVisualize
+        filter.setMsd(msd);
        filter.setMinimalOccurrences(minimalOccurrences);
        filter.setMinimalTaxonomy(minimalTaxonomy);
        filter.setWriteMsdAtTheEnd(writeMsdAtTheEnd);
--- a/src/main/java/gui/StringAnalysisTabNew2.java
+++ b/src/main/java/gui/StringAnalysisTabNew2.java
@ -522,7 +522,6 @@ public class StringAnalysisTabNew2 {
        Filter filter = new Filter();
        filter.setNgramValue(ngramValue);
        filter.setCalculateFor(calculateFor);
-        filter.setMsd(msd);
        filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        filter.setDisplayTaxonomy(displayTaxonomy);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
@ -531,6 +530,9 @@ public class StringAnalysisTabNew2 {
        filter.setSolarFilters(solarFiltersMap);
        filter.setNotePunctuations(notePunctuations);
        filter.setMultipleKeys(alsoVisualize);
+
+        // setMsd must be behind alsoVisualize
+        filter.setMsd(msd);
        filter.setMinimalOccurrences(minimalOccurrences);
        filter.setMinimalTaxonomy(minimalTaxonomy);

--- a/src/main/java/util/Export.java
+++ b/src/main/java/util/Export.java
@ -12,6 +12,7 @@ import java.util.concurrent.atomic.AtomicLong;
 import data.CalculateFor;
 import data.Filter;
 import data.MultipleHMKeys;
+import gui.ValidationUtil;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import org.apache.commons.csv.QuoteMode;
@ -87,6 +88,9 @@ public class Export {


 		//CSV file header
+		if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
+			FILE_HEADER_AL.add("Izpuščene besede");
+		}
 		FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
 		if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
 			FILE_HEADER_AL.add("Lema male črke");
@ -125,11 +129,9 @@ public class Export {

 //			for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
 		for (CalculateFor otherKey : filter.getMultipleKeys()) {
-			if (num_taxonomy_frequencies.get(otherKey) > 0) {
-				FILE_HEADER_AL.add(otherKey.toHeaderString());
-				if (otherKey.equals(CalculateFor.LEMMA))
-					FILE_HEADER_AL.add("Lema male črke");
-			}
+            FILE_HEADER_AL.add(otherKey.toHeaderString());
+            if (otherKey.equals(CalculateFor.LEMMA))
+                FILE_HEADER_AL.add("Lema male črke");
 		}

 //					if(otherKey.equals(CalculateFor.LEMMA)){
@ -215,9 +217,12 @@ public class Export {

 				for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
 					List dataEntry = new ArrayList<>();
-					dataEntry.add(e.getKey().getK1());
+					if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
+						dataEntry.add(e.getKey().getK1());
+					}
+					dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
                    if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){
-                        dataEntry.add(e.getKey().getK1().toLowerCase());
+                        dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
                    }

                    int i = 0;
@ -225,20 +230,20 @@ public class Export {
                    	switch(i){
 							case 0:
 								if (otherKey.equals(CalculateFor.LEMMA)){
-									dataEntry.add(e.getKey().getK2());
-									dataEntry.add(e.getKey().getK2().toLowerCase());
+									dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
+									dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
 								} else {
-									dataEntry.add(e.getKey().getK2());
+									dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
 								}
 								break;
 							case 1:
-								dataEntry.add(e.getKey().getK3());
+								dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
 								break;
 							case 2:
-								dataEntry.add(e.getKey().getK4());
+								dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
 								break;
 							case 3:
-								dataEntry.add(e.getKey().getK5());
+								dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
 								break;
 						}

@ -330,6 +335,13 @@ public class Export {
 		return fileName;
 	}

+	/**
+	 * Returns {@code s} with every occurrence of {@code "* "} removed when the filter has a
+	 * non-empty skip value greater than zero; otherwise returns {@code s} unchanged.
+	 */
+	private static String eraseSkipgramStars(String s, Filter filter){
+		boolean skipValueSet = !ValidationUtil.isEmpty(filter.getSkipValue());
+		if (skipValueSet && filter.getSkipValue() > 0) {
+			return s.replace("* ", "");
+		}
+		return s;
+	}
+
 	public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
 		//Delimiter used in CSV file
 		String NEW_LINE_SEPARATOR = "\n";