From 1d9e9b7ed62601ddbc7d31a927c31d93bfa073b7 Mon Sep 17 00:00:00 2001
From: Luka
Date: Mon, 3 Sep 2018 13:31:41 +0200
Subject: [PATCH] Added new ssj500k reading option. Fixed GOS taxonomy.

---
 src/main/java/alg/XML_processing.java        | 197 ++++++++++++++++++-
 src/main/java/alg/ngram/Ngrams.java          |  57 ++++--
 src/main/java/data/CorpusType.java           |   3 +-
 src/main/java/data/Filter.java               |   2 +-
 src/main/java/data/Tax.java                  |  10 +-
 src/main/java/gui/CorpusTab.java             |   5 +-
 src/main/java/gui/OneWordAnalysisTab.java    |   4 +-
 src/main/java/gui/StringAnalysisTabNew2.java |   4 +-
 src/main/java/util/Export.java               |  38 ++--
 9 files changed, 280 insertions(+), 40 deletions(-)

diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java
index f57accb..6544613 100755
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -52,7 +52,9 @@ public class XML_processing {
             readXMLGos(path, stats);
         } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
             readXMLSolar(path, stats);
-        }
+        } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
+            readXMLSSJ500K(path, stats);
+        }
     }

     /**
@@ -91,6 +93,50 @@ public class XML_processing {
         return "";
     }

+    /**
+     * Reads and returns the value of the passed header attribute, or an empty string.
+     * E.g. the body's "base" attribute, for discerning whether a corpus is ssj500k.
+     * Note: returns only the value of the first occurrence of the given tag name.
+     */
+    public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
+        XMLInputFactory factory = XMLInputFactory.newInstance();
+        XMLEventReader eventReader = null;
+
+        try {
+            eventReader = factory.createXMLEventReader(new FileInputStream(path));
+            while (eventReader.hasNext()) {
+                XMLEvent xmlEvent = eventReader.nextEvent();
+                if (xmlEvent.isStartElement()) {
+                    StartElement startElement = xmlEvent.asStartElement();
+                    String var = startElement.getName().getLocalPart();
+
+                    if (var.equalsIgnoreCase(tag)) {
+                        HashMap<String, String> att = extractAttributes(startElement);
+
+                        if (att.containsKey("base")) {
+                            return att.get("base").substring(0, att.get("base").length() - 12);
+                        }
+
+                        return eventReader.nextEvent().asCharacters().getData();
+                    }
+                }
+            }
+        } catch (FileNotFoundException | XMLStreamException e) {
+            e.printStackTrace();
+        } finally {
+            if (eventReader != null) {
+                try {
+                    eventReader.close();
+                } catch (XMLStreamException e) {
+                    logger.error("closing stream", e);
+                }
+            }
+        }
+        return "";
+    }
+
     private static void fj(List<Sentence> corpus, StatisticsNew stats) {
         ForkJoinPool pool = new ForkJoinPool();
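Two caveats on the new helper: the `attribute` parameter is currently unused (the method hardcodes the "base" key), and the `length() - 12` call assumes the attribute value carries a fixed 12-character suffix (e.g. a value like "ssj500k-sl.body.xml" minus "-sl.body.xml" yields "ssj500k"). A minimal usage sketch, with a hypothetical file path; `getNameLowerCase()` is the existing CorpusType accessor used elsewhere in this patch:

    // Sketch: detect an ssj500k corpus from its body "base" attribute.
    // "/corpora/ssj500k.xml" is an illustrative path, not a real location.
    String attrib = XML_processing.readXMLHeaderAttribute(
            "/corpora/ssj500k.xml", "body", "base").toLowerCase();
    if (attrib.contains(CorpusType.SSJ500K.getNameLowerCase())) {
        corpusType = CorpusType.SSJ500K;
    }

This mirrors the detection logic added to CorpusTab further down.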
@@ -403,7 +449,9 @@ public class XML_processing {

             // init results now to avoid null pointers
             headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
-        } else {
+        } else if (corpusType == CorpusType.SSJ500K) {
+            headTagName = "bibl";
+        } else {
             headTagName = "teiHeader";
         }

@@ -437,7 +485,13 @@ public class XML_processing {
                         .replace("#", "");

                 resultTaxonomy.add(tax);
-            } else if (!parseTaxonomy && headTags.contains(elementName)) {
+            } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
+                String tax = startElement.getAttributeByName(QName.valueOf("ref"))
+                        .getValue()
+                        .replace("#", "");
+
+                resultTaxonomy.add(tax);
+            } else if (!parseTaxonomy && headTags.contains(elementName)) {
                 String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
                 resultFilters.get(elementName).add(tagContent);
             }
@@ -646,6 +700,138 @@ public class XML_processing {
         return true;
     }

+    @SuppressWarnings("Duplicates")
+    public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
+        boolean inWord = false;
+        boolean inPunctuation = false;
+        ArrayList<String> currentFiletaxonomy = new ArrayList<>();
+        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+        String lemma = "";
+        String msd = "";
+
+        List<Word> sentence = new ArrayList<>();
+        List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
+        String sentenceDelimiter = "s";
+
+        XMLEventReader eventReader = null;
+        try {
+            XMLInputFactory factory = XMLInputFactory.newInstance();
+            eventReader = factory.createXMLEventReader(new FileInputStream(path));
+
+            while (eventReader.hasNext()) {
+                XMLEvent event = eventReader.nextEvent();
+
+                switch (event.getEventType()) {
+                    case XMLStreamConstants.START_ELEMENT:
+                        StartElement startElement = event.asStartElement();
+                        String qName = startElement.getName().getLocalPart();
+
+                        // "word" node
+                        if (qName.equals("w")) {
+                            inWord = true;
+                            if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")) {
+                                System.out.println("MSD written incorrectly");
+                            }
+                            msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
+                            lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
+                        } else if (qName.equals("pc")) {
+                            inPunctuation = true;
+                        }
+                        // taxonomy node
+                        else if (qName.equalsIgnoreCase("term")) {
+                            // there are some term nodes at the beginning that are of no interest to us
+                            // they differ by not having the attribute "ref", so test will equal null
+                            Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
+
+                            if (tax != null) {
+                                // keep only taxonomy properties
+                                String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+                                currentFiletaxonomy.add(currentFiletaxonomyElement);
+                                Tax taxonomy = new Tax();
+                                currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
+                            }
+                        }
+                        break;

+                    case XMLStreamConstants.CHARACTERS:
+                        Characters characters = event.asCharacters();
+
+                        // "word" node value
+                        if (inWord) {
+                            String word = characters.getData();
+                            sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
+                            inWord = false;
+                        }
+                        if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations()
+                                && inPunctuation && sentence.size() > 0) {
+                            String punctuation = characters.getData();
+                            sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
+                            inPunctuation = false;
+                        }
+                        break;
+
+                    case XMLStreamConstants.END_ELEMENT:
+                        EndElement endElement = event.asEndElement();
+
+                        String var = endElement.getName().getLocalPart();
+                        String debug = "";
+
+                        // parser reached end of the current sentence
+                        if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
+                            // add sentence to corpus if it passes filters
+                            sentence = runFilters(sentence, stats.getFilter());
+
+                            if (!ValidationUtil.isEmpty(sentence)) {
+                                corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+                            }
+
+                            // and start a new one
+                            sentence = new ArrayList<>();
+
+                            /* Invoke Fork-Join when we reach maximum limit of
+                             * sentences (because we can't read everything to
+                             * memory) or we reach the end of the file.
+                             */
+                            if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
+                                fj(corpus, stats);
+                                // empty the current corpus, since we don't need the data anymore
+                                corpus.clear();
+
+                                // TODO: if (stats.isUseDB()) {
+                                //     stats.storeTmpResultsToDB();
+                                // }
+                            }
+                        }
+                        // fallback
+                        else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
+                            // join corpus and stats
+                            fj(corpus, stats);
+                            corpus.clear();
+
+                            currentFiletaxonomy = new ArrayList<>();
+                            currentFiletaxonomyLong = new ArrayList<>();
+                        }
+
+                        break;
+                }
+            }
+        } catch (FileNotFoundException | XMLStreamException e) {
+            e.printStackTrace();
+        } finally {
+            if (eventReader != null) {
+                try {
+                    eventReader.close();
+                } catch (XMLStreamException e) {
+                    logger.error("closing stream", e);
+                }
+            }
+        }
+
+        return true;
+    }
+
     @SuppressWarnings("Duplicates")
     public static boolean readXMLGos(String path, StatisticsNew stats) {
         boolean inWord = false;
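The new reader assumes the ssj500k convention that each `w` element's `ana` attribute carries an `msd:` prefix (as in ana="msd:Somei"). A minimal sketch of that extraction, with an illustrative attribute value; `startsWith` is equivalent to the patch's `substring(0, 4).equals("msd:")` check:

    // Illustrative value; real ssj500k files supply this via the "ana" attribute.
    String ana = "msd:Somei";
    if (!ana.startsWith("msd:")) {
        System.out.println("MSD written incorrectly");
    }
    String msd = ana.substring(4);   // -> "Somei"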
@@ -853,6 +1039,9 @@ public class XML_processing {
                     else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
                         fj(corpus, stats);
                         corpus.clear();
+
+                        currentFiletaxonomy = new ArrayList<>();
+                        currentFiletaxonomyLong = new ArrayList<>();
                     }

                     break;
@@ -914,7 +1103,7 @@ public class XML_processing {
         return atts;
     }

-    private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f) {
+    public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f) {
         List<String> wString = new ArrayList<>();
         if (f.getWordParts().contains(CalculateFor.WORD))
             wString.add(word);
diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java
index a973a16..5f45544 100755
--- a/src/main/java/alg/ngram/Ngrams.java
+++ b/src/main/java/alg/ngram/Ngrams.java
@@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;

 import gui.ValidationUtil;

+import static alg.XML_processing.createWord;
+
 public class Ngrams {
     public final static Logger logger = LogManager.getLogger(Ngrams.class);

@@ -138,16 +140,22 @@ public class Ngrams {
      * Checks whether an ngram candidate passes specified regex filter.
      */
     private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
-        if (ngramCandidate.size() != regex.size()) {
-            logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
-            return false;
-        }
+//        if (ngramCandidate.size() != regex.size()) {
+//            logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
+//            return false;
+//        }

-        for (int i = 0; i < regex.size(); i++) {
+        int j = 0;
+        for (int i = 0; i < ngramCandidate.size(); i++) {
+            String msd = ngramCandidate.get(i).getMsd(wordParts);
+            if (msd.equals("*")) {
+                continue;
+            }
             //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
-            if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
+            if (!msd.matches(regex.get(j).pattern() + ".*")) {
                 return false;
             }
+            j++;
         }

         return true;
     }
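The reworked filter walks the whole candidate and skips the "*" placeholder words introduced by the skipgram code below, advancing the regex index only on real words; this is why the old size check no longer holds. A standalone sketch of the same logic, simplified to plain strings (the MSD values are illustrative):

    // Plain strings stand in for Word objects; "*" marks a skipped position.
    String[] msds = { "Sozem", "*", "Ggnste" };
    java.util.regex.Pattern[] regex = {
            java.util.regex.Pattern.compile("So.*"),
            java.util.regex.Pattern.compile("Gg.*")
    };
    int j = 0;
    boolean passes = true;
    for (String msd : msds) {
        if (msd.equals("*")) continue;                       // placeholder consumes no pattern
        if (!msd.matches(regex[j].pattern() + ".*")) {       // same suffix-wildcard match as the patch
            passes = false;
            break;
        }
        j++;
    }
    // passes == true: both real words matched their patterns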
@@ -270,6 +278,7 @@ public class Ngrams {
         ArrayList<Word> currentLoop;
         int ngram = stats.getFilter().getNgramValue();
         int skip = stats.getFilter().getSkipValue();
+        Word w = createWord("*", "*", "*", "*", stats.getFilter());

         for (Sentence s : corpus) {
             List<Word> sentence = s.getWords();
@@ -283,7 +292,8 @@ public class Ngrams {
                 if (ngram == 2 && j < sentence.size()) {
                     currentLoop = new ArrayList<>();
                     // currentLoop.add(sentence.get(i));
-                    currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
+                    currentLoop.add(sentence.get(i));
+                    fillSkipgrams(currentLoop, i, j, w);
                     currentLoop.add(sentence.get(j));

                     validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -291,8 +301,10 @@ public class Ngrams {
                 for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
                     if (ngram == 3 && k < sentence.size()) {
                         currentLoop = new ArrayList<>();
-                        currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-                        currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
+                        currentLoop.add(sentence.get(i));
+                        fillSkipgrams(currentLoop, i, j, w);
+                        currentLoop.add(sentence.get(j));
+                        fillSkipgrams(currentLoop, j, k, w);
                         currentLoop.add(sentence.get(k));

                         validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -300,9 +312,12 @@ public class Ngrams {
                     for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
                         if (ngram == 4 && l < sentence.size()) {
                             currentLoop = new ArrayList<>();
-                            currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-                            currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-                            currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
+                            currentLoop.add(sentence.get(i));
+                            fillSkipgrams(currentLoop, i, j, w);
+                            currentLoop.add(sentence.get(j));
+                            fillSkipgrams(currentLoop, j, k, w);
+                            currentLoop.add(sentence.get(k));
+                            fillSkipgrams(currentLoop, k, l, w);
                             currentLoop.add(sentence.get(l));

                             validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -310,10 +325,14 @@ public class Ngrams {
                         for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
                             if (ngram == 5 && m < sentence.size()) {
                                 currentLoop = new ArrayList<>();
-                                currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
-                                currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
-                                currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
-                                currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
+                                currentLoop.add(sentence.get(i));
+                                fillSkipgrams(currentLoop, i, j, w);
+                                currentLoop.add(sentence.get(j));
+                                fillSkipgrams(currentLoop, j, k, w);
+                                currentLoop.add(sentence.get(k));
+                                fillSkipgrams(currentLoop, k, l, w);
+                                currentLoop.add(sentence.get(l));
+                                fillSkipgrams(currentLoop, l, m, w);
                                 currentLoop.add(sentence.get(m));

                                 validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -329,6 +348,12 @@ public class Ngrams {
         }
     }

+    private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w) {
+        for (int k = i + 1; k < j; k++) {
+            currentLoop.add(w);
+        }
+    }
+
     private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
         // count if no regex is set or if it is & candidate passes it
         if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
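To make the new representation concrete: instead of rewriting punctuation between skipgram members, the loops now record each skipped position explicitly with the shared placeholder Word `w` ("*" in every field), created once per run. A toy sketch of the fill step, with plain strings standing in for Word:

    // For a 2-gram with one skipped token: candidate (a, c) from "a b c d".
    java.util.List<String> sentence = java.util.Arrays.asList("a", "b", "c", "d");
    int i = 0, j = 2;
    java.util.List<String> currentLoop = new java.util.ArrayList<>();
    currentLoop.add(sentence.get(i));
    for (int k = i + 1; k < j; k++) {
        currentLoop.add("*");            // one placeholder per skipped position, as in fillSkipgrams
    }
    currentLoop.add(sentence.get(j));
    // currentLoop -> [a, *, c]

These placeholders are what passesRegex above skips and what Export strips again on output.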
diff --git a/src/main/java/data/CorpusType.java b/src/main/java/data/CorpusType.java
index 2c70385..e8b2db9 100755
--- a/src/main/java/data/CorpusType.java
+++ b/src/main/java/data/CorpusType.java
@@ -4,7 +4,8 @@ public enum CorpusType {
     GIGAFIDA("Gigafida", "gigafida"),
     CCKRES("ccKres ", "cckres"),
     SOLAR("Šolar", "šolar"),
-    GOS("GOS", "gos");
+    GOS("GOS", "gos"),
+    SSJ500K("ssj500k", "ssj500k");

     private final String name;

diff --git a/src/main/java/data/Filter.java b/src/main/java/data/Filter.java
index 6cc3e43..e597b45 100755
--- a/src/main/java/data/Filter.java
+++ b/src/main/java/data/Filter.java
@@ -142,7 +142,7 @@ public class Filter {
     public void setHasMsd(boolean hasMsd) {
         filter.put(HAS_MSD, hasMsd);

-        if (hasMsd)
+        if (hasMsd && !((ArrayList<CalculateFor>) filter.get(MULTIPLE_KEYS)).contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
             addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
     }

diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java
index bd71203..891fb54 100755
--- a/src/main/java/data/Tax.java
+++ b/src/main/java/data/Tax.java
@@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
 public class Tax {
     private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
     private static LinkedHashMap<String, String> GOS_TAXONOMY;
-    private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
+    private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));

     static {
         // GIGAFIDA ----------------------------
@@ -77,6 +77,12 @@ public class Tax {
         GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
         GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
         GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");
+
+        GOS_TAXONOMY.put("gos.K", "gos.K - kanal");
+        GOS_TAXONOMY.put("gos.K.O", "gos.K.O - kanal-osebni stik");
+        GOS_TAXONOMY.put("gos.K.P", "gos.K.P - kanal-telefon");
+        GOS_TAXONOMY.put("gos.K.R", "gos.K.R - kanal-radio");
+        GOS_TAXONOMY.put("gos.K.T", "gos.K.T - kanal-televizija");
     }

     /**
@@ -98,7 +104,7 @@ public class Tax {
     public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
         LinkedHashMap<String, String> tax = new LinkedHashMap<>();

-        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
+        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
             tax = GIGAFIDA_TAXONOMY;
         } else if (corpusType == CorpusType.GOS) {
             tax = GOS_TAXONOMY;
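The Filter change makes setHasMsd consult the multiple-keys list before adding the MSD word part; the scenario it presumably guards against is MORPHOSYNTACTIC_SPECS being added twice when the user has already selected it as an extra output column. A condensed sketch of that interaction (call behaviour assumed from the patch, not verified against the rest of the class):

    // keys include CalculateFor.MORPHOSYNTACTIC_SPECS as an extra column:
    filter.setMultipleKeys(keys);
    filter.setHasMsd(true);   // before: addWordPart(MORPHOSYNTACTIC_SPECS) unconditionally
                              // now: skipped, since MULTIPLE_KEYS already contains it

This is also why the GUI tabs below reorder their setter calls.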
diff --git a/src/main/java/gui/CorpusTab.java b/src/main/java/gui/CorpusTab.java
index 6bb5aba..1b55c54 100755
--- a/src/main/java/gui/CorpusTab.java
+++ b/src/main/java/gui/CorpusTab.java
@@ -244,7 +244,7 @@ public class CorpusTab {

         logger.info("reading header data for ", corpusType.toString());

-        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
+        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
             boolean corpusIsSplit = corpusFiles.size() > 1;

             final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@@ -429,6 +429,7 @@ public class CorpusTab {
         // read first file only, maybe later do all, if toll on resources is acceptable
         File f = corpusFiles.iterator().next();
         String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
+        String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
         String test = CCKRES.getNameLowerCase();
         String debug = "";

@@ -442,6 +443,8 @@ public class CorpusTab {
             corpusType = CCKRES;
         } else if (title.contains(GOS.getNameLowerCase())) {
             corpusType = GOS;
+        } else if (attrib.contains(SSJ500K.getNameLowerCase())) {
+            corpusType = SSJ500K;
         }

         if (corpusType == null) {
diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java
index 65c55fe..d99b238 100755
--- a/src/main/java/gui/OneWordAnalysisTab.java
+++ b/src/main/java/gui/OneWordAnalysisTab.java
@@ -415,7 +415,6 @@ public class OneWordAnalysisTab {
         Filter filter = new Filter();
         filter.setNgramValue(1);
         filter.setCalculateFor(calculateFor);
-        filter.setMsd(msd);
         filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
         filter.setDisplayTaxonomy(displayTaxonomy);
         filter.setAl(AnalysisLevel.STRING_LEVEL);
@@ -424,6 +423,9 @@ public class OneWordAnalysisTab {
         filter.setSolarFilters(solarFiltersMap);
         filter.setStringLength(1);
         filter.setMultipleKeys(alsoVisualize);
+
+        // setMsd must be called after setMultipleKeys (alsoVisualize)
+        filter.setMsd(msd);
         filter.setMinimalOccurrences(minimalOccurrences);
         filter.setMinimalTaxonomy(minimalTaxonomy);
         filter.setWriteMsdAtTheEnd(writeMsdAtTheEnd);
diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java
index f141727..24bb2f0 100755
--- a/src/main/java/gui/StringAnalysisTabNew2.java
+++ b/src/main/java/gui/StringAnalysisTabNew2.java
@@ -522,7 +522,6 @@ public class StringAnalysisTabNew2 {
         Filter filter = new Filter();
         filter.setNgramValue(ngramValue);
         filter.setCalculateFor(calculateFor);
-        filter.setMsd(msd);
         filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
         filter.setDisplayTaxonomy(displayTaxonomy);
         filter.setAl(AnalysisLevel.STRING_LEVEL);
@@ -531,6 +530,9 @@ public class StringAnalysisTabNew2 {
         filter.setSolarFilters(solarFiltersMap);
         filter.setNotePunctuations(notePunctuations);
         filter.setMultipleKeys(alsoVisualize);
+
+        // setMsd must be called after setMultipleKeys (alsoVisualize)
+        filter.setMsd(msd);
         filter.setMinimalOccurrences(minimalOccurrences);
         filter.setMinimalTaxonomy(minimalTaxonomy);
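Both tabs move setMsd below setMultipleKeys for the same reason: assuming setMsd feeds setHasMsd (as the in-code comments suggest), the guard added in Filter above can only see the extra columns if they are stored first. A condensed sketch of the dependency, filter construction only, with the other setters elided:

    Filter filter = new Filter();
    filter.setMultipleKeys(alsoVisualize);   // 1) store extra output columns first
    filter.setMsd(msd);                      // 2) setHasMsd now consults MULTIPLE_KEYS
    // Reversing the two calls would check an empty key list, so
    // MORPHOSYNTACTIC_SPECS could end up added twice once the keys arrive.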
diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java
index 574db17..4fff32b 100755
--- a/src/main/java/util/Export.java
+++ b/src/main/java/util/Export.java
@@ -12,6 +12,7 @@ import java.util.concurrent.atomic.AtomicLong;
 import data.CalculateFor;
 import data.Filter;
 import data.MultipleHMKeys;
+import gui.ValidationUtil;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import org.apache.commons.csv.QuoteMode;
@@ -87,6 +88,9 @@ public class Export {

         //CSV file header
+        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
+            FILE_HEADER_AL.add("Izpuščene besede");
+        }
         FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
         if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
             FILE_HEADER_AL.add("Lema male črke");
@@ -125,11 +129,9 @@ public class Export {

             // for (Map value : taxonomyResults.values()) {
             for (CalculateFor otherKey : filter.getMultipleKeys()) {
-                if (num_taxonomy_frequencies.get(otherKey) > 0) {
-                    FILE_HEADER_AL.add(otherKey.toHeaderString());
-                    if (otherKey.equals(CalculateFor.LEMMA))
-                        FILE_HEADER_AL.add("Lema male črke");
-                }
+                FILE_HEADER_AL.add(otherKey.toHeaderString());
+                if (otherKey.equals(CalculateFor.LEMMA))
+                    FILE_HEADER_AL.add("Lema male črke");
             }

             // if(otherKey.equals(CalculateFor.LEMMA)){
@@ -215,9 +217,12 @@ public class Export {
             for (Map.Entry<MultipleHMKeys, AtomicLong> e : map.entrySet()) {
                 List<Object> dataEntry = new ArrayList<>();
-                dataEntry.add(e.getKey().getK1());
+                if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
+                    dataEntry.add(e.getKey().getK1());
+                }
+                dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
                 if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
-                    dataEntry.add(e.getKey().getK1().toLowerCase());
+                    dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
                 }

                 int i = 0;
@@ -225,20 +230,20 @@ public class Export {
                     switch (i) {
                         case 0:
                             if (otherKey.equals(CalculateFor.LEMMA)) {
-                                dataEntry.add(e.getKey().getK2());
-                                dataEntry.add(e.getKey().getK2().toLowerCase());
+                                dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
+                                dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
                             } else {
-                                dataEntry.add(e.getKey().getK2());
+                                dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
                             }
                             break;
                         case 1:
-                            dataEntry.add(e.getKey().getK3());
+                            dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
                             break;
                         case 2:
-                            dataEntry.add(e.getKey().getK4());
+                            dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
                             break;
                         case 3:
-                            dataEntry.add(e.getKey().getK5());
+                            dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
                             break;
                     }

@@ -330,6 +335,13 @@ public class Export {
         return fileName;
     }

+    private static String eraseSkipgramStars(String s, Filter filter) {
+        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
+            s = s.replace("* ", "");
+        }
+        return s;
+    }
+
     public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
         //Delimiter used in CSV file
         String NEW_LINE_SEPARATOR = "\n";
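A quick illustration of the resulting export behaviour for skipgrams (the key value is made up): when skip > 0, each row carries the raw key with its "*" placeholders under the new "Izpuščene besede" column, followed by the cleaned string in the regular column:

    String key = "beseda1 * * beseda2";
    String cleaned = key.replace("* ", "");   // the same substitution eraseSkipgramStars applies
    // cleaned -> "beseda1 beseda2"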