Added new ssj500k reading option. Fixed GOS taxonomy
parent 426a9ccc46
commit 1d9e9b7ed6
@@ -52,7 +52,9 @@ public class XML_processing {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
}
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
readXMLSSJ500K(path, stats);
}
}

/**
@@ -91,6 +93,50 @@ public class XML_processing {
return "";
}

/**
* Reads and returns the value of a passed header attribute or an empty string.
* E.g. body base attribute, for discerning the corpus' type of ssj500k.
* Notice: returns only the value of the first occurrence of a given tag name.
*/
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;

try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();

if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);

if (att.containsKey("base")) {
return att.get("base").substring(0, att.get("base").length() - 12);
}

return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
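For context, this new helper feeds the corpus-type autodetection in CorpusTab further down in this diff. A minimal usage sketch, assuming a File f as in that code (not part of the commit itself):

String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
if (attrib.contains(CorpusType.SSJ500K.getNameLowerCase())) {
    // the body element's "base" attribute names an ssj500k source
    corpusType = CorpusType.SSJ500K;
}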

private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
@@ -403,7 +449,9 @@ public class XML_processing {

// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else {
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
@@ -437,7 +485,13 @@ public class XML_processing {
.replace("#", "");

resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");

resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
@@ -646,6 +700,138 @@ public class XML_processing {
return true;
}

@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";

List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";

XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));

while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();

switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();

// "word" node
if (qName.equals("w")) {
inWord = true;
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}

else if (qName.equals("pc")){
inPunctuation = true;
}

// taxonomy node
else if (qName.equalsIgnoreCase("term")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));

if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;

case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();

// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;

case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();

String var = endElement.getName().getLocalPart();
String debug = "";

// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());

if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}

// and start a new one
sentence = new ArrayList<>();

/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();

// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();

currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}

break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}

return true;
}

@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
@@ -853,6 +1039,9 @@ public class XML_processing {
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();

currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}

break;
@@ -914,7 +1103,7 @@ public class XML_processing {
return atts;
}

private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);
@@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;

import gui.ValidationUtil;

import static alg.XML_processing.createWord;

public class Ngrams {
public final static Logger logger = LogManager.getLogger(Ngrams.class);
@@ -138,16 +140,22 @@ public class Ngrams {
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
}
// if (ngramCandidate.size() != regex.size()) {
// logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
// return false;
// }

for (int i = 0; i < regex.size(); i++) {
int j = 0;
for (int i = 0; i < ngramCandidate.size(); i++) {
String msd = ngramCandidate.get(i).getMsd(wordParts);
if (msd.equals("*")){
continue;
}
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
if (!msd.matches(regex.get(j).pattern() + ".*")) {
return false;
}
j ++;
}

return true;
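The change above works together with the skipgram code below: fillSkipgrams pads skipped positions with a placeholder word whose MSD is "*", and passesRegex now skips those placeholders, advancing through the regex list only on real words. A simplified, self-contained sketch of that matching rule (class and variable names here are illustrative, not the project's):

import java.util.List;
import java.util.regex.Pattern;

class MsdMatchSketch {
    // true when every non-placeholder MSD matches its pattern as a prefix
    static boolean passes(List<String> msds, List<Pattern> regex) {
        int j = 0;
        for (String msd : msds) {
            if (msd.equals("*")) continue;        // skipgram filler, ignore
            if (j >= regex.size()) return false;  // more real words than patterns
            if (!msd.matches(regex.get(j).pattern() + ".*")) return false;
            j++;
        }
        return true;
    }
}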
@@ -270,6 +278,7 @@ public class Ngrams {
ArrayList<Word> currentLoop;
int ngram = stats.getFilter().getNgramValue();
int skip = stats.getFilter().getSkipValue();
Word w = createWord("*", "*", "*", "*", stats.getFilter());

for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
@@ -283,7 +292,8 @@ public class Ngrams {
if (ngram == 2 && j < sentence.size()) {
currentLoop = new ArrayList<>();
// currentLoop.add(sentence.get(i));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));

validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -291,8 +301,10 @@ public class Ngrams {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));

validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -300,9 +312,12 @@ public class Ngrams {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));

validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -310,10 +325,14 @@ public class Ngrams {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));
fillSkipgrams(currentLoop, l, m, w);
currentLoop.add(sentence.get(m));

validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@@ -329,6 +348,12 @@ public class Ngrams {
}
}

private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
for(int k = i + 1; k < j; k++){
currentLoop.add(w);
}
}

private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
@@ -4,7 +4,8 @@ public enum CorpusType {
GIGAFIDA("Gigafida", "gigafida"),
CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"),
GOS("GOS", "gos");
GOS("GOS", "gos"),
SSJ500K("ssj500k", "ssj500k");

private final String name;
@@ -142,7 +142,7 @@ public class Filter {

public void setHasMsd(boolean hasMsd) {
filter.put(HAS_MSD, hasMsd);
if (hasMsd)
if (hasMsd && !((ArrayList<CalculateFor>) filter.get(MULTIPLE_KEYS)).contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
}
@@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));

static {
// GIGAFIDA ----------------------------
@@ -77,6 +77,12 @@ public class Tax {
GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");

GOS_TAXONOMY.put("gos.K", "gos.K - kanal");
GOS_TAXONOMY.put("gos.K.O", "gos.K.O - kanal-osebni stik");
GOS_TAXONOMY.put("gos.K.P", "gos.K.P - kanal-telefon");
GOS_TAXONOMY.put("gos.K.R", "gos.K.R - kanal-radio");
GOS_TAXONOMY.put("gos.K.T", "gos.K.T - kanal-televizija");
}

/**
@@ -98,7 +104,7 @@ public class Tax {
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
LinkedHashMap<String, String> tax = new LinkedHashMap<>();

if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
@@ -244,7 +244,7 @@ public class CorpusTab {

logger.info("reading header data for ", corpusType.toString());

if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
boolean corpusIsSplit = corpusFiles.size() > 1;

final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@@ -429,6 +429,7 @@ public class CorpusTab {
// read first file only, maybe later do all, if toll on resources is acceptable
File f = corpusFiles.iterator().next();
String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
String test = CCKRES.getNameLowerCase();
String debug = "";
@@ -442,6 +443,8 @@ public class CorpusTab {
corpusType = CCKRES;
} else if (title.contains(GOS.getNameLowerCase())) {
corpusType = GOS;
} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
corpusType = SSJ500K;
}

if (corpusType == null) {
@@ -415,7 +415,6 @@ public class OneWordAnalysisTab {
Filter filter = new Filter();
filter.setNgramValue(1);
filter.setCalculateFor(calculateFor);
filter.setMsd(msd);
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
filter.setDisplayTaxonomy(displayTaxonomy);
filter.setAl(AnalysisLevel.STRING_LEVEL);
@@ -424,6 +423,9 @@ public class OneWordAnalysisTab {
filter.setSolarFilters(solarFiltersMap);
filter.setStringLength(1);
filter.setMultipleKeys(alsoVisualize);

// setMsd must be behind alsoVisualize
filter.setMsd(msd);
filter.setMinimalOccurrences(minimalOccurrences);
filter.setMinimalTaxonomy(minimalTaxonomy);
filter.setWriteMsdAtTheEnd(writeMsdAtTheEnd);
@@ -522,7 +522,6 @@ public class StringAnalysisTabNew2 {
Filter filter = new Filter();
filter.setNgramValue(ngramValue);
filter.setCalculateFor(calculateFor);
filter.setMsd(msd);
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
filter.setDisplayTaxonomy(displayTaxonomy);
filter.setAl(AnalysisLevel.STRING_LEVEL);
@@ -531,6 +530,9 @@ public class StringAnalysisTabNew2 {
filter.setSolarFilters(solarFiltersMap);
filter.setNotePunctuations(notePunctuations);
filter.setMultipleKeys(alsoVisualize);

// setMsd must be behind alsoVisualize
filter.setMsd(msd);
filter.setMinimalOccurrences(minimalOccurrences);
filter.setMinimalTaxonomy(minimalTaxonomy);
@@ -12,6 +12,7 @@ import java.util.concurrent.atomic.AtomicLong;
import data.CalculateFor;
import data.Filter;
import data.MultipleHMKeys;
import gui.ValidationUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
@@ -87,6 +88,9 @@ public class Export {

//CSV file header
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
FILE_HEADER_AL.add("Izpuščene besede");
}
FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
@@ -125,11 +129,9 @@ public class Export {

// for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
for (CalculateFor otherKey : filter.getMultipleKeys()) {
if (num_taxonomy_frequencies.get(otherKey) > 0) {
FILE_HEADER_AL.add(otherKey.toHeaderString());
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
}
FILE_HEADER_AL.add(otherKey.toHeaderString());
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
}

// if(otherKey.equals(CalculateFor.LEMMA)){
@@ -215,9 +217,12 @@ public class Export {

for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
List dataEntry = new ArrayList<>();
dataEntry.add(e.getKey().getK1());
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
dataEntry.add(e.getKey().getK1());
}
dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){
dataEntry.add(e.getKey().getK1().toLowerCase());
dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
}

int i = 0;
@@ -225,20 +230,20 @@ public class Export {
switch(i){
case 0:
if (otherKey.equals(CalculateFor.LEMMA)){
dataEntry.add(e.getKey().getK2());
dataEntry.add(e.getKey().getK2().toLowerCase());
dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
} else {
dataEntry.add(e.getKey().getK2());
dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
}
break;
case 1:
dataEntry.add(e.getKey().getK3());
dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
break;
case 2:
dataEntry.add(e.getKey().getK4());
dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
break;
case 3:
dataEntry.add(e.getKey().getK5());
dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
break;
}
@@ -330,6 +335,13 @@ public class Export {
return fileName;
}

private static String eraseSkipgramStars(String s, Filter filter){
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
s = s.replace("* ", "");
}
return s;
}
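A quick worked example of the helper above (hypothetical input, assuming filter.getSkipValue() > 0):

// eraseSkipgramStars("prva * * druga", filter)  ->  "prva druga"
// with no skip value set, the string is returned unchanged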

public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
//Delimiter used in CSV file
String NEW_LINE_SEPARATOR = "\n";