diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 6544613..5eafce9 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -535,6 +535,7 @@ public class XML_processing { public static boolean readXMLGigafida(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; + boolean taxonomyMatch = true; ArrayList currentFiletaxonomy = new ArrayList<>(); ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; @@ -635,13 +636,19 @@ public class XML_processing { // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { + // count all UniGramOccurrences in sentence for statistics + stats.updateUniGramOccurrences(sentence.size()); + // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); - if (!ValidationUtil.isEmpty(sentence)) { + + + if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { corpus.add(new Sentence(sentence, currentFiletaxonomyLong)); } +// taxonomyMatch = true; // and start a new one sentence = new ArrayList<>(); @@ -666,7 +673,9 @@ public class XML_processing { if (currentFiletaxonomy.isEmpty()) { // taxonomies don't match so stop - return false; +// return false; + taxonomyMatch = false; +// System.out.println("TEST"); } } } diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index 19b160b..8712f08 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -36,6 +36,8 @@ public class Ngrams { ArrayList otherKeys = stats.getFilter().getMultipleKeys(); for (Sentence s : corpus) { +// stats.updateUniGramOccurrences(s.getWords().size()); + // skip sentences shorter than specified ngram length if (s.getWords().size() < stats.getFilter().getNgramValue()) { continue; @@ -176,6 +178,8 @@ public class Ngrams { + + // UPDATE TAXONOMY HERE!!! stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy()); // stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); @@ -261,16 +265,34 @@ public class Ngrams { */ private static void generateNgramLetterCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { +// stats.updateUniGramOccurrences(s.getWords().size()); for (Word w : s.getWords()) { List taxonomy = s.getTaxonomy(); + +//// List ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue()); + List ngramCandidate = new ArrayList<>(); + ngramCandidate.add(w); +// +// // if msd regex is set and this candidate doesn't pass it, skip this iteration +// if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { +// continue; +// } + String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts()); // skip this iteration if: // - word doesn't contain a proper version (missing lemma for example) // - msd regex is given but this word's msd doesn't match it, skip this iteration // - given substring length is larger than the word length + +// boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()); +// boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()); +// String t3 = stats.getFilter().getMsd().get(0).pattern(); +// ArrayList t4 = stats.getFilter().getWordParts(); +// boolean t5 = word.length() < stats.getFilter().getStringLength(); + if (ValidationUtil.isEmpty(word) - || stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()) + || stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts()) || word.length() < stats.getFilter().getStringLength()) { continue; } @@ -330,6 +352,8 @@ public class Ngrams { for (Sentence s : corpus) { List sentence = s.getWords(); +// stats.updateUniGramOccurrences(s.getWords().size()); + if (sentence == null){ continue; } diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java index ff8f2a9..40d3351 100755 --- a/src/main/java/data/MultipleHMKeys.java +++ b/src/main/java/data/MultipleHMKeys.java @@ -15,6 +15,42 @@ public interface MultipleHMKeys { default ArrayList getSplittedMultipleHMKeys(){ return null; } + default String getMsd(Filter filter) { + String msd = ""; + if (filter.getCalculateFor().equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ + msd = getK1(); + } else if (filter.getMultipleKeys().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) { + int i = 0; + for (CalculateFor otherKey : filter.getMultipleKeys()) { + switch (i) { + case 0: + if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) { + msd = getK2(); + } + break; + case 1: + if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) { + msd = getK3(); + } + break; + case 2: + if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) { + msd = getK4(); + } + break; + case 3: + if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) { + msd = getK5(); + } + break; + } + + i++; + } + } + return msd; + } + @Override int hashCode(); diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index eca6eb2..a5f4ff2 100755 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -5,6 +5,7 @@ import static gui.ValidationUtil.*; import java.io.UnsupportedEncodingException; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoUnit; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; @@ -39,8 +40,10 @@ public class StatisticsNew { private boolean useDB; private RDB db; private boolean analysisProducedResults; - private LocalDateTime time; + private LocalDateTime timeBeginning; + private LocalDateTime timeEnding; private Map> collocability; + private AtomicLong uniGramOccurrences; public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) { this.corpus = corpus; @@ -48,8 +51,10 @@ public class StatisticsNew { this.taxonomyResult = new ConcurrentHashMap<>(); this.taxonomyResult.put("Total", new ConcurrentHashMap<>()); this.collocability = new ConcurrentHashMap<>(); + this.uniGramOccurrences = new AtomicLong(0L); - // create table for counting word occurrences per taxonomies + + // create table for counting word occurrences per taxonomies if (this.corpus.getTaxonomy() != null && filter.getDisplayTaxonomy()) { if (this.filter.getTaxonomy().isEmpty()) { for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) { @@ -75,7 +80,9 @@ public class StatisticsNew { result = new ConcurrentHashMap<>(); } - resultTitle = generateResultTitle(); + this.timeBeginning = LocalDateTime.now(); + +// resultTitle = generateResultTitle(); logger.debug(toString()); } @@ -94,7 +101,7 @@ public class StatisticsNew { * * @return */ - private String generateResultTitle() { + public String generateResultTitle() { String separator = "_"; StringBuilder sb = new StringBuilder(); @@ -108,12 +115,21 @@ public class StatisticsNew { .append(filter.getCalculateFor()) .append(separator); } else if(ngramLevel == 1) { - sb.append(corpus.getCorpusType().toString()) - .append(separator) - .append("besede") - .append(separator) - .append(filter.getCalculateFor()) - .append(separator); + if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) { + sb.append(corpus.getCorpusType().toString()) + .append(separator) + .append("besedni-deli") + .append(separator) + .append(filter.getCalculateFor()) + .append(separator); + } else { + sb.append(corpus.getCorpusType().toString()) + .append(separator) + .append("besede") + .append(separator) + .append(filter.getCalculateFor()) + .append(separator); + } } else { sb.append(filter.getAl().toString()) @@ -141,13 +157,20 @@ public class StatisticsNew { // if taxonomy -> taxonomy // if cvv -> cvv + dolžina - this.time = this.time != null ? this.time : LocalDateTime.now(); - sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss"))); + sb.append(getTimeEnding()); return sb.toString(); } + public void setTimeEnding(){ + this.timeEnding = LocalDateTime.now(); + } + + public String getTimeEnding(){ + return timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")); + } + public boolean isAnalysisProducedResults() { return analysisProducedResults; } @@ -319,6 +342,14 @@ public class StatisticsNew { return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit); } + public void updateUniGramOccurrences(int amount){ + uniGramOccurrences.set(uniGramOccurrences.get() + amount); + } + + public long getUniGramOccurrences(){ + return uniGramOccurrences.longValue(); + } + public void updateTaxonomyResults(MultipleHMKeys o, List taxonomy) { for (String key : taxonomyResult.keySet()) { // first word should have the same taxonomy as others @@ -423,22 +454,23 @@ public class StatisticsNew { LinkedHashMap info = new LinkedHashMap<>(); info.put("Korpus:", corpus.getCorpusType().toString()); - info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm"))); + setTimeEnding(); + info.put("Datum:", timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm"))); if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { Integer ngramLevel = filter.getNgramValue(); if (ngramLevel == 0) - info.put("Analiza", "Črke"); + info.put("Analiza:", "Črke"); else if (ngramLevel == 1) { // if suffixes or prefixes are not null print word parts if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { - info.put("Analiza", "Besedni deli"); + info.put("Analiza:", "Besedni deli"); } else { - info.put("Analiza", "Besede"); + info.put("Analiza:", "Besede"); } } else - info.put("Analiza", filter.getAl().toString()); + info.put("Analiza:", filter.getAl().toString()); } else { - info.put("Analiza", filter.getAl().toString()); + info.put("Analiza:", filter.getAl().toString()); } if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { @@ -453,9 +485,68 @@ public class StatisticsNew { if (ngramLevel > 1) info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0"); - // izračunaj za + // calculate for info.put("Izračunaj za:", filter.getCalculateFor().toString()); + // also write + if (filter.getMultipleKeys().size() > 0){ + + StringBuilder mk = new StringBuilder(); + for (CalculateFor s : filter.getMultipleKeys()) { + mk.append(s.toString()).append("; "); + } + info.put("Izpiši tudi: ", String.join("; ", mk.substring(0, mk.length() - 2))); + } + + // time elapsed +// setTimeEnding(); + long seconds = ChronoUnit.MILLIS.between(timeBeginning, timeEnding) / 1000; + info.put("Čas izvajanja:", String.valueOf(seconds) + " s"); + + // data limitations + if (filter.getDisplayTaxonomy()){ + info.put("Izpiši taksonomije: ", "Da"); + } else { + info.put("Izpiši taksonomije: ", "Ne"); + } + + // note punctuations - ngram > 1 + if(ngramLevel > 1) { + if (filter.getNotePunctuations()) { + info.put("Upoštevaj ločila: ", "Da"); + } else { + info.put("Upoštevaj ločila: ", "Ne"); + } + } + + // also write - n - gram > 1 + if (ngramLevel > 1 && filter.getCollocability().size() > 0){ + StringBuilder mk = new StringBuilder(); + for (Collocability s : filter.getCollocability()) { + mk.append(s.toString()).append("; "); + } + info.put("Kolokabilnost: ", String.join("; ", mk.substring(0, mk.length() - 2))); + } + + // fragmented MSD - n-gram = 1 + if (info.get("Analiza:").equals("Besede")){ + if (filter.getWriteMsdAtTheEnd()){ + info.put("Izpiši razbit MSD: ", "Da"); + } else { + info.put("Izpiši razbit MSD: ", "Ne"); + } + } + + if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { + if (filter.getPrefixLength() > 0 || filter.getSuffixLength() > 0) { + info.put("Dolžina predpone: ", String.valueOf(filter.getPrefixLength())); + info.put("Dolžina pripone: ", String.valueOf(filter.getSuffixLength())); + } else { + info.put("Seznam predpon: ", String.join("; ", filter.getPrefixList())); + info.put("Seznam pripon: ", String.join("; ", filter.getSuffixList())); + } + } + // msd if (!isEmpty(filter.getMsd())) { StringBuilder msdPattern = new StringBuilder(); @@ -479,6 +570,9 @@ public class StatisticsNew { } } + info.put("Min. št. pojavitev: ", String.valueOf(filter.getMinimalOccurrences())); + info.put("Min. št. taksonomij: ", String.valueOf(filter.getMinimalTaxonomy())); + if (corpus.getCorpusType() == CorpusType.SOLAR) { HashMap> filters = corpus.getSolarFilters(); diff --git a/src/main/java/gui/CharacterAnalysisTab.java b/src/main/java/gui/CharacterAnalysisTab.java index 6b7f144..20760e1 100755 --- a/src/main/java/gui/CharacterAnalysisTab.java +++ b/src/main/java/gui/CharacterAnalysisTab.java @@ -63,9 +63,13 @@ public class CharacterAnalysisTab { private TextField minimalTaxonomyTF; private Integer minimalTaxonomy; +// @FXML +// private ToggleGroup calculateForRB; +// private CalculateFor calculateFor; + @FXML - private ToggleGroup calculateForRB; - private CalculateFor calculateFor; + private ComboBox calculateForCB; + private CalculateFor calculateFor; @FXML private RadioButton lemmaRB; @@ -115,17 +119,25 @@ public class CharacterAnalysisTab { currentMode = MODE.LETTER; toggleMode(currentMode); - calculateForRB.selectedToggleProperty().addListener(new ChangeListener() { - @Override - public void changed(ObservableValue observable, Toggle oldValue, Toggle newValue) { - //logger.info("calculateForRB:", newValue.toString()); - RadioButton chk = (RadioButton)newValue.getToggleGroup().getSelectedToggle(); // Cast object to radio button - calculateFor = CalculateFor.factory(chk.getText()); - logger.info("calculateForRB:", chk.getText()); - //System.out.println("Selected Radio Button - "+chk.getText()); - } +// calculateForRB.selectedToggleProperty().addListener(new ChangeListener() { +// @Override +// public void changed(ObservableValue observable, Toggle oldValue, Toggle newValue) { +// //logger.info("calculateForRB:", newValue.toString()); +// RadioButton chk = (RadioButton)newValue.getToggleGroup().getSelectedToggle(); // Cast object to radio button +// calculateFor = CalculateFor.factory(chk.getText()); +// logger.info("calculateForRB:", chk.getText()); +// //System.out.println("Selected Radio Button - "+chk.getText()); +// } +// }); + + calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> { + calculateFor = CalculateFor.factory(newValue); + + logger.info("calculateForCB:", calculateFor.toString()); }); + calculateForCB.getSelectionModel().select(0); + // msd msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> { if (!newValue) { @@ -292,10 +304,10 @@ public class CharacterAnalysisTab { // TODO: check for GOS, GIGAFIDA, SOLAR... // refresh and: // TODO if current value != null && is in new calculateFor ? keep : otherwise reset - if (calculateFor == null) { - calculateForRB.selectToggle(lemmaRB); - calculateFor = CalculateFor.factory(calculateForRB.getSelectedToggle().toString()); - } +// if (calculateFor == null) { +// calculateForRB.selectToggle(lemmaRB); +// calculateFor = CalculateFor.factory(calculateForRB.getSelectedToggle().toString()); +// } if (!filter.hasMsd()) { // if current corpus doesn't have msd data, disable this field @@ -381,18 +393,19 @@ public class CharacterAnalysisTab { if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) { // if the user selected something else before selecting ngram for letters, reset that choice calculateFor = CalculateFor.LEMMA; - calculateForRB.selectToggle(lemmaRB); + + calculateForCB.getSelectionModel().select(0); } } // override if orth mode, allow only word if (corpus.isGosOrthMode()) { // TODO change to - varietyRB.setDisable(true); +// varietyRB.setDisable(true); msdTF.setDisable(true); } else { msdTF.setDisable(false); - varietyRB.setDisable(false); +// varietyRB.setDisable(false); } } @@ -400,6 +413,8 @@ public class CharacterAnalysisTab { Filter filter = new Filter(); filter.setNgramValue(0); filter.setCalculateFor(calculateFor); + + filter.setMultipleKeys(new ArrayList<>()); filter.setMsd(msd); filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType())); filter.setDisplayTaxonomy(displayTaxonomy); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index ee686f9..3a04b10 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -105,7 +105,8 @@ public class Export { } } - headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); + headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(statistics.getUniGramOccurrences())); +// headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); for (CalculateFor otherKey : filter.getMultipleKeys()) { FILE_HEADER_AL.add(otherKey.toHeaderString()); @@ -132,55 +133,75 @@ public class Export { FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); } } + + if (filter.getWriteMsdAtTheEnd()) { + String msd = ""; + int maxMsdLength = 0; + for(MultipleHMKeys key : set.iterator().next().getRight().keySet()){ + msd = key.getMsd(filter); + if (msd.length() > maxMsdLength){ + maxMsdLength = msd.length(); + } + } + for(int i = 0; i < maxMsdLength; i++){ + FILE_HEADER_AL.add("msd" + String.format("%02d", i + 1)); + } + + } FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; FILE_HEADER_AL.toArray(FILE_HEADER); String fileName = ""; - for (Pair> p : set) { - String title = p.getLeft(); - fileName = title.replace(": ", "-"); - fileName = fileName.replace(" ", "_").concat(".csv"); + for (Pair> p : set) { + String title = p.getLeft(); - fileName = resultsPath.toString().concat(File.separator).concat(fileName); +// statistics.setTimeEnding(); + title = statistics.generateResultTitle(); +// statistics. - Map map = p.getRight(); + fileName = title.replace(": ", "-"); + fileName = fileName.replace(" ", "_").concat(".csv"); - if (map.isEmpty()) - continue; + fileName = resultsPath.toString().concat(File.separator).concat(fileName); + + Map map = p.getRight(); + + if (map.isEmpty()) + continue; // long total = Util.mapSumFrequencies(map); - OutputStreamWriter fileWriter = null; - CSVPrinter csvFilePrinter = null; + OutputStreamWriter fileWriter = null; + CSVPrinter csvFilePrinter = null; - //Create the CSVFormat object with "\n" as a record delimiter it puts all words in braces - CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';').withQuoteMode(QuoteMode.ALL); + //Create the CSVFormat object with "\n" as a record delimiter it puts all words in braces + CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';').withQuoteMode(QuoteMode.ALL); - try { - //initialize FileWriter object - fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); + try { + //initialize FileWriter object + fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); - //initialize CSVPrinter object - csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); + //initialize CSVPrinter object + csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); - // write info block - printHeaderInfo(csvFilePrinter, headerInfoBlock); + // write info block + printHeaderInfo(csvFilePrinter, headerInfoBlock); - //Create CSV file header - csvFilePrinter.printRecord(FILE_HEADER); + //Create CSV file header + csvFilePrinter.printRecord(FILE_HEADER); - for (Map.Entry e : map.entrySet()) { - List dataEntry = new ArrayList<>(); - if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { - dataEntry.add(e.getKey().getK1()); - } - dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter)); + for (Map.Entry e : map.entrySet()) { + List dataEntry = new ArrayList<>(); + if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { + dataEntry.add(e.getKey().getK1()); + } + dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter)); if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){ dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter)); } - if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { + if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { if(filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) { if (filter.getPrefixLength() > 0) { dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength())); @@ -217,46 +238,48 @@ public class Export { dataEntry.add(rsf); } } - } + } int i = 0; for (CalculateFor otherKey : filter.getMultipleKeys()){ - switch(i){ - case 0: - if (otherKey.equals(CalculateFor.LEMMA)){ - dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); - dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter)); - } else { - dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); - } - break; - case 1: - dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter)); - break; - case 2: - dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter)); - break; - case 3: - dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter)); - break; - } - - i++; - } + switch(i){ + case 0: + if (otherKey.equals(CalculateFor.LEMMA)){ + dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); + dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter)); + } else { + dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); + } + break; + case 1: + dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter)); + break; + case 2: + dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter)); + break; + case 3: + dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter)); + break; + } + i++; + } - dataEntry.add(e.getValue().toString()); - dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); - dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies)); - for (String key : taxonomyResults.keySet()){ - if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) { - AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); - dataEntry.add(frequency.toString()); - dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); - dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key))); - } - } + dataEntry.add(e.getValue().toString()); + dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); + dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies)); + for (String key : taxonomyResults.keySet()){ + if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) { + AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); + dataEntry.add(frequency.toString()); +// dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); +// dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key))); + dataEntry.add(formatNumberAsPercent((double) frequency.get() / statistics.getUniGramOccurrences())); + dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / statistics.getUniGramOccurrences())); + } + + } if (filter.getCollocability().size() > 0){ for (Collocability c : filter.getCollocability()) { @@ -264,67 +287,68 @@ public class Export { } } - // Write msd separated per letters at the end of each line in csv - if (filter.getWriteMsdAtTheEnd()) { - String msd = ""; - if (filter.getCalculateFor().equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - msd = e.getKey().getK1(); - } else if (filter.getMultipleKeys().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) { - i = 0; - for (CalculateFor otherKey : filter.getMultipleKeys()){ - switch(i){ - case 0: - if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - msd = e.getKey().getK2(); - } - break; - case 1: - if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - msd = e.getKey().getK3(); - } - break; - case 2: - if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - msd = e.getKey().getK4(); - } - break; - case 3: - if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ - msd = e.getKey().getK5(); - } - break; - } - - i++; - } - } - String [] charArray = msd.split("(?!^)"); - dataEntry.addAll(Arrays.asList(charArray)); - - } + // Write msd separated per letters at the end of each line in csv + if (filter.getWriteMsdAtTheEnd()) { +// String msd = ""; +// +// if (filter.getCalculateFor().equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// msd = e.getKey().getK1(); +// } else if (filter.getMultipleKeys().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) { +// i = 0; +// for (CalculateFor otherKey : filter.getMultipleKeys()){ +// switch(i){ +// case 0: +// if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// msd = e.getKey().getK2(); +// } +// break; +// case 1: +// if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// msd = e.getKey().getK3(); +// } +// break; +// case 2: +// if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// msd = e.getKey().getK4(); +// } +// break; +// case 3: +// if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ +// msd = e.getKey().getK5(); +// } +// break; +// } +// +// i++; +// } +// } + String msd = e.getKey().getMsd(filter); + String [] charArray = msd.split("(?!^)"); + dataEntry.addAll(Arrays.asList(charArray)); + } - csvFilePrinter.printRecord(dataEntry); - } - } catch (Exception e) { - System.out.println("Error in CsvFileWriter!"); - e.printStackTrace(); - } finally { - try { - if (fileWriter != null) { - fileWriter.flush(); - fileWriter.close(); - } - if (csvFilePrinter != null) { - csvFilePrinter.close(); - } - } catch (IOException e) { - System.out.println("Error while flushing/closing fileWriter/csvPrinter!"); - e.printStackTrace(); - } - } - } + csvFilePrinter.printRecord(dataEntry); + } + } catch (Exception e) { + System.out.println("Error in CsvFileWriter!"); + e.printStackTrace(); + } finally { + try { + if (fileWriter != null) { + fileWriter.flush(); + fileWriter.close(); + } + if (csvFilePrinter != null) { + csvFilePrinter.close(); + } + } catch (IOException e) { + System.out.println("Error while flushing/closing fileWriter/csvPrinter!"); + e.printStackTrace(); + } + } + } - return fileName; + return fileName; } private static String eraseSkipgramStars(String s, Filter filter){ diff --git a/src/main/resources/gui/CharacterAnalysisTab.fxml b/src/main/resources/gui/CharacterAnalysisTab.fxml index bb58036..03b50e4 100755 --- a/src/main/resources/gui/CharacterAnalysisTab.fxml +++ b/src/main/resources/gui/CharacterAnalysisTab.fxml @@ -13,35 +13,49 @@ + + +