package data;

import static gui.ValidationUtil.*;

import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import gui.I18N;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;

@SuppressWarnings("Duplicates")
public class StatisticsNew {
    public final static Logger logger = LogManager.getLogger(StatisticsNew.class);

    private Corpus corpus;
    private Filter filter;

    private String resultTitle;
    private Map<String, AtomicLong> result;
    private Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
    private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedSuffix;
    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedPrefix;
    private boolean useDB;
    private RDB db;
    private boolean analysisProducedResults;
    private LocalDateTime timeBeginning;
    private LocalDateTime timeEnding;
    private Map<Collocability, Map<MultipleHMKeys, Double>> collocability;
    private Map<Taxonomy, AtomicLong> uniGramTaxonomyOccurrences;

    public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
        this.corpus = corpus;
        this.filter = filter;
        this.taxonomyResult = new ConcurrentHashMap<>();
        this.taxonomyResult.put(corpus.getTotal(), new ConcurrentHashMap<>());
        this.collocability = new ConcurrentHashMap<>();
        this.uniGramTaxonomyOccurrences = new ConcurrentHashMap<>();
        this.uniGramTaxonomyOccurrences.put(corpus.getTotal(), new AtomicLong(0L));

        // create a table for counting word occurrences per taxonomy
        if (this.corpus.getObservableListTaxonomy() != null && filter.getDisplayTaxonomy()) {
            if (this.filter.getTaxonomy().isEmpty()) {
                for (int i = 0; i < this.corpus.getObservableListTaxonomy().size(); i++) {
                    this.taxonomyResult.put(Taxonomy.factoryLongName(this.corpus.getObservableListTaxonomy().get(i), corpus), new ConcurrentHashMap<>());
                }
            } else {
                for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
                    // Tax taxonomy = new Tax();
                    this.taxonomyResult.put(this.filter.getTaxonomy().get(i), new ConcurrentHashMap<>());
                }
            }
        }

        if (useDB) {
            this.useDB = true;
            db = new RDB();
        }

        if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
            resultNestedSuffix = new ConcurrentHashMap<>();
            resultNestedPrefix = new ConcurrentHashMap<>();
        } else {
            result = new ConcurrentHashMap<>();
        }

        this.timeBeginning = LocalDateTime.now();

        // resultTitle = generateResultTitle();
        logger.debug(toString());
    }

    /**
     * Result's title consists of:
     * <ul>
     * <li>Corpus type</li>
     * <li>Analysis level</li>
     * <li>Calculate for</li>
     * </ul>
     *
     * @return the generated result title
     */
    public String generateResultTitle() {
        String separator = "_";
        StringBuilder sb = new StringBuilder();

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();
            String name = corpus.getCorpusName();
            if (ngramLevel == 0) {
                if (!name.equals("")) {
                    sb.append(name).append(separator);
                }
                sb.append(I18N.get("exportFileName.letters"))
                        .append(separator)
                        .append(filter.getCalculateFor())
                        .append(separator);
            } else if (ngramLevel == 1) {
                if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
                    if (!name.equals("")) {
                        sb.append(name).append(separator);
                    }
                    sb.append(I18N.get("exportFileName.wordParts"))
                            .append(separator)
                            .append(filter.getCalculateFor())
                            .append(separator);
                } else {
                    if (!name.equals("")) {
                        sb.append(name).append(separator);
                    }
                    sb.append(I18N.get("exportFileName.words"))
                            .append(separator)
                            .append(filter.getCalculateFor())
                            .append(separator);
                }
            } else {
                if (!name.equals("")) {
                    sb.append(name).append(separator);
                }
                sb.append(I18N.get("exportFileName.wordSets"))
                        .append(separator);
                sb.append(filter.getCalculateFor().toString())
                        .append(separator);
                // ngram value
                sb.append(filter.getNgramValue()).append(I18N.get("exportFileName.gram"))
                        .append(separator);
                sb.append(filter.getSkipValue()).append(I18N.get("exportFileName.skip"))
                        .append(separator);
            }
            // TODO: ensure skip is zero rather than null
        } else {
            sb.append(filter.getAl().toString()) // analysis level
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        }
        // skip value
        // msd ?
        // if taxonomy -> taxonomy
        // if cvv -> cvv + length
        sb.append(getTimeEnding());
        return sb.toString();
    }

    public void setTimeEnding() {
        this.timeEnding = LocalDateTime.now();
    }

    public String getTimeEnding() {
        return timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_HH.mm.ss"));
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }

    public void setAnalysisProducedResults(boolean analysisProducedResults) {
        this.analysisProducedResults = analysisProducedResults;
    }

    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Statistic properties:");
        sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
        sb.append(newLine).append(useDB ? "use DB" : "run in memory");
        sb.append(newLine).append(filter.toString());
        return sb.toString();
    }

    public String getResultTitle() {
        return resultTitle;
    }

    // ****************************************
    // ***************** util *****************
    // ****************************************

    /**
     * Stores results from this batch to a database and clears the results map.
     */
    // public void storeTmpResultsToDB() {
    //     try {
    //         db.writeBatch(result);
    //         result = new ConcurrentHashMap<>();
    //     } catch (UnsupportedEncodingException e) {
    //         logger.error("Store tmp results to DB", e);
    //         // e.printStackTrace();
    //     }
    // }

    public Filter getFilter() {
        return filter;
    }

    public Corpus getCorpus() {
        return corpus;
    }
    public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        Set<Pair<String, Map<MultipleHMKeys, Long>>> stats = new HashSet<>();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        removeMinimalOccurrences(filter.getMinimalOccurrences());
        removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());

        // if no results and nothing to save, return false
        if (!(taxonomyResult.get(corpus.getTotal()).size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get(corpus.getTotal()), Util.getValidInt(limit))));
        Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter);
        return true;
    }

    /**
     * Removes lines where the number of different taxonomies is lower than the specified minimum (minimalTaxonomy).
     */
    private void removeMinimalTaxonomy(Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
        if (minimalTaxonomy == 1)
            return;
        int occurrences;
        for (MultipleHMKeys key : taxonomyResult.get(corpus.getTotal()).keySet()) {
            occurrences = 0;
            for (Taxonomy columnNameKey : taxonomyResult.keySet()) {
                if (!columnNameKey.equals(corpus.getTotal()) && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1)
                    occurrences++;
            }
            if (occurrences < minimalTaxonomy) {
                taxonomyResult.get(corpus.getTotal()).remove(key);
            }
        }
    }

    /**
     * Removes lines where the total number of occurrences is lower than the specified minimum (minimalOccurrences).
     */
    private void removeMinimalOccurrences(Integer minimalOccurrences) {
        if (minimalOccurrences == 0)
            return;
        for (MultipleHMKeys key : taxonomyResult.get(corpus.getTotal()).keySet()) {
            if (taxonomyResult.get(corpus.getTotal()).get(key).intValue() < minimalOccurrences) {
                for (Taxonomy t : taxonomyResult.keySet()) {
                    taxonomyResult.get(t).remove(key);
                }
            }
        }
    }

    public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        Map<WordLevelType, Map<String, Map<MultipleHMKeys, Long>>> results = new HashMap<>();

        // UNCOMMENT!!!!!!
        // if (!isEmpty(resultNestedSuffix)) {
        //     results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
        // }
        //
        // if (!isEmpty(resultNestedPrefix)) {
        //     results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
        // }

        // if no results and nothing to save, return false
        if (!(results.size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
        filter.setAl(AnalysisLevel.WORD_FORMATION);
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if no results and nothing to save, return false
        if (!(result.size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        WordFormation.calculateStatistics(this);
        Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    private Map<String, Map<MultipleHMKeys, Long>> sortNestedMap(Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> nestedMap, int limit) {
        Map<String, Map<MultipleHMKeys, Long>> sorted = new HashMap<>();

        for (String s : nestedMap.keySet()) {
            sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
        }

        return sorted;
    }

    private Map<MultipleHMKeys, Long> getSortedResult(Map<MultipleHMKeys, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    public void updateUniGramOccurrences(int amount, ArrayList<Taxonomy> taxonomy) {
        uniGramTaxonomyOccurrences.get(corpus.getTotal()).set(uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue() + amount);
        for (Taxonomy t : taxonomy) {
            if (uniGramTaxonomyOccurrences.get(t) != null) {
                uniGramTaxonomyOccurrences.get(t).set(uniGramTaxonomyOccurrences.get(t).longValue() + amount);
            } else {
                uniGramTaxonomyOccurrences.put(t, new AtomicLong(amount));
            }
        }
    }

    public Map<Taxonomy, AtomicLong> getUniGramOccurrences() {
        // return uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue();
        return uniGramTaxonomyOccurrences;
    }

    public void updateTaxonomyResults(MultipleHMKeys o, List<Taxonomy> taxonomy) {
        for (Taxonomy key : taxonomyResult.keySet()) {
            // first word should have the same taxonomy as others
            if (key.equals(corpus.getTotal()) || taxonomy.contains(key)) {
                // if (key.equals(corpus.getTotal()) || taxonomy != null && taxonomy.contains(key)) {
                // if taxonomy not in map and in this word
                AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));
                if (r != null)
                    taxonomyResult.get(key).get(o).incrementAndGet();
            } else {
                // if taxonomy not in map and not in this word
                taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0));
            }
        }
    }

    public Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> getTaxonomyResult() {
        return taxonomyResult;
    }

    public void updateResults(String o) {
        // if not in map
        AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
        // else
        if (r != null)
            result.get(o).incrementAndGet();
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public Object[][] getResultCustom() {
        return resultCustom;
    }

    public void setResultCustom(Object[][] resultCustom) {
        this.resultCustom = resultCustom;
    }

    public void updateResultsNested(WordLevelType type, String key, String stringValue) {
        if (type == WordLevelType.SUFFIX) {
            updateResultsNestedSuffix(key, stringValue);
        } else if (type == WordLevelType.PREFIX) {
            updateResultsNestedPrefix(key, stringValue);
        }
    }
    public void updateResultsNestedSuffix(String key, String stringValue) {
        MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue);

        if (resultNestedSuffix.containsKey(key)) {
            // if not in map
            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
            // else
            if (r != null) {
                resultNestedSuffix.get(key).get(mkStringValue).incrementAndGet();
            }
        } else {
            resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
            AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
            if (r != null) {
                resultNestedSuffix.get(key).get(mkStringValue).incrementAndGet();
            }
        }
    }

    public void updateResultsNestedPrefix(String key, String stringValue) {
        MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue);

        if (resultNestedPrefix.containsKey(key)) {
            // if not in map
            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
            // else
            if (r != null) {
                resultNestedPrefix.get(key).get(mkStringValue).incrementAndGet();
            }
        } else {
            resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
            AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
            if (r != null) {
                resultNestedPrefix.get(key).get(mkStringValue).incrementAndGet();
            }
        }
    }

    private LinkedHashMap<String, String> headerInfoBlock() {
        LinkedHashMap<String, String> info = new LinkedHashMap<>();

        info.put(I18N.get("exportHeader.corpus"), corpus.getCorpusType().toString());
        setTimeEnding();
        info.put(I18N.get("exportHeader.date"), timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm")));

        // time elapsed
        long seconds = ChronoUnit.MILLIS.between(timeBeginning, timeEnding) / 1000;
        info.put(I18N.get("exportHeader.executionTime"), String.valueOf(seconds) + " s");

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0)
                info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.letters"));
            else if (ngramLevel == 1) {
                // if suffixes or prefixes are not null, print word parts
                if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
                    info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.wordParts"));
                } else {
                    info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.words"));
                }
            } else
                info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.wordSets"));
        } else {
            info.put(I18N.get("exportHeader.analysis"), filter.getAl().toString());
        }

        // if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
        Integer ngramLevel = filter.getNgramValue();
        if (ngramLevel == 0) {
            info.put(I18N.get("exportHeader.numberLetters"), filter.getStringLength().toString());
        }

        // calculate for
        info.put(I18N.get("exportHeader.calculateFor"), filter.getCalculateFor().toString());

        // also write
        if (ngramLevel > 0) {
            if (filter.getMultipleKeys().size() > 0) {
                StringBuilder mk = new StringBuilder();
                for (CalculateFor s : filter.getMultipleKeys()) {
                    mk.append(s.toString()).append("; ");
                }
                info.put(I18N.get("exportHeader.alsoFilter"), String.join("; ", mk.substring(0, mk.length() - 2)));
            } else {
                info.put(I18N.get("exportHeader.alsoFilter"), "");
            }
        }

        // data limitations
        if (filter.getDisplayTaxonomy()) {
            info.put(I18N.get("exportHeader.displayTaxonomies"), I18N.get("exportHeader.yes"));
        } else {
            info.put(I18N.get("exportHeader.displayTaxonomies"), I18N.get("exportHeader.no"));
        }

        // n-gram level
        if (ngramLevel > 1) {
            info.put(I18N.get("exportHeader.ngramLevel"), String.valueOf(ngramLevel));
        }

        // skip
        if (ngramLevel > 1)
            info.put(I18N.get("exportHeader.skipValue"), isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
        // note punctuations - n-gram > 1
        if (ngramLevel > 1) {
            if (filter.getNotePunctuations()) {
                info.put(I18N.get("exportHeader.notePunctuations"), I18N.get("exportHeader.yes"));
            } else {
                info.put(I18N.get("exportHeader.notePunctuations"), I18N.get("exportHeader.no"));
            }
        }

        // also write - n-gram > 1
        if (ngramLevel > 1) {
            if (filter.getCollocability().size() > 0) {
                StringBuilder mk = new StringBuilder();
                for (Collocability s : filter.getCollocability()) {
                    mk.append(s.toString()).append("; ");
                }
                info.put(I18N.get("exportHeader.collocability"), String.join("; ", mk.substring(0, mk.length() - 2)));
            } else {
                info.put(I18N.get("exportHeader.collocability"), "");
            }
        }

        // fragmented MSD - n-gram = 1
        if (info.get(I18N.get("exportHeader.analysis")).equals(I18N.get("exportHeader.analysis.words"))) {
            if (filter.getWriteMsdAtTheEnd()) {
                info.put(I18N.get("exportHeader.writeMSDAtTheEnd"), I18N.get("exportHeader.yes"));
            } else {
                info.put(I18N.get("exportHeader.writeMSDAtTheEnd"), I18N.get("exportHeader.no"));
            }
        }

        if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
            if (filter.getPrefixLength() > 0 || filter.getSuffixLength() > 0) {
                info.put(I18N.get("exportHeader.prefixLength"), String.valueOf(filter.getPrefixLength()));
                info.put(I18N.get("exportHeader.suffixLength"), String.valueOf(filter.getSuffixLength()));
            } else {
                info.put(I18N.get("exportHeader.prefixList"), String.join("; ", filter.getPrefixList()));
                info.put(I18N.get("exportHeader.suffixList"), String.join("; ", filter.getSuffixList()));
            }
        }

        // msd
        if (!isEmpty(filter.getMsd())) {
            StringBuilder msdPattern = new StringBuilder();
            for (Pattern pattern : filter.getMsd()) {
                msdPattern.append(pattern.toString()).append(" ");
            }
            info.put(I18N.get("exportHeader.msd"), msdPattern.toString());
        } else {
            info.put(I18N.get("exportHeader.msd"), "");
        }
        // }

        info.put(I18N.get("exportHeader.taxonomy"), "");
        if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) || filter.getDisplayTaxonomy()) {
            ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());

            if (filter.getDisplayTaxonomy() && tax.size() == 0) {
                // ArrayList intList = (new ArrayList<>(taxonomyResult.keySet()).stream().forEach(x -> {x.toString();}));
                // ArrayList taxonomyString = new ArrayList<>();
                // for (Taxonomy t : taxonomyResult.keySet()) {
                //     taxonomyString.add(t.toString());
                // }
                // ObservableList taxonomyObservableString = Tax.getTaxonomyForComboBox(corpus.getCorpusType(), new HashSet<>(taxonomyString));
                // ArrayList sortedTaxonomyString = new ArrayList<>();
                // for (String t : taxonomyObservableString) {
                //     sortedTaxonomyString.add(t);
                // }
                // getTaxonomyForTaxonomyResult
                tax = Tax.getTaxonomyForTaxonomyResult(corpus, taxonomyResult.keySet());
            }

            // String sep = "";
            for (String s : tax) {
                if (corpus.getTaxonomy().size() == 0 || s == null) {
                    continue;
                }
                // info.put(sep = sep + " ", s);
                if (uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)) == null) {
                    info.put(s, "");
                    continue;
                }
                int n = uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)).intValue();
                if (n == 0) {
                    info.put(s, "");
                } else {
                    info.put(s, String.valueOf(n));
                }
            }
        }

        info.put(I18N.get("exportHeader.minOccurrences"), String.valueOf(filter.getMinimalOccurrences()));
        info.put(I18N.get("exportHeader.minTaxonomies"), String.valueOf(filter.getMinimalTaxonomy()));

        if (corpus.getCorpusType() == CorpusType.SOLAR) {
            HashMap<String, HashSet<String>> filters = corpus.getSolarSelectedFilters();

            if (!isEmpty(filters)) {
                info.put(I18N.get("exportHeader.additionalFilters"), "");
                for (Map.Entry<String, HashSet<String>> f : filters.entrySet()) {
                    info.put(I18N.get(f.getKey() + "L"), StringUtils.join(f.getValue(), ", "));
                }
            }
        }

        return info;
    }
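    /*
     * Association ("collocability") measures computed by updateCalculateCollocabilities()
     * below, written out as implemented there:
     *
     *   O        observed frequency of the n-gram
     *   f(w_i)   frequency of the i-th word of the n-gram (from the one-word run)
     *   N        total number of words, n = n-gram length
     *   E        expected frequency = (f(w_1) * ... * f(w_n)) / N^(n-1)
     *
     *   Dice      = n * O / (f(w_1) + ... + f(w_n))
     *   t-score   = (O - E) / sqrt(O)
     *   MI        = log2(O / E)
     *   MI3       = log2(O^3 / E)
     *   logDice   = 14 + log2(Dice)
     *   simple LL = 2 * (O * log10(O / E) - (O - E))
     */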
    public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) {
        Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> oneWordTaxonomyResult = oneWordStatistics.getTaxonomyResult();
        Map<Collocability, Map<MultipleHMKeys, Double>> collocabilityMap = new ConcurrentHashMap<>();

        for (Collocability c : filter.getCollocability()) {
            collocabilityMap.put(c, new ConcurrentHashMap<>());
        }

        // count the number of all words
        long N = 0;
        for (AtomicLong a : oneWordTaxonomyResult.get(corpus.getTotal()).values()) {
            N += a.longValue();
        }

        for (MultipleHMKeys hmKey : taxonomyResult.get(corpus.getTotal()).keySet()) {
            long sum_fwi = 0L;
            long mul_fwi = 1L;

            for (MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()) {
                sum_fwi += oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue();
                mul_fwi *= oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue();
            }

            // guard against overflow of the frequency product
            if (mul_fwi < 0) {
                mul_fwi = Long.MAX_VALUE;
            }

            double O = (double) taxonomyResult.get(corpus.getTotal()).get(hmKey).longValue();
            double n = (double) filter.getNgramValue();
            double E = (double) mul_fwi / Math.pow(N, n - 1);

            if (collocabilityMap.keySet().contains(Collocability.DICE)) {
                double dice_value = n * O / sum_fwi;
                collocabilityMap.get(Collocability.DICE).put(hmKey, dice_value);
            }
            if (collocabilityMap.keySet().contains(Collocability.TSCORE)) {
                double t_score = (O - E) / Math.sqrt(O);
                collocabilityMap.get(Collocability.TSCORE).put(hmKey, t_score);
            }
            if (collocabilityMap.keySet().contains(Collocability.MI)) {
                double MI = Math.log(O / E) / Math.log(2);
                collocabilityMap.get(Collocability.MI).put(hmKey, MI);
            }
            if (collocabilityMap.keySet().contains(Collocability.MI3)) {
                double MI3 = Math.log(Math.pow(O, 3.0) / E) / Math.log(2);
                collocabilityMap.get(Collocability.MI3).put(hmKey, MI3);
            }
            if (collocabilityMap.keySet().contains(Collocability.LOGDICE)) {
                double dice_value = n * O / sum_fwi;
                double log_dice = 14 + Math.log(dice_value) / Math.log(2);
                collocabilityMap.get(Collocability.LOGDICE).put(hmKey, log_dice);
            }
            if (collocabilityMap.keySet().contains(Collocability.SIMPLELL)) {
                double simple_ll = 2 * (O * Math.log10(O / E) - (O - E));
                collocabilityMap.get(Collocability.SIMPLELL).put(hmKey, simple_ll);
            }
        }

        for (Collocability c : collocabilityMap.keySet()) {
            collocability.put(c, collocabilityMap.get(c));
        }
    }

    public Map<Collocability, Map<MultipleHMKeys, Double>> getCollocability() {
        return this.collocability;
    }
}
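/*
 * Minimal usage sketch (illustrative only; assumes a Corpus and a Filter have
 * already been configured elsewhere, e.g. through the GUI):
 *
 *   StatisticsNew stats = new StatisticsNew(corpus, filter, false);
 *   // during corpus traversal, for each extracted key and its taxonomies:
 *   //   stats.updateUniGramOccurrences(1, taxonomies);
 *   //   stats.updateTaxonomyResults(key, taxonomies);
 *   stats.saveResultToDisk();   // may throw UnsupportedEncodingException
 */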