package data; import static gui.ValidationUtil.*; import java.io.UnsupportedEncodingException; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import alg.inflectedJOS.WordFormation; import data.Enums.WordLevelType; import javafx.collections.ObservableList; import util.Export; import util.Util; import util.db.RDB; @SuppressWarnings("Duplicates") public class StatisticsNew { public final static Logger logger = LogManager.getLogger(StatisticsNew.class); private Corpus corpus; private Filter filter; private String resultTitle; private Map result; private Map> taxonomyResult; private Object[][] resultCustom; // for when calculating percentages that don't add up to 100% private Map> resultNestedSuffix; private Map> resultNestedPrefix; private boolean useDB; private RDB db; private boolean analysisProducedResults; private LocalDateTime timeBeginning; private LocalDateTime timeEnding; private Map> collocability; private AtomicLong uniGramOccurrences; public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) { this.corpus = corpus; this.filter = filter; this.taxonomyResult = new ConcurrentHashMap<>(); this.taxonomyResult.put("Total", new ConcurrentHashMap<>()); this.collocability = new ConcurrentHashMap<>(); this.uniGramOccurrences = new AtomicLong(0L); // create table for counting word occurrences per taxonomies if (this.corpus.getTaxonomy() != null && filter.getDisplayTaxonomy()) { if (this.filter.getTaxonomy().isEmpty()) { for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) { 
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>()); } } else { for (int i = 0; i < this.filter.getTaxonomy().size(); i++) { Tax taxonomy = new Tax(); this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>()); } } } if (useDB) { this.useDB = true; db = new RDB(); } if (filter.getAl() == AnalysisLevel.WORD_LEVEL) { resultNestedSuffix = new ConcurrentHashMap<>(); resultNestedPrefix = new ConcurrentHashMap<>(); } else { result = new ConcurrentHashMap<>(); } this.timeBeginning = LocalDateTime.now(); // resultTitle = generateResultTitle(); logger.debug(toString()); } /** * Result's title consists of: *
     * <ul>
     *     <li>Corpus type</li>
     *     <li>Analysis level</li>
     *     <li>Calculate for</li>
     *     <li>further filter-dependent fields (n-gram level, skip value, ...)</li>
     *     <li>timestamp of the analysis end</li>
     * </ul>
* * @return */ public String generateResultTitle() { String separator = "_"; StringBuilder sb = new StringBuilder(); if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { Integer ngramLevel = filter.getNgramValue(); if(ngramLevel == 0) { sb.append(corpus.getCorpusType().toString()) .append(separator) .append("crke") .append(separator) .append(filter.getCalculateFor()) .append(separator); } else if(ngramLevel == 1) { if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) { sb.append(corpus.getCorpusType().toString()) .append(separator) .append("besedni-deli") .append(separator) .append(filter.getCalculateFor()) .append(separator); } else { sb.append(corpus.getCorpusType().toString()) .append(separator) .append("besede") .append(separator) .append(filter.getCalculateFor()) .append(separator); } } else { sb.append(filter.getAl().toString()) .append(separator) .append(corpus.getCorpusType().toString()) .append(separator); sb.append(filter.getCalculateFor().toString()) .append(separator); // ngram value sb.append(filter.getNgramValue()).append("-gram") .append(separator); sb.append(filter.getSkipValue()).append("-preskok") .append(separator); } // TODO: assure skip is not null but zero } else { sb.append(filter.getAl().toString()) // analysis level .append(separator) .append(corpus.getCorpusType().toString()) .append(separator); } // skip value // msd ? 
// if taxonomy -> taxonomy // if cvv -> cvv + dolžina sb.append(getTimeEnding()); return sb.toString(); } public void setTimeEnding(){ this.timeEnding = LocalDateTime.now(); } public String getTimeEnding(){ return timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")); } public boolean isAnalysisProducedResults() { return analysisProducedResults; } public void setAnalysisProducedResults(boolean analysisProducedResults) { this.analysisProducedResults = analysisProducedResults; } public String toString() { String newLine = "\n\t- "; StringBuilder sb = new StringBuilder(); sb.append(newLine).append("Statistic properties:"); sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size())); sb.append(newLine).append(useDB ? "use DB" : "run in memory"); sb.append(newLine).append(filter.toString()); return sb.toString(); } public String getResultTitle() { return resultTitle; } // **************************************** // ***************** util ***************** // **************************************** /** * Stores results from this batch to a database and clears results map */ public void storeTmpResultsToDB() { try { db.writeBatch(result); result = new ConcurrentHashMap<>(); } catch (UnsupportedEncodingException e) { logger.error("Store tmp results to DB", e); // e.printStackTrace(); } } public Filter getFilter() { return filter; } public Corpus getCorpus() { return corpus; } public boolean saveResultToDisk(int... 
limit) throws UnsupportedEncodingException { Set>> stats = new HashSet<>(); if (useDB) { result = db.getDump(); db.delete(); } // if no results and nothing to save, return false if (!(taxonomyResult.get("Total").size() > 0)) { analysisProducedResults = false; return false; } else { analysisProducedResults = true; } removeMinimalOccurrences(taxonomyResult.get("Total"), filter.getMinimalOccurrences()); removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy()); stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit)))); Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter); return true; } /** * Removes lines, where number of different taxonomies is lower than specified number (minimalTaxonomy) */ private void removeMinimalTaxonomy(Map> taxonomyResult, Integer minimalTaxonomy) { if (minimalTaxonomy == 1) return; int occurances; for (MultipleHMKeys key : taxonomyResult.get("Total").keySet()){ occurances = 0; for (String columnNameKey : taxonomyResult.keySet()){ if(!columnNameKey.equals("Total") && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1) occurances++; } if(occurances < minimalTaxonomy){ taxonomyResult.get("Total").remove(key); } } } /** * Removes lines where total number of occurrences is lower than specified number (minimalOccurrences) */ private void removeMinimalOccurrences(Map taxonomyResultTotal, Integer minimalOccurrences) { if (minimalOccurrences == 0) return; for (MultipleHMKeys key : taxonomyResultTotal.keySet()){ if(taxonomyResultTotal.get(key).intValue() < minimalOccurrences){ taxonomyResultTotal.remove(key); } } } public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException { resultTitle = generateResultTitle(); if (useDB) { result = db.getDump(); db.delete(); } Map>> results = new HashMap<>(); // UNCOMMENT!!!!!! 
// if (!isEmpty(resultNestedSuffix)) { // results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit))); // } // // if (!isEmpty(resultNestedPrefix)) { // results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit))); // } // if no results and nothing to save, return false if (!(results.size() > 0)) { analysisProducedResults = false; return false; } else { analysisProducedResults = true; } Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock()); return true; } public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException { filter.setAl(AnalysisLevel.WORD_FORMATION); resultTitle = generateResultTitle(); if (useDB) { result = db.getDump(); db.delete(); } // if no results and nothing to save, return false if (!(result.size() > 0)) { analysisProducedResults = false; return false; } else { analysisProducedResults = true; } WordFormation.calculateStatistics(this); Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock()); return true; } private Map> sortNestedMap(Map> nestedMap, int limit) { Map> sorted = new HashMap<>(); for (String s : nestedMap.keySet()) { sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit))); } return sorted; } private Map getSortedResult(Map map, int limit) { return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit); } public void updateUniGramOccurrences(int amount){ uniGramOccurrences.set(uniGramOccurrences.get() + amount); } public long getUniGramOccurrences(){ return uniGramOccurrences.longValue(); } public void updateTaxonomyResults(MultipleHMKeys o, List taxonomy) { for (String key : taxonomyResult.keySet()) { // first word should have the same taxonomy as others if (key.equals("Total") || taxonomy.contains(key)) { // if (key.equals("Total") || taxonomy != null && taxonomy.contains(key)) { // if taxonomy not in map and in this word AtomicLong r = 
taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1)); if (r != null) taxonomyResult.get(key).get(o).incrementAndGet(); } else { // if taxonomy not in map and not in this word AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0)); } } // if not in map // else } public Map> getTaxonomyResult() { return taxonomyResult; } public void updateResults(String o) { // if not in map AtomicLong r = result.putIfAbsent(o, new AtomicLong(1)); // else if (r != null) result.get(o).incrementAndGet(); } public Map getResult() { return result; } public Object[][] getResultCustom() { return resultCustom; } public void setResultCustom(Object[][] resultCustom) { this.resultCustom = resultCustom; } public void updateResultsNested(WordLevelType type, String key, String stringValue) { ConcurrentHashMap> resultsMap; if (type == WordLevelType.SUFFIX) { updateResultsNestedSuffix(key, stringValue); } else if (type == WordLevelType.PREFIX) { updateResultsNestedPrefix(key, stringValue); } } public void updateResultsNestedSuffix(String key, String stringValue) { MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue); if (resultNestedSuffix.containsKey(key)) { // if not in map AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1)); // else if (r != null) { resultNestedSuffix.get(key).get(stringValue).incrementAndGet(); } } else { resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>()); AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1)); if (r != null) { resultNestedSuffix.get(key).get(stringValue).incrementAndGet(); } } } public void updateResultsNestedPrefix(String key, String stringValue) { MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue); if (resultNestedPrefix.containsKey(key)) { // if not in map AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1)); // else if (r != null) { 
resultNestedPrefix.get(key).get(stringValue).incrementAndGet(); } } else { resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>()); AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1)); if (r != null) { resultNestedPrefix.get(key).get(stringValue).incrementAndGet(); } } } private LinkedHashMap headerInfoBlock() { LinkedHashMap info = new LinkedHashMap<>(); info.put("Korpus:", corpus.getCorpusType().toString()); setTimeEnding(); info.put("Datum:", timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm"))); if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { Integer ngramLevel = filter.getNgramValue(); if (ngramLevel == 0) info.put("Analiza:", "Črke"); else if (ngramLevel == 1) { // if suffixes or prefixes are not null print word parts if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { info.put("Analiza:", "Besedni deli"); } else { info.put("Analiza:", "Besede"); } } else info.put("Analiza:", filter.getAl().toString()); } else { info.put("Analiza:", filter.getAl().toString()); } if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { Integer ngramLevel = filter.getNgramValue(); // n.gram nivo if (ngramLevel > 1) { info.put("n-gram nivo:", String.valueOf(ngramLevel)); } // skip if (ngramLevel > 1) info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? 
filter.getSkipValue().toString() : "0"); // calculate for info.put("Izračunaj za:", filter.getCalculateFor().toString()); // also write if (filter.getMultipleKeys().size() > 0){ StringBuilder mk = new StringBuilder(); for (CalculateFor s : filter.getMultipleKeys()) { mk.append(s.toString()).append("; "); } info.put("Izpiši tudi: ", String.join("; ", mk.substring(0, mk.length() - 2))); } // time elapsed // setTimeEnding(); long seconds = ChronoUnit.MILLIS.between(timeBeginning, timeEnding) / 1000; info.put("Čas izvajanja:", String.valueOf(seconds) + " s"); // data limitations if (filter.getDisplayTaxonomy()){ info.put("Izpiši taksonomije: ", "Da"); } else { info.put("Izpiši taksonomije: ", "Ne"); } // note punctuations - ngram > 1 if(ngramLevel > 1) { if (filter.getNotePunctuations()) { info.put("Upoštevaj ločila: ", "Da"); } else { info.put("Upoštevaj ločila: ", "Ne"); } } // also write - n - gram > 1 if (ngramLevel > 1 && filter.getCollocability().size() > 0){ StringBuilder mk = new StringBuilder(); for (Collocability s : filter.getCollocability()) { mk.append(s.toString()).append("; "); } info.put("Kolokabilnost: ", String.join("; ", mk.substring(0, mk.length() - 2))); } // fragmented MSD - n-gram = 1 if (info.get("Analiza:").equals("Besede")){ if (filter.getWriteMsdAtTheEnd()){ info.put("Izpiši razbit MSD: ", "Da"); } else { info.put("Izpiši razbit MSD: ", "Ne"); } } if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { if (filter.getPrefixLength() > 0 || filter.getSuffixLength() > 0) { info.put("Dolžina predpone: ", String.valueOf(filter.getPrefixLength())); info.put("Dolžina pripone: ", String.valueOf(filter.getSuffixLength())); } else { info.put("Seznam predpon: ", String.join("; ", filter.getPrefixList())); info.put("Seznam pripon: ", String.join("; ", filter.getSuffixList())); } } // msd if (!isEmpty(filter.getMsd())) { StringBuilder msdPattern = new StringBuilder(); 
for (Pattern pattern : filter.getMsd()) { msdPattern.append(pattern.toString()).append(" "); } info.put("MSD:", msdPattern.toString()); } } if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { ArrayList tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy()); info.put("Taksonomija: ", ""); String sep = ""; for (String s : tax) { info.put(sep = sep + " ", s); } } info.put("Min. št. pojavitev: ", String.valueOf(filter.getMinimalOccurrences())); info.put("Min. št. taksonomij: ", String.valueOf(filter.getMinimalTaxonomy())); if (corpus.getCorpusType() == CorpusType.SOLAR) { HashMap> filters = corpus.getSolarFilters(); if (!isEmpty(filters)) { info.put("Dodatni filtri: ", ""); for (Map.Entry> f : filters.entrySet()) { info.put(f.getKey(), StringUtils.join(f.getValue(), ", ")); } } } return info; } public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) { Map> oneWordTaxonomyResult = oneWordStatistics.getTaxonomyResult(); Map> collocabilityMap = new ConcurrentHashMap<>(); for(Collocability c : filter.getCollocability()){ collocabilityMap.put(c, new ConcurrentHashMap<>()); } // count number of all words long N = 0; for(AtomicLong a : oneWordTaxonomyResult.get("Total").values()){ N += a.longValue(); } for(MultipleHMKeys hmKey : taxonomyResult.get("Total").keySet()) { // String[] splitedString = hmKey.getK1().split("\\s+"); long sum_fwi =0L; long mul_fwi =1L; for(MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()){ // System.out.println(smallHmKey.getK1()); sum_fwi += oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue(); mul_fwi *= oneWordTaxonomyResult.get("Total").get(smallHmKey).longValue(); } // String t = hmKey.getK1(); // if(hmKey.getK1().equals("v Slovenija")){ // System.out.println("TEST"); // // } double O = (double)taxonomyResult.get("Total").get(hmKey).longValue(); double n = (double)filter.getNgramValue(); double E = (double)mul_fwi / Math.pow(N, n 
- 1); if (collocabilityMap.keySet().contains(Collocability.DICE)){ double dice_value = n * O / sum_fwi; collocabilityMap.get(Collocability.DICE).put(hmKey, dice_value); } if (collocabilityMap.keySet().contains(Collocability.TSCORE)){ double t_score = (O - E) / Math.sqrt(O); collocabilityMap.get(Collocability.TSCORE).put(hmKey, t_score); } if (collocabilityMap.keySet().contains(Collocability.MI)){ double MI = Math.log(O / E) / Math.log(2); collocabilityMap.get(Collocability.MI).put(hmKey, MI); } if (collocabilityMap.keySet().contains(Collocability.MI3)){ double MI3 = Math.log(Math.pow(O, 3.0) / E) / Math.log(2); collocabilityMap.get(Collocability.MI3).put(hmKey, MI3); } if (collocabilityMap.keySet().contains(Collocability.LOGDICE)){ double dice_value = n * O / sum_fwi; double log_dice = 14 + Math.log(dice_value) / Math.log(2); collocabilityMap.get(Collocability.LOGDICE).put(hmKey, log_dice); } if (collocabilityMap.keySet().contains(Collocability.SIMPLELL)){ double simple_ll = 2 * (O * Math.log10(O / E) - (O - E)); collocabilityMap.get(Collocability.SIMPLELL).put(hmKey, simple_ll); } } for(Collocability c : collocabilityMap.keySet()){ collocability.put(c, collocabilityMap.get(c)); } } public Map> getCollocability(){ return this.collocability; } }