package data;

import static gui.ValidationUtil.*;

import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;

/**
 * Accumulates the counters of one analysis run (plain results, per-taxonomy
 * results, nested prefix/suffix results and collocability values) and exports
 * them through {@link Export}.
 *
 * <p>NOTE(review): the generic type arguments in this class were reconstructed
 * from how each map is used in this file (for example {@code "Total"} keys and
 * {@link AtomicLong} values). Confirm them against {@code Util}, {@code Export},
 * {@code RDB} and {@code Filter} before merging.
 */
@SuppressWarnings("Duplicates")
public class StatisticsNew {
    public final static Logger logger = LogManager.getLogger(StatisticsNew.class);

    /** Key of the taxonomy column that aggregates every occurrence. */
    private static final String TOTAL = "Total";

    // 24-hour clock ("HH"): the previous "hh" pattern has no AM/PM marker, so a
    // morning and an afternoon run produced the same timestamp.
    private static final DateTimeFormatter TITLE_TIMESTAMP = DateTimeFormatter.ofPattern("dd.MM.yyyy_HH.mm.ss");
    private static final DateTimeFormatter HEADER_TIMESTAMP = DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm");

    private Corpus corpus;
    private Filter filter;

    private String resultTitle;
    private Map<String, AtomicLong> result;
    private Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
    private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedSuffix;
    private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedPrefix;
    private boolean useDB;
    private RDB db;
    private boolean analysisProducedResults;
    private LocalDateTime time;
    // NOTE(review): key type assumed to be data.Collocability (element type of filter.getCollocability()).
    private Map<Collocability, Map<MultipleHMKeys, Double>> collocability;

    /**
     * Creates the result containers for one analysis run.
     *
     * @param corpus corpus being analysed
     * @param filter analysis settings
     * @param useDB  when {@code true} an {@link RDB} instance is created for intermediate results
     */
    public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
        this.corpus = corpus;
        this.filter = filter;
        this.taxonomyResult = new ConcurrentHashMap<>();
        this.taxonomyResult.put(TOTAL, new ConcurrentHashMap<>());
        this.collocability = new ConcurrentHashMap<>();

        // create table for counting word occurrences per taxonomies
        if (this.corpus.getTaxonomy() != null && filter.getDisplayTaxonomy()) {
            if (this.filter.getTaxonomy().isEmpty()) {
                // no taxonomy selected in the filter: one column per corpus taxonomy
                for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
                    this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
                }
            } else {
                // one column per selected taxonomy, keyed by its long name
                Tax taxonomy = new Tax();
                for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
                    this.taxonomyResult.put(
                            taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)),
                            new ConcurrentHashMap<>());
                }
            }
        }

        this.useDB = useDB;
        if (useDB) {
            db = new RDB();
        }

        if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
            resultNestedSuffix = new ConcurrentHashMap<>();
            resultNestedPrefix = new ConcurrentHashMap<>();
        } else {
            result = new ConcurrentHashMap<>();
        }

        resultTitle = generateResultTitle();

        logger.debug(toString());
    }

    /**
     * Builds the result title from "_"-separated parts followed by a timestamp.
     * <ul>
     * <li>String level, n-gram value 0 or 1: corpus type, "crke" or "besede", calculate-for</li>
     * <li>String level, larger n-gram value: analysis level, corpus type, calculate-for,
     * n-gram value and skip value</li>
     * <li>Any other analysis level: analysis level, corpus type</li>
     * </ul>
     * The timestamp is taken on the first call and reused afterwards.
     *
     * @return the generated title
     */
    private String generateResultTitle() {
        String separator = "_";
        StringBuilder sb = new StringBuilder();

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            int ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0 || ngramLevel == 1) {
                sb.append(corpus.getCorpusType().toString())
                        .append(separator)
                        .append(ngramLevel == 0 ? "crke" : "besede")
                        .append(separator)
                        .append(filter.getCalculateFor())
                        .append(separator);
            } else {
                sb.append(filter.getAl().toString())
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
                sb.append(filter.getCalculateFor().toString())
                        .append(separator);
                // ngram value
                sb.append(ngramLevel).append("-gram")
                        .append(separator);
                // TODO: assure skip is not null but zero
                sb.append(filter.getSkipValue()).append("-preskok")
                        .append(separator);
            }
        } else {
            sb.append(filter.getAl().toString()) // analysis level
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        }

        // Not yet part of the title: msd, taxonomy, cvv + length.

        if (this.time == null) {
            this.time = LocalDateTime.now();
        }
        sb.append(time.format(TITLE_TIMESTAMP));
        return sb.toString();
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }

    public void setAnalysisProducedResults(boolean analysisProducedResults) {
        this.analysisProducedResults = analysisProducedResults;
    }

    /**
     * @return a multi-line description of the corpus type, number of detected files,
     *         storage mode and filter
     */
    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Statistic properties:");
        sb.append(newLine).append(corpus.getCorpusType().toString())
                .append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
        sb.append(newLine).append(useDB ? "use DB" : "run in memory");
        sb.append(newLine).append(filter.toString());
        return sb.toString();
    }

    public String getResultTitle() {
        return resultTitle;
    }

    // ****************************************
    // ***************** util *****************
    // ****************************************

    /**
     * Stores results from this batch to a database and clears results map
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            logger.error("Store tmp results to DB", e);
        }
    }

    public Filter getFilter() {
        return filter;
    }

    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Applies the minimal-occurrence and minimal-taxonomy filters to the "Total"
     * column, sorts it and exports it to CSV.
     *
     * @param limit optional limit, passed through {@code Util.getValidInt}
     * @return {@code false} when the "Total" column is empty and nothing was exported
     * @throws UnsupportedEncodingException propagated from the DB dump or the export
     */
    public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        // NOTE(review): value type Long assumed for the output of Util.sortByValue.
        Set<Pair<String, Map<MultipleHMKeys, Long>>> stats = new HashSet<>();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if no results and nothing to save, return false
        Map<MultipleHMKeys, AtomicLong> total = taxonomyResult.get(TOTAL);
        analysisProducedResults = !total.isEmpty();
        if (!analysisProducedResults) {
            return false;
        }

        removeMinimalOccurrences(total, filter.getMinimalOccurrences());
        removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());
        stats.add(ImmutablePair.of(resultTitle, getSortedResult(total, Util.getValidInt(limit))));
        Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter);
        return true;
    }

    /**
     * Removes lines, where number of different taxonomies is lower than specified number (minimalTaxonomy)
     */
    private void removeMinimalTaxonomy(Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
        if (minimalTaxonomy == 1) {
            return;
        }
        taxonomyResult.get(TOTAL).keySet().removeIf(key -> {
            int occurrences = 0;
            for (Map.Entry<String, Map<MultipleHMKeys, AtomicLong>> column : taxonomyResult.entrySet()) {
                if (TOTAL.equals(column.getKey())) {
                    continue;
                }
                // a key missing from a column counts as "not present in that taxonomy"
                AtomicLong count = column.getValue().get(key);
                if (count != null && count.get() >= 1) {
                    occurrences++;
                }
            }
            return occurrences < minimalTaxonomy;
        });
    }

    /**
     * Removes lines where total number of occurrences is lower than specified number (minimalOccurrences)
     */
    private void removeMinimalOccurrences(Map<MultipleHMKeys, AtomicLong> taxonomyResultTotal, Integer minimalOccurrences) {
        if (minimalOccurrences == 0) {
            return;
        }
        taxonomyResultTotal.values().removeIf(count -> count.get() < minimalOccurrences);
    }

    /**
     * Exports nested (prefix/suffix) results.
     *
     * <p>The code that fills {@code results} is currently disabled, so the map is
     * always empty and this method always returns {@code false}.
     *
     * @param limit optional limit, passed through {@code Util.getValidInt} once re-enabled
     * @return {@code false} when there is nothing to export
     * @throws UnsupportedEncodingException propagated from the DB dump or the export
     */
    public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }
        // NOTE(review): innermost key type assumed to be String (what Export.nestedMapToCSV
        // presumably expects); sortNestedMap yields MultipleHMKeys keys, which may be why
        // the section below is disabled. Confirm before re-enabling.
        Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();

        // UNCOMMENT!!!!!!
        // if (!isEmpty(resultNestedSuffix)) {
        //     results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
        // }
        //
        // if (!isEmpty(resultNestedPrefix)) {
        //     results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
        // }

        // if no results and nothing to save, return false
        analysisProducedResults = !results.isEmpty();
        if (!analysisProducedResults) {
            return false;
        }

        Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    /**
     * Switches the filter to {@code WORD_FORMATION}, regenerates the title, lets
     * {@link WordFormation#calculateStatistics} process this object and exports
     * {@code resultCustom}.
     *
     * @return {@code false} when {@code result} is empty and nothing was exported
     * @throws UnsupportedEncodingException propagated from the DB dump or the export
     */
    public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
        filter.setAl(AnalysisLevel.WORD_FORMATION);
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if no results and nothing to save, return false
        analysisProducedResults = !result.isEmpty();
        if (!analysisProducedResults) {
            return false;
        }

        WordFormation.calculateStatistics(this);

        Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    /** Sorts every inner map of {@code nestedMap} with {@link #getSortedResult}. */
    private Map<String, Map<MultipleHMKeys, Long>> sortNestedMap(Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> nestedMap, int limit) {
        Map<String, Map<MultipleHMKeys, Long>> sorted = new HashMap<>();

        for (Map.Entry<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> entry : nestedMap.entrySet()) {
            sorted.put(entry.getKey(), getSortedResult(entry.getValue(), Util.getValidInt(limit)));
        }

        return sorted;
    }

    /** Converts the atomic counters and sorts them by value via {@code Util}. */
    private Map<MultipleHMKeys, Long> getSortedResult(Map<MultipleHMKeys, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    /**
     * Counts one occurrence of {@code o}: increments it in "Total" and in every
     * taxonomy column listed in {@code taxonomy}; all other columns get a zero
     * entry so that each key exists in each column.
     *
     * @param o        key to count
     * @param taxonomy taxonomies of the occurrence; {@code null} is treated as "none"
     */
    public void updateTaxonomyResults(MultipleHMKeys o, List<String> taxonomy) {
        for (Map.Entry<String, Map<MultipleHMKeys, AtomicLong>> column : taxonomyResult.entrySet()) {
            String key = column.getKey();
            // first word should have the same taxonomy as others
            if (key.equals(TOTAL) || (taxonomy != null && taxonomy.contains(key))) {
                AtomicLong existing = column.getValue().putIfAbsent(o, new AtomicLong(1));
                if (existing != null) {
                    // already present: bump the counter we were handed instead of a second lookup
                    existing.incrementAndGet();
                }
            } else {
                column.getValue().putIfAbsent(o, new AtomicLong(0));
            }
        }
    }

    public Map<String, Map<MultipleHMKeys, AtomicLong>> getTaxonomyResult() {
        return taxonomyResult;
    }

    /** Counts one occurrence of {@code o} in the plain result map. */
    public void updateResults(String o) {
        AtomicLong existing = result.putIfAbsent(o, new AtomicLong(1));
        if (existing != null) {
            existing.incrementAndGet();
        }
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public Object[][] getResultCustom() {
        return resultCustom;
    }

    public void setResultCustom(Object[][] resultCustom) {
        this.resultCustom = resultCustom;
    }

    /** Dispatches to the suffix or prefix counter; other types are ignored. */
    public void updateResultsNested(WordLevelType type, String key, String stringValue) {
        if (type == WordLevelType.SUFFIX) {
            updateResultsNestedSuffix(key, stringValue);
        } else if (type == WordLevelType.PREFIX) {
            updateResultsNestedPrefix(key, stringValue);
        }
    }

    /** Counts one occurrence of {@code stringValue} under suffix {@code key}. */
    public void updateResultsNestedSuffix(String key, String stringValue) {
        incrementNested(resultNestedSuffix, key, stringValue);
    }

    /** Counts one occurrence of {@code stringValue} under prefix {@code key}. */
    public void updateResultsNestedPrefix(String key, String stringValue) {
        incrementNested(resultNestedPrefix, key, stringValue);
    }

    /**
     * Increments the counter for {@code stringValue} in the inner map stored under
     * {@code key}, creating the inner map and the counter on first use.
     *
     * <p>The counter is looked up with the {@link MultipleHMKeys} wrapper. The earlier
     * code looked it up with the raw {@code String}, which is not the inner map's key
     * type, so the lookup returned {@code null} and the increment failed with a
     * {@link NullPointerException} on every repeated value.
     */
    private static void incrementNested(Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> target, String key, String stringValue) {
        MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue);
        target.computeIfAbsent(key, k -> new ConcurrentHashMap<>())
                .computeIfAbsent(mkStringValue, k -> new AtomicLong())
                .incrementAndGet();
    }

    /**
     * Builds the ordered key/value block written at the top of an export: corpus,
     * date, analysis, string-level settings, selected taxonomy and Solar filters.
     */
    private LinkedHashMap<String, String> headerInfoBlock() {
        LinkedHashMap<String, String> info = new LinkedHashMap<>();

        info.put("Korpus:", corpus.getCorpusType().toString());
        info.put("Datum:", time.format(HEADER_TIMESTAMP));

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            int ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0) {
                info.put("Analiza", "Črke");
            } else if (ngramLevel == 1) {
                info.put("Analiza", "Besede");
            } else {
                info.put("Analiza", filter.getAl().toString());
            }

            // n-gram level and skip value are only reported for real n-grams
            if (ngramLevel > 1) {
                info.put("n-gram nivo:", String.valueOf(ngramLevel));
                info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
            }

            // calculate for
            info.put("Izračunaj za:", filter.getCalculateFor().toString());

            // msd
            if (!isEmpty(filter.getMsd())) {
                StringBuilder msdPattern = new StringBuilder();
                for (Pattern pattern : filter.getMsd()) {
                    msdPattern.append(pattern.toString()).append(" ");
                }
                info.put("MSD:", msdPattern.toString());
            }
        } else {
            info.put("Analiza", filter.getAl().toString());
        }

        if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            List<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());

            info.put("Taksonomija: ", "");
            // each taxonomy line gets a key of one more space than the previous one,
            // which keeps the keys unique inside the map
            String sep = "";
            for (String s : tax) {
                info.put(sep = sep + " ", s);
            }
        }

        if (corpus.getCorpusType() == CorpusType.SOLAR) {
            // Wildcard value type: only iteration is needed here, so this does not
            // depend on the concrete collection type returned by getSolarFilters().
            Map<String, ? extends Iterable<?>> filters = corpus.getSolarFilters();

            if (!isEmpty(filters)) {
                info.put("Dodatni filtri: ", "");

                for (Map.Entry<String, ? extends Iterable<?>> f : filters.entrySet()) {
                    info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
                }
            }
        }

        return info;
    }

    /**
     * Computes, for every n-gram in this object's "Total" column, the value
     * {@code ngramValue * f(ngram) / sum(f(word_i))}, where the word frequencies
     * come from the "Total" column of {@code oneWordStatistics}, and stores the
     * result under the first collocability measure of the filter.
     *
     * <p>Words missing from {@code oneWordStatistics} contribute 0; when the sum of
     * word frequencies is 0 the stored value is 0.0 rather than infinity or NaN.
     *
     * @param oneWordStatistics statistics holding single-word frequencies
     */
    public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) {
        Map<MultipleHMKeys, AtomicLong> wordTotals = oneWordStatistics.getTaxonomyResult().get(TOTAL);
        Map<MultipleHMKeys, Double> collocabilityMap = new ConcurrentHashMap<>();
        double ngramValue = filter.getNgramValue();

        for (Map.Entry<MultipleHMKeys, AtomicLong> ngram : taxonomyResult.get(TOTAL).entrySet()) {
            long sumFwi = 0L;
            for (MultipleHMKeys word : ngram.getKey().getSplittedMultipleHMKeys()) {
                AtomicLong wordCount = wordTotals.get(word);
                if (wordCount != null) {
                    sumFwi += wordCount.get();
                }
            }
            double diceValue = sumFwi == 0L ? 0.0 : ngramValue * (double) ngram.getValue().get() / sumFwi;
            collocabilityMap.put(ngram.getKey(), diceValue);
        }

        collocability.put(filter.getCollocability().get(0), collocabilityMap);
    }

    public Map<Collocability, Map<MultipleHMKeys, Double>> getCollocability() {
        return this.collocability;
    }
}