package data;

import static gui.ValidationUtil.*;

import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;

@SuppressWarnings("Duplicates")
public class StatisticsNew {
    public final static Logger logger = LogManager.getLogger(StatisticsNew.class);

    private Corpus corpus;
    private Filter filter;

    private String resultTitle;
    private Map<String, AtomicLong> result;
    private Map<String, Map<String, AtomicLong>> taxonomyResult;
    private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
    private Map<String, Map<String, AtomicLong>> resultNestedSuffix;
    private Map<String, Map<String, AtomicLong>> resultNestedPrefix;
    private boolean useDB;
    private RDB db;
    private boolean analysisProducedResults;
    private LocalDateTime time;

    public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
        this.corpus = corpus;
        this.filter = filter;
        this.taxonomyResult = new ConcurrentHashMap<>();

        // create a table for counting word occurrences per taxonomy
        if (this.filter.getTaxonomy().isEmpty()) {
            for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
                this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
            }
        } else {
            for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
                Tax taxonomy = new Tax();
                this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
            }
        }

        if (useDB) {
            this.useDB = true;
            db = new RDB();
        }

        if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
            resultNestedSuffix = new ConcurrentHashMap<>();
            resultNestedPrefix = new ConcurrentHashMap<>();
        } else {
            result = new ConcurrentHashMap<>();
        }

        resultTitle = generateResultTitle();
        logger.debug(toString());
    }
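    /*
     * Usage sketch (hypothetical caller; `someCorpus` and `someFilter` are assumed
     * to be fully initialised elsewhere, e.g. by the GUI layer):
     *
     *   StatisticsNew stats = new StatisticsNew(someCorpus, someFilter, false);
     *   stats.updateResults("gram");               // first occurrence -> count 1
     *   stats.updateResults("gram");               // repeat occurrence -> count 2
     *   boolean saved = stats.saveResultToDisk();  // writes a CSV; false if nothing was counted
     */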
    /**
     * Result's title consists of:
     * <ul>
     *     <li>Corpus type</li>
     *     <li>Analysis level</li>
     *     <li>Calculate for</li>
     *     <li>n-gram value and skip value (string-level analyses only)</li>
     *     <li>timestamp</li>
     * </ul>
     *
     * @return the generated title, with its fields joined by "_"
     */
    private String generateResultTitle() {
        String separator = "_";
        StringBuilder sb = new StringBuilder();

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0) {
                sb.append("Crke")
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
            } else if (ngramLevel == 1) {
                sb.append("Besede")
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
            } else {
                sb.append(filter.getAl().toString())
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
                sb.append(filter.getCalculateFor().toString())
                        .append(separator);
                // ngram value
                sb.append(filter.getNgramValue()).append("-gram")
                        .append(separator);
                sb.append(filter.getSkipValue()).append("-preskok")
                        .append(separator);
            }
            // TODO: assure skip is not null but zero
        } else {
            sb.append(filter.getAl().toString()) // analysis level
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        }
        // skip value
        // msd ?
        // if taxonomy -> taxonomy
        // if cvv -> cvv + length

        this.time = this.time != null ? this.time : LocalDateTime.now();
        // HH = 24-hour clock, so file-name timestamps stay unambiguous
        sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_HH.mm.ss")));
        return sb.toString();
    }
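    /*
     * Illustrative title for a string-level 2-gram run; the corpus type,
     * calculate-for value and timestamp below are hypothetical, and the enums'
     * toString() output is assumed to be the enum constant name:
     *
     *   STRING_LEVEL_GIGAFIDA_LEMMA_2-gram_0-preskok_14.03.2018_09.35.12
     */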
    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }

    public void setAnalysisProducedResults(boolean analysisProducedResults) {
        this.analysisProducedResults = analysisProducedResults;
    }

    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Statistic properties:");
        sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
        sb.append(newLine).append(useDB ? "use DB" : "run in memory");
        sb.append(newLine).append(filter.toString());
        return sb.toString();
    }

    public String getResultTitle() {
        return resultTitle;
    }

    // ****************************************
    // ***************** util *****************
    // ****************************************

    /**
     * Stores results from this batch to the database and clears the results map
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            logger.error("Store tmp results to DB", e);
        }
    }

    public Filter getFilter() {
        return filter;
    }

    public Corpus getCorpus() {
        return corpus;
    }

    public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if there are no results, there is nothing to save: return false
        if (result.isEmpty()) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
        Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
        return true;
    }
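    /*
     * Minimal sketch of the word-level (nested) flow, assuming the Filter was set
     * to AnalysisLevel.WORD_LEVEL so the nested maps were initialised (the example
     * key and words are hypothetical):
     *
     *   stats.updateResultsNested(WordLevelType.SUFFIX, "ost", "mladost");
     *   stats.updateResultsNested(WordLevelType.SUFFIX, "ost", "starost");
     *   stats.saveResultNestedToDisk(100);  // keep at most 100 rows per nested key
     */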
    public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();

        if (!isEmpty(resultNestedSuffix)) {
            results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
        }

        if (!isEmpty(resultNestedPrefix)) {
            results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
        }

        // if there are no results, there is nothing to save: return false
        if (results.isEmpty()) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
        filter.setAl(AnalysisLevel.WORD_FORMATION);
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if there are no results, there is nothing to save: return false
        if (result.isEmpty()) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        WordFormation.calculateStatistics(this);
        Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    private Map<String, Map<String, Long>> sortNestedMap(Map<String, Map<String, AtomicLong>> nestedMap, int limit) {
        Map<String, Map<String, Long>> sorted = new HashMap<>();

        for (String s : nestedMap.keySet()) {
            sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
        }

        return sorted;
    }

    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    public void updateTaxonomyResults(String o, List<Word> ngramCandidate) {
        for (String key : taxonomyResult.keySet()) {
            // the first word in an n-gram shares its taxonomy with the others
            if (ngramCandidate.get(0).getTaxonomy().contains(key)) {
                // taxonomy is present in this word: insert the entry with count 1,
                // or increment the existing counter
                AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));

                if (r != null)
                    taxonomyResult.get(key).get(o).incrementAndGet();
            } else {
                // taxonomy is not present in this word: make sure the entry exists,
                // but leave its count at 0
                taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0));
            }
        }
    }

    public void updateResults(String o) {
        // insert the entry with count 1, or increment the existing counter
        AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));

        if (r != null)
            result.get(o).incrementAndGet();
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public Object[][] getResultCustom() {
        return resultCustom;
    }

    public void setResultCustom(Object[][] resultCustom) {
        this.resultCustom = resultCustom;
    }

    public void updateResultsNested(WordLevelType type, String key, String stringValue) {
        if (type == WordLevelType.SUFFIX) {
            updateResultsNestedSuffix(key, stringValue);
        } else if (type == WordLevelType.PREFIX) {
            updateResultsNestedPrefix(key, stringValue);
        }
    }

    public void updateResultsNestedSuffix(String key, String stringValue) {
        // make sure the outer map has an inner map for this key, then insert the
        // value with count 1 or increment the existing counter
        resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());

        AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));

        if (r != null) {
            resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
        }
    }
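    /*
     * The counting idiom used throughout this class is the standard lock-free
     * pattern for a ConcurrentHashMap with AtomicLong values: putIfAbsent()
     * returns null when the calling thread created the entry (the count is
     * already 1); otherwise it returns the existing counter, which is then
     * incremented. A self-contained sketch of the same idiom:
     *
     *   ConcurrentHashMap<String, AtomicLong> counts = new ConcurrentHashMap<>();
     *   AtomicLong existing = counts.putIfAbsent("key", new AtomicLong(1));
     *   if (existing != null) {
     *       existing.incrementAndGet();  // entry already existed
     *   }
     */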
    public void updateResultsNestedPrefix(String key, String stringValue) {
        // make sure the outer map has an inner map for this key, then insert the
        // value with count 1 or increment the existing counter
        resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());

        AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));

        if (r != null) {
            resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
        }
    }

    private LinkedHashMap<String, String> headerInfoBlock() {
        LinkedHashMap<String, String> info = new LinkedHashMap<>();

        info.put("Korpus:", corpus.getCorpusType().toString());
        info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm")));

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0)
                info.put("Analiza:", "Črke");
            else if (ngramLevel == 1)
                info.put("Analiza:", "Besede");
            else
                info.put("Analiza:", filter.getAl().toString());
        } else {
            info.put("Analiza:", filter.getAl().toString());
        }

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();

            // n-gram level
            if (ngramLevel > 1) {
                info.put("n-gram nivo:", String.valueOf(ngramLevel));
            }
            // else if (ngramLevel == 1) {
            //     info.put("n-gram nivo:", "nivo besed");
            // } else {
            //     info.put("n-gram nivo:", "nivo črk");
            // }

            // skip
            if (ngramLevel > 1)
                info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");

            // calculate for
            info.put("Izračunaj za:", filter.getCalculateFor().toString());

            // msd
            if (!isEmpty(filter.getMsd())) {
                StringBuilder msdPattern = new StringBuilder();
                for (Pattern pattern : filter.getMsd()) {
                    msdPattern.append(pattern.toString()).append(" ");
                }
                info.put("MSD:", msdPattern.toString());
            }

            // taxonomy
            // if (!isEmpty(filter.getTaxonomy())) {
            //     info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
            // }
        }

        if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());

            info.put("Taksonomija: ", "");
            String sep = "";
            for (String s : tax) {
                // keys in a LinkedHashMap must be unique, so pad each taxonomy row
                // with one more space than the last to keep every entry
                info.put(sep = sep + " ", s);
            }
        }

        if (corpus.getCorpusType() == CorpusType.SOLAR) {
            HashMap<String, HashSet<String>> filters = corpus.getSolarFilters();

            if (!isEmpty(filters)) {
                info.put("Dodatni filtri: ", "");

                for (Map.Entry<String, HashSet<String>> f : filters.entrySet()) {
                    info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
                }
            }
        }

        return info;
    }
}
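/*
 * Illustrative header block produced by headerInfoBlock() for a string-level
 * 2-gram analysis; all values below are hypothetical and depend on the chosen
 * corpus and filter (enum toString() output is assumed to be the constant name):
 *
 *   Korpus:        GIGAFIDA
 *   Datum:         14.03.2018 09:35
 *   Analiza:       STRING_LEVEL
 *   n-gram nivo:   2
 *   Skip:          0
 *   Izračunaj za:  LEMMA
 */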