You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

449 lines
12 KiB

package data;
import static gui.ValidationUtil.*;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;
public class StatisticsNew {
// log4j logger for this class.
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
// Corpus and filter handed to the constructor; read by most methods of this class.
private Corpus corpus;
private Filter filter;
// Title assigned from generateResultTitle(); passed to the Export calls in the save methods.
private String resultTitle;
// Flat result: key -> counter, updated by updateResults(String).
private Map<String, AtomicLong> result;
// Taxonomy name -> (key -> counter), updated by updateTaxonomyResults(...).
private Map<String, Map<String, AtomicLong>> taxonomyResult;
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
// Nested results: key -> (string value -> counter); allocated only for WORD_LEVEL analysis.
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
// Set to true by the constructor when a database was requested; db is created alongside it.
private boolean useDB;
private RDB db;
// Set by the save methods: true when there was something to export, false otherwise.
private boolean analysisProducedResults;
// Timestamp formatted into the "Datum:" entry of headerInfoBlock().
private LocalDateTime time;
/**
 * Creates a statistic for the given corpus and filter, prepares one empty counter map per
 * taxonomy, allocates the result maps that match the filter's analysis level and generates
 * the result title.
 *
 * NOTE(review): the closing braces of the blocks below are not present in this view, so the
 * exact nesting (in particular where the taxonomy if/else ends) should be confirmed against
 * version control before this constructor is edited.
 *
 * @param corpus corpus to analyse
 * @param filter analysis settings
 * @param useDB  when true, an RDB instance is created and the useDB flag is set
 */
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
this.corpus = corpus;
this.filter = filter;
this.taxonomyResult = new ConcurrentHashMap<>();
// create table for counting word occurrences per taxonomy
if (this.filter.getTaxonomy().isEmpty()) {
// no taxonomy selected in the filter: register every taxonomy of the corpus
for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
} else {
// taxonomy selected: register the filter's entries under the name returned by getLongTaxonomyName
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
Tax taxonomy = new Tax();
this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
if (useDB) {
this.useDB = true;
db = new RDB();
// WORD_LEVEL analysis uses the two nested maps; every other level uses the flat result map
if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
resultNestedSuffix = new ConcurrentHashMap<>();
resultNestedPrefix = new ConcurrentHashMap<>();
} else {
result = new ConcurrentHashMap<>();
resultTitle = generateResultTitle();
/**
 * Result's title consists of:
 * <ul>
 * <li>Corpus type</li>
 * <li>Analysis level</li>
 * <li>Calculate for</li>
 * <li></li>
 * <li></li>
 * <li></li>
 * <li></li>
 * </ul>
 *
 * @return the generated result title
 */
// Assembles the result title in a StringBuilder and returns it as a String.
// NOTE(review): this method is not well-formed as it stands: the n-gram branches are empty,
// one `else` has no matching closing brace before it, the append chain is unterminated and
// the `this.time` assignment has no else-operand. Statements appear to be missing from this
// view; restore them from version control before changing anything here.
private String generateResultTitle() {
// NOTE(review): not referenced by any statement visible here; presumably the separator
// placed between title parts. Confirm.
String separator = "_";
StringBuilder sb = new StringBuilder();
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if(ngramLevel == 0) {
} else if(ngramLevel == 1) {
else {
// ngram value
// TODO: assure skip is not null but zero
} else {
sb.append(filter.getAl().toString()) // analysis level
// skip value
// msd ?
// if taxonomy -> taxonomy
// if cvv -> cvv + length
// keeps an already-set timestamp; the fallback value is missing from this view
this.time = this.time != null ? this.time :;
return sb.toString();
/**
 * Returns the flag the save methods set: true when there was something to export,
 * false otherwise.
 */
public boolean isAnalysisProducedResults() {
return analysisProducedResults;
/**
 * Overrides the flag that records whether the analysis produced results.
 *
 * @param analysisProducedResults new value of the flag
 */
public void setAnalysisProducedResults(boolean analysisProducedResults) {
this.analysisProducedResults = analysisProducedResults;
public String toString() {
String newLine = "\n\t- ";
StringBuilder sb = new StringBuilder();
sb.append(newLine).append("Statistic properties:");
sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
sb.append(newLine).append(useDB ? "use DB" : "run in memory");
return sb.toString();
/** Returns the result title most recently assigned from generateResultTitle(). */
public String getResultTitle() {
return resultTitle;
// ****************************************
// ***************** util *****************
// ****************************************
/**
 * Stores the results from this batch to the database and clears the results map.
 */
// NOTE(review): no statement visible in the try block touches `db` or can throw
// UnsupportedEncodingException, so the database write appears to be missing from this view.
// As shown, the method only discards the current batch.
public void storeTmpResultsToDB() {
try {
// start the next batch with a fresh, empty map
result = new ConcurrentHashMap<>();
} catch (UnsupportedEncodingException e) {
logger.error("Store tmp results to DB", e);
// e.printStackTrace();
/** Returns the filter this statistic was created with. */
public Filter getFilter() {
return filter;
/** Returns the corpus this statistic was created with. */
public Corpus getCorpus() {
return corpus;
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
if (useDB) {
result = db.getDump();
// if no results and nothing to save, return false
if (!(result.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
return true;
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
resultTitle = generateResultTitle();
if (useDB) {
result = db.getDump();
Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
if (!isEmpty(resultNestedSuffix)) {
results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
if (!isEmpty(resultNestedPrefix)) {
results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
// if no results and nothing to save, return false
if (!(results.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
resultTitle = generateResultTitle();
if (useDB) {
result = db.getDump();
// if no results and nothing to save, return false
if (!(result.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
Map<String, Map<String, Long>> sorted = new HashMap<>();
for (String s : nestedMap.keySet()) {
sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
return sorted;
/**
 * Converts {@code map} with {@code Util.atomicInt2StringAndInt} and passes the outcome,
 * together with {@code limit}, to {@code Util.sortByValue}.
 * NOTE(review): presumably the result is ordered by count and capped at {@code limit}
 * entries; confirm against Util.
 */
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
public void updateTaxonomyResults(String o, List<Word> ngramCandidate) {
for (String key : taxonomyResult.keySet()) {
// first word should have the same taxonomy as others
if (ngramCandidate.get(0).getTaxonomy().contains(key)) {
// if taxonomy not in map and in this word
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));
if (r != null)
} else {
// if taxonomy not in map and not in this word
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0));
// if not in map
// else
public void updateResults(String o) {
// if not in map
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
// else
if (r != null)
/** Returns the flat result map (key -> counter); the map itself is returned, not a copy. */
public Map<String, AtomicLong> getResult() {
return result;
/** Returns the custom result table; the array itself is returned, not a copy. */
public Object[][] getResultCustom() {
return resultCustom;
/**
 * Replaces the custom result table that recalculateAndSaveResultToDisk() exports.
 *
 * @param resultCustom new table; stored as-is, without copying
 */
public void setResultCustom(Object[][] resultCustom) {
this.resultCustom = resultCustom;
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
ConcurrentHashMap<String, ConcurrentHashMap<String, AtomicLong>> resultsMap;
if (type == WordLevelType.SUFFIX) {
updateResultsNestedSuffix(key, stringValue);
} else if (type == WordLevelType.PREFIX) {
updateResultsNestedPrefix(key, stringValue);
public void updateResultsNestedSuffix(String key, String stringValue) {
if (resultNestedSuffix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
// else
if (r != null) {
} else {
resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
if (r != null) {
public void updateResultsNestedPrefix(String key, String stringValue) {
if (resultNestedPrefix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
// else
if (r != null) {
} else {
resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
if (r != null) {
private LinkedHashMap<String, String> headerInfoBlock() {
LinkedHashMap<String, String> info = new LinkedHashMap<>();
info.put("Korpus:", corpus.getCorpusType().toString());
info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if (ngramLevel == 0)
info.put("Analiza:", "Črke");
else if (ngramLevel == 1)
info.put("Analiza", "Besede");
info.put("Analiza:", filter.getAl().toString());
} else {
info.put("Analiza:", filter.getAl().toString());
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
// n.gram nivo
if (ngramLevel > 1) {
info.put("n-gram nivo:", String.valueOf(ngramLevel));
// else if (ngramLevel == 1){
// info.put("n-gram nivo:", "nivo besed");
// } else {
// info.put("n-gram nivo:", "nivo črk");
// }
// skip
if (ngramLevel > 1)
info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
// izračunaj za
info.put("Izračunaj za:", filter.getCalculateFor().toString());
// msd
if (!isEmpty(filter.getMsd())) {
StringBuilder msdPattern = new StringBuilder();
for (Pattern pattern : filter.getMsd()) {
msdPattern.append(pattern.toString()).append(" ");
info.put("MSD:", msdPattern.toString());
// taksonomija
// if (!isEmpty(filter.getTaxonomy())) {
// info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
// }
if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
info.put("Taksonomija: ", "");
String sep = "";
for (String s : tax) {
info.put(sep = sep + " ", s);
if (corpus.getCorpusType() == CorpusType.SOLAR) {
HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
if (!isEmpty(filters)) {
info.put("Dodatni filtri: ", "");
for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
return info;