list/src/main/java/data/StatisticsNew.java

724 lines
24 KiB
Java
Raw Normal View History

2018-06-19 07:15:37 +00:00
package data;
import static gui.ValidationUtil.*;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
2018-06-19 07:15:37 +00:00
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import gui.I18N;
2018-06-19 07:15:37 +00:00
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;
@SuppressWarnings("Duplicates")
public class StatisticsNew {
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
private Corpus corpus;
private Filter filter;
private String resultTitle;
private Map<String, AtomicLong> result;
2018-11-26 12:41:35 +00:00
private Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
2018-06-19 07:15:37 +00:00
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedSuffix;
private Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> resultNestedPrefix;
2018-06-19 07:15:37 +00:00
private boolean useDB;
private RDB db;
private boolean analysisProducedResults;
private LocalDateTime timeBeginning;
private LocalDateTime timeEnding;
private Map<Collocability, Map<MultipleHMKeys, Double>> collocability;
2018-11-26 12:41:35 +00:00
private Map<Taxonomy, AtomicLong> uniGramTaxonomyOccurrences;
2018-06-19 07:15:37 +00:00
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
this.corpus = corpus;
this.filter = filter;
2018-06-29 10:53:29 +00:00
this.taxonomyResult = new ConcurrentHashMap<>();
this.taxonomyResult.put(corpus.getTotal(), new ConcurrentHashMap<>());
this.collocability = new ConcurrentHashMap<>();
2018-11-20 08:52:16 +00:00
this.uniGramTaxonomyOccurrences = new ConcurrentHashMap<>();
this.uniGramTaxonomyOccurrences.put(corpus.getTotal(), new AtomicLong(0L));
2018-06-29 10:53:29 +00:00
// create table for counting word occurrences per taxonomies
if (this.corpus.getObservableListTaxonomy() != null && filter.getDisplayTaxonomy()) {
if (this.filter.getTaxonomy().isEmpty()) {
for (int i = 0; i < this.corpus.getObservableListTaxonomy().size(); i++) {
this.taxonomyResult.put(Taxonomy.factoryLongName(this.corpus.getObservableListTaxonomy().get(i), corpus), new ConcurrentHashMap<>());
}
} else {
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
2018-11-26 12:41:35 +00:00
// Tax taxonomy = new Tax();
this.taxonomyResult.put(this.filter.getTaxonomy().get(i), new ConcurrentHashMap<>());
}
2018-06-29 10:53:29 +00:00
}
}
2018-06-19 07:15:37 +00:00
if (useDB) {
this.useDB = true;
db = new RDB();
}
if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
resultNestedSuffix = new ConcurrentHashMap<>();
resultNestedPrefix = new ConcurrentHashMap<>();
} else {
result = new ConcurrentHashMap<>();
}
this.timeBeginning = LocalDateTime.now();
// resultTitle = generateResultTitle();
2018-06-19 07:15:37 +00:00
logger.debug(toString());
}
/**
* Result's title consists of:
* <ul>
* <li>Corpus type</li>
* <li>Analysis level</li>
* <li>Calculate for</li>
* <li></li>
* <li></li>
* <li></li>
* <li></li>
* </ul>
*
* @return
*/
public String generateResultTitle() {
2018-06-19 07:15:37 +00:00
String separator = "_";
StringBuilder sb = new StringBuilder();
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if(ngramLevel == 0) {
sb.append(corpus.getCorpusType().toString())
.append(separator)
.append(I18N.get("exportFileName.letters"))
.append(separator)
.append(filter.getCalculateFor())
2018-06-19 07:15:37 +00:00
.append(separator);
} else if(ngramLevel == 1) {
if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
sb.append(corpus.getCorpusType().toString())
.append(separator)
.append(I18N.get("exportFileName.wordParts"))
.append(separator)
.append(filter.getCalculateFor())
.append(separator);
} else {
sb.append(corpus.getCorpusType().toString())
.append(separator)
.append(I18N.get("exportFileName.words"))
.append(separator)
.append(filter.getCalculateFor())
.append(separator);
}
2018-06-19 07:15:37 +00:00
}
else {
sb.append(corpus.getCorpusType().toString())
2018-06-19 07:15:37 +00:00
.append(separator)
.append(I18N.get("exportFileName.wordSets"))
2018-06-19 07:15:37 +00:00
.append(separator);
sb.append(filter.getCalculateFor().toString())
.append(separator);
// ngram value
sb.append(filter.getNgramValue()).append(I18N.get("exportFileName.gram"))
2018-06-19 07:15:37 +00:00
.append(separator);
sb.append(filter.getSkipValue()).append(I18N.get("exportFileName.skip"))
2018-06-19 07:15:37 +00:00
.append(separator);
}
// TODO: assure skip is not null but zero
} else {
sb.append(filter.getAl().toString()) // analysis level
.append(separator)
.append(corpus.getCorpusType().toString())
.append(separator);
}
// skip value
// msd ?
// if taxonomy -> taxonomy
// if cvv -> cvv + dolžina
sb.append(getTimeEnding());
2018-06-19 07:15:37 +00:00
return sb.toString();
}
/** Records the current local date-time as the end of the analysis. */
public void setTimeEnding() {
    timeEnding = LocalDateTime.now();
}
/**
 * Formats the recorded end time for use in result titles.
 *
 * The hour is printed as hour-of-day (00-23). The previous 12-hour pattern had
 * no AM/PM marker, so morning and afternoon runs produced identical stamps.
 * When no end time has been recorded yet, the current time is formatted instead
 * of failing with a NullPointerException; the stored end time is not modified.
 *
 * @return end time formatted as {@code dd.MM.yyyy_HH.mm.ss}
 */
public String getTimeEnding() {
    LocalDateTime end = (timeEnding != null) ? timeEnding : LocalDateTime.now();
    return end.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_HH.mm.ss"));
}
2018-06-19 07:15:37 +00:00
/** @return the flag last stored by a save method or by setAnalysisProducedResults */
public boolean isAnalysisProducedResults() {
    return this.analysisProducedResults;
}
/** @param analysisProducedResults new value of the "analysis produced results" flag */
public void setAnalysisProducedResults(boolean analysisProducedResults) {
    this.analysisProducedResults = analysisProducedResults;
}
/**
 * Describes this run as a bulleted list: corpus type with the number of detected
 * files, the storage mode and the filter settings.
 */
@Override
public String toString() {
    final String bullet = "\n\t- ";
    return new StringBuilder()
            .append(bullet).append("Statistic properties:")
            .append(bullet).append(corpus.getCorpusType().toString())
            .append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()))
            .append(bullet).append(useDB ? "use DB" : "run in memory")
            .append(bullet).append(filter.toString())
            .toString();
}
/** @return the stored result title; null until a save method has generated one */
public String getResultTitle() {
    return this.resultTitle;
}
// ****************************************
// ***************** util *****************
// ****************************************
/**
* Stores results from this batch to a database and clears results map
*/
2018-11-26 12:41:35 +00:00
// public void storeTmpResultsToDB() {
// try {
// db.writeBatch(result);
// result = new ConcurrentHashMap<>();
// } catch (UnsupportedEncodingException e) {
// logger.error("Store tmp results to DB", e);
// // e.printStackTrace();
// }
// }
2018-06-19 07:15:37 +00:00
/** @return the filter this object was constructed with */
public Filter getFilter() {
    return this.filter;
}
/** @return the corpus this object was constructed with */
public Corpus getCorpus() {
    return this.corpus;
}
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
Set<Pair<String, Map<MultipleHMKeys, Long>>> stats = new HashSet<>();
2018-06-19 07:15:37 +00:00
if (useDB) {
result = db.getDump();
db.delete();
}
removeMinimalOccurrences(filter.getMinimalOccurrences());
removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());
2018-06-19 07:15:37 +00:00
// if no results and nothing to save, return false
if (!(taxonomyResult.get(corpus.getTotal()).size() > 0)) {
2018-06-19 07:15:37 +00:00
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
}
stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get(corpus.getTotal()), Util.getValidInt(limit))));
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter);
2018-06-19 07:15:37 +00:00
return true;
}
/**
* Removes lines, where number of different taxonomies is lower than specified number (minimalTaxonomy)
*/
2018-11-26 12:41:35 +00:00
private void removeMinimalTaxonomy(Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
if (minimalTaxonomy == 1)
return;
int occurances;
for (MultipleHMKeys key : taxonomyResult.get(corpus.getTotal()).keySet()){
occurances = 0;
2018-11-26 12:41:35 +00:00
for (Taxonomy columnNameKey : taxonomyResult.keySet()){
if(!columnNameKey.equals(corpus.getTotal()) && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1)
occurances++;
}
if(occurances < minimalTaxonomy){
taxonomyResult.get(corpus.getTotal()).remove(key);
}
}
}
/**
 * Removes lines where total number of occurrences is lower than specified number (minimalOccurrences).
 *
 * Keys whose counter in the "total" taxonomy is below the threshold are removed
 * from every taxonomy map. A null or non-positive threshold removes nothing.
 *
 * @param minimalOccurrences minimal total number of occurrences a key must have
 */
private void removeMinimalOccurrences(Integer minimalOccurrences) {
    if (minimalOccurrences == null || minimalOccurrences <= 0) {
        return;
    }

    // Collect first, delete afterwards: the "total" map is itself one of the maps
    // being modified, so nothing is removed while it is being iterated.
    List<MultipleHMKeys> tooRare = new ArrayList<>();
    for (Map.Entry<MultipleHMKeys, AtomicLong> entry : taxonomyResult.get(corpus.getTotal()).entrySet()) {
        // compare as long: intValue() would truncate very large counters
        if (entry.getValue().get() < minimalOccurrences) {
            tooRare.add(entry.getKey());
        }
    }

    for (Map<MultipleHMKeys, AtomicLong> column : taxonomyResult.values()) {
        for (MultipleHMKeys key : tooRare) {
            column.remove(key);
        }
    }
}
2018-06-19 07:15:37 +00:00
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
resultTitle = generateResultTitle();
if (useDB) {
result = db.getDump();
db.delete();
}
Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
// UNCOMMENT!!!!!!
// if (!isEmpty(resultNestedSuffix)) {
// results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
// }
//
// if (!isEmpty(resultNestedPrefix)) {
// results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
// }
2018-06-19 07:15:37 +00:00
// if no results and nothing to save, return false
if (!(results.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
}
Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
}
/**
 * Switches the filter to the WORD_FORMATION analysis level, lets WordFormation
 * compute its statistics from this object and exports the custom result table.
 *
 * A missing flat result map is treated like an empty one: the constructor does
 * not create it for WORD_LEVEL analyses, so it can still be null here.
 *
 * @return true when something was exported, false when there was nothing to save
 * @throws UnsupportedEncodingException declared for the DB dump / export calls
 */
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
    filter.setAl(AnalysisLevel.WORD_FORMATION);
    resultTitle = generateResultTitle();

    if (useDB) {
        result = db.getDump();
        db.delete();
    }

    // if no results and nothing to save, return false
    if (result == null || result.isEmpty()) {
        analysisProducedResults = false;
        return false;
    }
    analysisProducedResults = true;

    WordFormation.calculateStatistics(this);

    Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
    return true;
}
private Map<String, Map<MultipleHMKeys, Long>> sortNestedMap(Map<String, ConcurrentHashMap<MultipleHMKeys, AtomicLong>> nestedMap, int limit) {
Map<String, Map<MultipleHMKeys, Long>> sorted = new HashMap<>();
2018-06-19 07:15:37 +00:00
for (String s : nestedMap.keySet()) {
sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
}
return sorted;
}
private Map<MultipleHMKeys, Long> getSortedResult(Map<MultipleHMKeys, AtomicLong> map, int limit) {
2018-06-19 07:15:37 +00:00
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
}
2018-11-26 12:41:35 +00:00
public void updateUniGramOccurrences(int amount, ArrayList<Taxonomy> taxonomy){
uniGramTaxonomyOccurrences.get(corpus.getTotal()).set(uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue() + amount);
2018-11-26 12:41:35 +00:00
for (Taxonomy t : taxonomy){
2018-11-20 08:52:16 +00:00
if (uniGramTaxonomyOccurrences.get(t) != null){
uniGramTaxonomyOccurrences.get(t).set(uniGramTaxonomyOccurrences.get(t).longValue() + amount);
} else {
uniGramTaxonomyOccurrences.put(t, new AtomicLong(amount));
}
}
}
2018-11-26 12:41:35 +00:00
public Map<Taxonomy, AtomicLong> getUniGramOccurrences(){
// return uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue();
2018-11-20 08:52:16 +00:00
return uniGramTaxonomyOccurrences;
}
2018-11-26 12:41:35 +00:00
public void updateTaxonomyResults(MultipleHMKeys o, List<Taxonomy> taxonomy) {
for (Taxonomy key : taxonomyResult.keySet()) {
2018-06-29 10:53:29 +00:00
// first word should have the same taxonomy as others
if (key.equals(corpus.getTotal()) || taxonomy.contains(key)) {
// if (key.equals(corpus.getTotal()) || taxonomy != null && taxonomy.contains(key)) {
2018-06-29 10:53:29 +00:00
// if taxonomy not in map and in this word
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));
if (r != null)
taxonomyResult.get(key).get(o).incrementAndGet();
} else {
// if taxonomy not in map and not in this word
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(0));
}
}
// if not in map
// else
}
2018-11-26 12:41:35 +00:00
/** @return the live map of occurrence counters per taxonomy (not a copy) */
public Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> getTaxonomyResult() {
    return this.taxonomyResult;
}
2018-06-19 07:15:37 +00:00
/**
 * Counts one occurrence of {@code o} in the flat result map: a new key starts at
 * 1, an existing counter is incremented.
 *
 * @param o key that occurred
 */
public void updateResults(String o) {
    AtomicLong previous = result.putIfAbsent(o, new AtomicLong(1));
    if (previous == null) {
        // key was not in the map yet; it has just been stored with count 1
        return;
    }
    previous.incrementAndGet();
}
/** @return the live flat result map; null when the constructor did not create it */
public Map<String, AtomicLong> getResult() {
    return this.result;
}
/** @return the custom result table as last stored by setResultCustom (may be null) */
public Object[][] getResultCustom() {
    return this.resultCustom;
}
/** @param resultCustom custom result table to store; the array is kept by reference */
public void setResultCustom(Object[][] resultCustom) {
    this.resultCustom = resultCustom;
}
/**
 * Dispatches a nested update to the suffix or the prefix counters; any other
 * type is ignored.
 *
 * @param type        SUFFIX or PREFIX
 * @param key         outer key
 * @param stringValue inner value to count
 */
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
    if (type == WordLevelType.SUFFIX) {
        updateResultsNestedSuffix(key, stringValue);
        return;
    }
    if (type == WordLevelType.PREFIX) {
        updateResultsNestedPrefix(key, stringValue);
    }
}
public void updateResultsNestedSuffix(String key, String stringValue) {
2018-08-09 07:21:06 +00:00
MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue);
2018-06-19 07:15:37 +00:00
if (resultNestedSuffix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
2018-06-19 07:15:37 +00:00
// else
if (r != null) {
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
}
} else {
resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
2018-06-19 07:15:37 +00:00
if (r != null) {
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
}
}
}
public void updateResultsNestedPrefix(String key, String stringValue) {
2018-08-09 07:21:06 +00:00
MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue);
2018-06-19 07:15:37 +00:00
if (resultNestedPrefix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
2018-06-19 07:15:37 +00:00
// else
if (r != null) {
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
}
} else {
resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(mkStringValue, new AtomicLong(1));
2018-06-19 07:15:37 +00:00
if (r != null) {
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
}
}
}
private LinkedHashMap<String, String> headerInfoBlock() {
LinkedHashMap<String, String> info = new LinkedHashMap<>();
info.put(I18N.get("exportHeader.corpus"), corpus.getCorpusType().toString());
setTimeEnding();
info.put(I18N.get("exportHeader.date"), timeEnding.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));
2018-11-20 08:52:16 +00:00
// time elapsed
long seconds = ChronoUnit.MILLIS.between(timeBeginning, timeEnding) / 1000;
info.put(I18N.get("exportHeader.executionTime"), String.valueOf(seconds) + " s");
2018-11-20 08:52:16 +00:00
2018-06-19 07:15:37 +00:00
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if (ngramLevel == 0)
info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.letters"));
2018-11-08 10:37:16 +00:00
else if (ngramLevel == 1) {
// if suffixes or prefixes are not null print word parts
if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.wordParts"));
2018-11-08 10:37:16 +00:00
} else {
info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.words"));
2018-11-08 10:37:16 +00:00
}
} else
info.put(I18N.get("exportHeader.analysis"), I18N.get("exportHeader.analysis.wordSets"));
2018-06-19 07:15:37 +00:00
} else {
info.put(I18N.get("exportHeader.analysis"), filter.getAl().toString());
2018-06-19 07:15:37 +00:00
}
2018-11-20 08:52:16 +00:00
// if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
2018-06-27 08:14:40 +00:00
2018-11-20 08:52:16 +00:00
if (ngramLevel == 0){
info.put(I18N.get("exportHeader.numberLetters"), filter.getStringLength().toString());
2018-11-20 08:52:16 +00:00
}
2018-06-19 07:15:37 +00:00
2018-11-20 08:52:16 +00:00
// calculate for
info.put(I18N.get("exportHeader.calculateFor"), filter.getCalculateFor().toString());
2018-11-20 08:52:16 +00:00
// also write
if (ngramLevel > 0) {
if (filter.getMultipleKeys().size() > 0) {
StringBuilder mk = new StringBuilder();
for (CalculateFor s : filter.getMultipleKeys()) {
mk.append(s.toString()).append("; ");
}
info.put(I18N.get("exportHeader.alsoFilter"), String.join("; ", mk.substring(0, mk.length() - 2)));
} else {
info.put(I18N.get("exportHeader.alsoFilter"), "");
2018-11-20 08:52:16 +00:00
}
}
2018-11-20 08:52:16 +00:00
// data limitations
if (filter.getDisplayTaxonomy()){
info.put(I18N.get("exportHeader.displayTaxonomies"), I18N.get("exportHeader.yes"));
2018-11-20 08:52:16 +00:00
} else {
info.put(I18N.get("exportHeader.displayTaxonomies"), I18N.get("exportHeader.no"));
2018-11-20 08:52:16 +00:00
}
2018-11-20 08:52:16 +00:00
// n.gram nivo
if (ngramLevel > 1) {
info.put(I18N.get("exportHeader.ngramLevel"), String.valueOf(ngramLevel));
2018-11-20 08:52:16 +00:00
}
2018-11-20 08:52:16 +00:00
// skip
if (ngramLevel > 1)
info.put(I18N.get("exportHeader.skipValue"), isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
2018-11-20 08:52:16 +00:00
// note punctuations - ngram > 1
if(ngramLevel > 1) {
if (filter.getNotePunctuations()) {
info.put(I18N.get("exportHeader.notePunctuations"), I18N.get("exportHeader.yes"));
2018-11-20 08:52:16 +00:00
} else {
info.put(I18N.get("exportHeader.notePunctuations"), I18N.get("exportHeader.no"));
2018-11-20 08:52:16 +00:00
}
}
2018-11-20 08:52:16 +00:00
// also write - n - gram > 1
if(ngramLevel > 1) {
if (filter.getCollocability().size() > 0) {
StringBuilder mk = new StringBuilder();
for (Collocability s : filter.getCollocability()) {
mk.append(s.toString()).append("; ");
}
info.put(I18N.get("exportHeader.collocability"), String.join("; ", mk.substring(0, mk.length() - 2)));
} else {
info.put(I18N.get("exportHeader.collocability"), "");
2018-11-20 08:52:16 +00:00
}
}
2018-11-20 08:52:16 +00:00
// fragmented MSD - n-gram = 1
if (info.get(I18N.get("exportHeader.analysis")).equals(I18N.get("exportHeader.analysis.words"))){
2018-11-20 08:52:16 +00:00
if (filter.getWriteMsdAtTheEnd()){
info.put(I18N.get("exportHeader.writeMSDAtTheEnd"), I18N.get("exportHeader.yes"));
2018-11-20 08:52:16 +00:00
} else {
info.put(I18N.get("exportHeader.writeMSDAtTheEnd"), I18N.get("exportHeader.no"));
2018-11-20 08:52:16 +00:00
}
}
2018-06-19 07:15:37 +00:00
2018-11-20 08:52:16 +00:00
if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
if (filter.getPrefixLength() > 0 || filter.getSuffixLength() > 0) {
info.put(I18N.get("exportHeader.prefixLength"), String.valueOf(filter.getPrefixLength()));
info.put(I18N.get("exportHeader.suffixLength"), String.valueOf(filter.getSuffixLength()));
2018-11-20 08:52:16 +00:00
} else {
info.put(I18N.get("exportHeader.prefixList"), String.join("; ", filter.getPrefixList()));
info.put(I18N.get("exportHeader.suffixList"), String.join("; ", filter.getSuffixList()));
2018-06-19 07:15:37 +00:00
}
2018-11-20 08:52:16 +00:00
}
2018-06-19 07:15:37 +00:00
2018-11-20 08:52:16 +00:00
// msd
if (!isEmpty(filter.getMsd())) {
StringBuilder msdPattern = new StringBuilder();
for (Pattern pattern : filter.getMsd()) {
msdPattern.append(pattern.toString()).append(" ");
}
2018-06-19 07:15:37 +00:00
info.put(I18N.get("exportHeader.msd"), msdPattern.toString());
2018-11-20 08:52:16 +00:00
} else {
info.put(I18N.get("exportHeader.msd"), "");
2018-06-19 07:15:37 +00:00
}
2018-11-20 08:52:16 +00:00
// }
info.put(I18N.get("exportHeader.taxonomy"), "");
if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) || filter.getDisplayTaxonomy()) {
2018-06-29 10:53:29 +00:00
ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
if (filter.getDisplayTaxonomy() && tax.size() == 0) {
// ArrayList<String> intList = (new ArrayList<>(taxonomyResult.keySet()).stream()
// .forEach(x -> {x.toString();}));
// ArrayList<String> taxonomyString = new ArrayList<>();
// for (Taxonomy t : taxonomyResult.keySet()){
// taxonomyString.add(t.toString());
// }
// ObservableList<String> taxonomyObservableString = Tax.getTaxonomyForComboBox(corpus.getCorpusType(), new HashSet<>(taxonomyString));
// ArrayList<String> sortedTaxonomyString = new ArrayList<>();
// for (String t : taxonomyObservableString){
// sortedTaxonomyString.add(t);
// }
// getTaxonomyForTaxonomyResult
tax = Tax.getTaxonomyForTaxonomyResult(corpus, taxonomyResult.keySet());
}
// String sep = "";
2018-06-29 10:53:29 +00:00
for (String s : tax) {
if (s == null) {
continue;
}
// info.put(sep = sep + " ", s);
if (uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)) == null) {
info.put(s, "");
continue;
}
int n = uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)).intValue();
if (n == 0) {
info.put(s, "");
} else {
info.put(s, String.valueOf(n));
}
2018-06-29 10:53:29 +00:00
}
2018-06-29 10:53:29 +00:00
}
2018-06-19 07:15:37 +00:00
info.put(I18N.get("exportHeader.minOccurrences"), String.valueOf(filter.getMinimalOccurrences()));
info.put(I18N.get("exportHeader.minTaxonomies"), String.valueOf(filter.getMinimalTaxonomy()));
2018-06-19 07:15:37 +00:00
if (corpus.getCorpusType() == CorpusType.SOLAR) {
HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
if (!isEmpty(filters)) {
info.put(I18N.get("exportHeader.additionalFilters"), "");
2018-06-19 07:15:37 +00:00
for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
}
}
}
return info;
}
public void updateCalculateCollocabilities(StatisticsNew oneWordStatistics) {
2018-11-26 12:41:35 +00:00
Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> oneWordTaxonomyResult = oneWordStatistics.getTaxonomyResult();
2018-11-08 10:37:16 +00:00
Map<Collocability, Map<MultipleHMKeys, Double>> collocabilityMap = new ConcurrentHashMap<>();
for(Collocability c : filter.getCollocability()){
collocabilityMap.put(c, new ConcurrentHashMap<>());
}
// count number of all words
long N = 0;
for(AtomicLong a : oneWordTaxonomyResult.get(corpus.getTotal()).values()){
2018-11-08 10:37:16 +00:00
N += a.longValue();
}
for(MultipleHMKeys hmKey : taxonomyResult.get(corpus.getTotal()).keySet()) {
// String[] splitedString = hmKey.getK1().split("\\s+");
long sum_fwi =0L;
2018-11-08 10:37:16 +00:00
long mul_fwi =1L;
for(MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()){
2018-11-08 10:37:16 +00:00
// System.out.println(smallHmKey.getK1());
sum_fwi += oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue();
mul_fwi *= oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue();
2018-11-08 10:37:16 +00:00
}
// String t = hmKey.getK1();
// if(hmKey.getK1().equals("v Slovenija")){
// System.out.println("TEST");
//
// }
double O = (double)taxonomyResult.get(corpus.getTotal()).get(hmKey).longValue();
2018-11-08 10:37:16 +00:00
double n = (double)filter.getNgramValue();
double E = (double)mul_fwi / Math.pow(N, n - 1);
if (collocabilityMap.keySet().contains(Collocability.DICE)){
double dice_value = n * O / sum_fwi;
collocabilityMap.get(Collocability.DICE).put(hmKey, dice_value);
}
if (collocabilityMap.keySet().contains(Collocability.TSCORE)){
double t_score = (O - E) / Math.sqrt(O);
collocabilityMap.get(Collocability.TSCORE).put(hmKey, t_score);
}
2018-11-08 10:37:16 +00:00
if (collocabilityMap.keySet().contains(Collocability.MI)){
double MI = Math.log(O / E) / Math.log(2);
collocabilityMap.get(Collocability.MI).put(hmKey, MI);
}
if (collocabilityMap.keySet().contains(Collocability.MI3)){
double MI3 = Math.log(Math.pow(O, 3.0) / E) / Math.log(2);
collocabilityMap.get(Collocability.MI3).put(hmKey, MI3);
}
if (collocabilityMap.keySet().contains(Collocability.LOGDICE)){
double dice_value = n * O / sum_fwi;
double log_dice = 14 + Math.log(dice_value) / Math.log(2);
collocabilityMap.get(Collocability.LOGDICE).put(hmKey, log_dice);
}
if (collocabilityMap.keySet().contains(Collocability.SIMPLELL)){
double simple_ll = 2 * (O * Math.log10(O / E) - (O - E));
collocabilityMap.get(Collocability.SIMPLELL).put(hmKey, simple_ll);
}
}
for(Collocability c : collocabilityMap.keySet()){
collocability.put(c, collocabilityMap.get(c));
}
}
/** @return the live map of collocability scores per measure (not a copy) */
public Map<Collocability, Map<MultipleHMKeys, Double>> getCollocability() {
    return collocability;
}
2018-06-19 07:15:37 +00:00
}