Added functionality for n-grams (comma separation), minimal occurrences, etc.

This commit is contained in:
2018-07-31 08:58:17 +02:00
parent 681eb4f949
commit 179f09c4bd
27 changed files with 405 additions and 4962 deletions

View File

@@ -26,7 +26,9 @@ public class Filter {
HAS_MSD,
SOLAR_FILTERS,
MULTIPLE_KEYS,
NOTE_PUNCTUATIONS
NOTE_PUNCTUATIONS,
MINIMAL_OCCURRENCES,
MINIMAL_TAXONOMY
}
public Filter() {
@@ -170,4 +172,21 @@ public class Filter {
/**
 * Reports whether the NOTE_PUNCTUATIONS option is switched on.
 *
 * @return {@code false} when the filter holds no NOTE_PUNCTUATIONS entry,
 *         otherwise the stored value cast to {@code boolean}
 */
public boolean getNotePunctuations() {
    // No entry at all is treated the same as an explicit "false".
    if (!filter.containsKey(NOTE_PUNCTUATIONS)) {
        return false;
    }
    return (boolean) filter.get(NOTE_PUNCTUATIONS);
}
/**
 * Stores the given value in the filter under the MINIMAL_OCCURRENCES key.
 *
 * @param minOccurrences value to store; it is written as-is, without validation
 */
public void setMinimalOccurrences(Integer minOccurrences) {
    this.filter.put(MINIMAL_OCCURRENCES, minOccurrences);
}
/**
 * Reads the value stored in the filter under the MINIMAL_OCCURRENCES key.
 *
 * @return the stored value cast to {@code Integer}; presumably {@code null}
 *         when nothing was stored (assuming {@code filter} is a java.util.Map)
 */
public Integer getMinimalOccurrences() {
    Object stored = filter.get(MINIMAL_OCCURRENCES);
    return (Integer) stored;
}
/**
 * Stores the given value in the filter under the MINIMAL_TAXONOMY key.
 *
 * @param minTaxonomy value to store; it is written as-is, without validation
 */
public void setMinimalTaxonomy(Integer minTaxonomy) {
    this.filter.put(MINIMAL_TAXONOMY, minTaxonomy);
}
/**
 * Reads the value stored in the filter under the MINIMAL_TAXONOMY key.
 *
 * @return the stored value cast to {@code Integer}; presumably {@code null}
 *         when nothing was stored (assuming {@code filter} is a java.util.Map)
 */
public Integer getMinimalTaxonomy() {
    Object stored = filter.get(MINIMAL_TAXONOMY);
    return (Integer) stored;
}
}

View File

@@ -48,15 +48,16 @@ public class StatisticsNew {
this.taxonomyResult.put("Total", new ConcurrentHashMap<>());
// create table for counting word occurrences per taxonomy
if (this.filter.getTaxonomy().isEmpty()) {
for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
}
} else {
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
Tax taxonomy = new Tax();
this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
if (this.corpus.getTaxonomy() != null) {
if (this.filter.getTaxonomy().isEmpty()) {
for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
}
} else {
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
Tax taxonomy = new Tax();
this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
}
}
}
@@ -209,11 +210,45 @@ public class StatisticsNew {
analysisProducedResults = true;
}
removeMinimalOccurrences(taxonomyResult.get("Total"), filter.getMinimalOccurrences());
removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());
stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit))));
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
return true;
}
/**
 * Removes rows from the "Total" table whose key appears (with a count of at
 * least 1) in fewer than {@code minimalTaxonomy} of the other tables.
 *
 * <p>Only the "Total" table is modified; the per-taxonomy tables are left
 * untouched. A key that is absent from a taxonomy table simply does not count
 * towards that taxonomy.
 *
 * @param taxonomyResult  tables keyed by column name; the entry named "Total"
 *                        is the one that rows are removed from
 * @param minimalTaxonomy minimum number of non-"Total" tables a key must occur
 *                        in to be kept; {@code null} or any value of 1 or less
 *                        disables the filtering
 */
private void removeMinimalTaxonomy(Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
    // A null threshold means "not configured". 1 is an explicit no-op (as before),
    // and thresholds below 1 can never be undercut by a non-negative count.
    if (minimalTaxonomy == null || minimalTaxonomy <= 1) {
        return;
    }

    Map<MultipleHMKeys, AtomicLong> total = taxonomyResult.get("Total");
    if (total == null) {
        return;
    }

    // Remove through the iterator so this is safe for any Map implementation,
    // not only for maps that tolerate modification during iteration.
    java.util.Iterator<MultipleHMKeys> keys = total.keySet().iterator();
    while (keys.hasNext()) {
        MultipleHMKeys key = keys.next();

        int taxonomiesWithKey = 0;
        for (Map.Entry<String, Map<MultipleHMKeys, AtomicLong>> column : taxonomyResult.entrySet()) {
            if (column.getKey().equals("Total")) {
                continue;
            }
            // Not every taxonomy table contains every key, so the lookup may be null.
            AtomicLong count = column.getValue().get(key);
            if (count != null && count.get() >= 1) {
                taxonomiesWithKey++;
            }
        }

        if (taxonomiesWithKey < minimalTaxonomy) {
            keys.remove();
        }
    }
}
/**
 * Removes rows whose total number of occurrences is lower than
 * {@code minimalOccurrences}.
 *
 * @param taxonomyResultTotal table of total counts per key; modified in place.
 *                            A {@code null} table is ignored.
 * @param minimalOccurrences  minimum count a row needs to be kept; {@code null}
 *                            or any value of 0 or less disables the filtering
 */
private void removeMinimalOccurrences(Map<MultipleHMKeys, AtomicLong> taxonomyResultTotal, Integer minimalOccurrences) {
    // A null threshold means "not configured"; comparing it with == would
    // unbox it and throw a NullPointerException.
    if (taxonomyResultTotal == null || minimalOccurrences == null || minimalOccurrences <= 0) {
        return;
    }

    // Remove through the iterator so this is safe for any Map implementation.
    // Removing from the values() view removes the corresponding mapping.
    java.util.Iterator<AtomicLong> counts = taxonomyResultTotal.values().iterator();
    while (counts.hasNext()) {
        // Compare as long so very large counters are not truncated to int.
        if (counts.next().get() < minimalOccurrences) {
            counts.remove();
        }
    }
}
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
resultTitle = generateResultTitle();
@@ -285,7 +320,8 @@ public class StatisticsNew {
public void updateTaxonomyResults(MultipleHMKeys o, List<String> taxonomy) {
for (String key : taxonomyResult.keySet()) {
// first word should have the same taxonomy as others
if (taxonomy.contains(key) || key.equals("Total")) {
if (key.equals("Total") || taxonomy.contains(key)) {
// if (key.equals("Total") || taxonomy != null && taxonomy.contains(key)) {
// if taxonomy not in map and in this word
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));
@@ -389,13 +425,13 @@ public class StatisticsNew {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if (ngramLevel == 0)
info.put("Analiza:", "Črke");
info.put("Analiza", "Črke");
else if (ngramLevel == 1)
info.put("Analiza", "Besede");
else
info.put("Analiza:", filter.getAl().toString());
info.put("Analiza", filter.getAl().toString());
} else {
info.put("Analiza:", filter.getAl().toString());
info.put("Analiza", filter.getAl().toString());
}
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {

View File

@@ -16,6 +16,7 @@ public class Word implements Serializable {
private String word;
private String lemma;
private String msd;
// private String msd;
private List<String> taxonomy;
private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));