Added functionality for n-grams (comma separation), minimal occurrences, etc.
This commit is contained in:
@@ -26,7 +26,9 @@ public class Filter {
|
||||
HAS_MSD,
|
||||
SOLAR_FILTERS,
|
||||
MULTIPLE_KEYS,
|
||||
NOTE_PUNCTUATIONS
|
||||
NOTE_PUNCTUATIONS,
|
||||
MINIMAL_OCCURRENCES,
|
||||
MINIMAL_TAXONOMY
|
||||
}
|
||||
|
||||
public Filter() {
|
||||
@@ -170,4 +172,21 @@ public class Filter {
|
||||
public boolean getNotePunctuations() {
|
||||
return filter.containsKey(NOTE_PUNCTUATIONS) && (boolean) filter.get(NOTE_PUNCTUATIONS);
|
||||
}
|
||||
|
||||
public void setMinimalOccurrences(Integer minOccurrences) {
|
||||
filter.put(MINIMAL_OCCURRENCES, minOccurrences);
|
||||
}
|
||||
|
||||
public Integer getMinimalOccurrences() {
|
||||
return (Integer) filter.get(MINIMAL_OCCURRENCES);
|
||||
}
|
||||
|
||||
|
||||
public void setMinimalTaxonomy(Integer minTaxonomy) {
|
||||
filter.put(MINIMAL_TAXONOMY, minTaxonomy);
|
||||
}
|
||||
|
||||
public Integer getMinimalTaxonomy() {
|
||||
return (Integer) filter.get(MINIMAL_TAXONOMY);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,15 +48,16 @@ public class StatisticsNew {
|
||||
this.taxonomyResult.put("Total", new ConcurrentHashMap<>());
|
||||
|
||||
// create table for counting word occurrences per taxonomy
|
||||
|
||||
if (this.filter.getTaxonomy().isEmpty()) {
|
||||
for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
|
||||
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
|
||||
Tax taxonomy = new Tax();
|
||||
this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
|
||||
if (this.corpus.getTaxonomy() != null) {
|
||||
if (this.filter.getTaxonomy().isEmpty()) {
|
||||
for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
|
||||
this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
|
||||
Tax taxonomy = new Tax();
|
||||
this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,11 +210,45 @@ public class StatisticsNew {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
removeMinimalOccurrences(taxonomyResult.get("Total"), filter.getMinimalOccurrences());
|
||||
removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());
|
||||
stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit))));
|
||||
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes lines, where number of different taxonomies is lower than specified number (minimalTaxonomy)
|
||||
*/
|
||||
private void removeMinimalTaxonomy(Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
|
||||
if (minimalTaxonomy == 1)
|
||||
return;
|
||||
int occurances;
|
||||
for (MultipleHMKeys key : taxonomyResult.get("Total").keySet()){
|
||||
occurances = 0;
|
||||
for (String columnNameKey : taxonomyResult.keySet()){
|
||||
if(!columnNameKey.equals("Total") && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1)
|
||||
occurances++;
|
||||
}
|
||||
if(occurances < minimalTaxonomy){
|
||||
taxonomyResult.get("Total").remove(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes lines where total number of occurrences is lower than specified number (minimalOccurrences)
|
||||
*/
|
||||
private void removeMinimalOccurrences(Map<MultipleHMKeys, AtomicLong> taxonomyResultTotal, Integer minimalOccurrences) {
|
||||
if (minimalOccurrences == 0)
|
||||
return;
|
||||
for (MultipleHMKeys key : taxonomyResultTotal.keySet()){
|
||||
if(taxonomyResultTotal.get(key).intValue() < minimalOccurrences){
|
||||
taxonomyResultTotal.remove(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
@@ -285,7 +320,8 @@ public class StatisticsNew {
|
||||
public void updateTaxonomyResults(MultipleHMKeys o, List<String> taxonomy) {
|
||||
for (String key : taxonomyResult.keySet()) {
|
||||
// first word should have the same taxonomy as others
|
||||
if (taxonomy.contains(key) || key.equals("Total")) {
|
||||
if (key.equals("Total") || taxonomy.contains(key)) {
|
||||
// if (key.equals("Total") || taxonomy != null && taxonomy.contains(key)) {
|
||||
// if taxonomy not in map and in this word
|
||||
AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));
|
||||
|
||||
@@ -389,13 +425,13 @@ public class StatisticsNew {
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
if (ngramLevel == 0)
|
||||
info.put("Analiza:", "Črke");
|
||||
info.put("Analiza", "Črke");
|
||||
else if (ngramLevel == 1)
|
||||
info.put("Analiza", "Besede");
|
||||
else
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
info.put("Analiza", filter.getAl().toString());
|
||||
} else {
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
info.put("Analiza", filter.getAl().toString());
|
||||
}
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
|
||||
@@ -16,6 +16,7 @@ public class Word implements Serializable {
|
||||
private String word;
|
||||
private String lemma;
|
||||
private String msd;
|
||||
// private String msd;
|
||||
private List<String> taxonomy;
|
||||
private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user