Added functionality for n-grams (comma separation), minimum occurrences, etc.

2018-07-31 08:58:17 +02:00
parent 681eb4f949
commit 179f09c4bd
27 changed files with 405 additions and 4962 deletions
--- a/src/main/java/data/Filter.java
+++ b/src/main/java/data/Filter.java
@@ -26,7 +26,9 @@ public class Filter {
 		HAS_MSD,
 		SOLAR_FILTERS,
 		MULTIPLE_KEYS,
-		NOTE_PUNCTUATIONS
+		NOTE_PUNCTUATIONS,
+		MINIMAL_OCCURRENCES,
+		MINIMAL_TAXONOMY
 	}

 	public Filter() {
@@ -170,4 +172,21 @@ public class Filter {
    public boolean getNotePunctuations() {
        return filter.containsKey(NOTE_PUNCTUATIONS) && (boolean) filter.get(NOTE_PUNCTUATIONS);
    }
+
+	public void setMinimalOccurrences(Integer minOccurrences) {
+		filter.put(MINIMAL_OCCURRENCES, minOccurrences);
+	}
+
+	public Integer getMinimalOccurrences() {
+		return (Integer) filter.get(MINIMAL_OCCURRENCES);
+	}
+
+
+	public void setMinimalTaxonomy(Integer minTaxonomy) {
+		filter.put(MINIMAL_TAXONOMY, minTaxonomy);
+	}
+
+	public Integer getMinimalTaxonomy() {
+		return (Integer) filter.get(MINIMAL_TAXONOMY);
+	}
 }
--- a/src/main/java/data/StatisticsNew.java
+++ b/src/main/java/data/StatisticsNew.java
@@ -48,15 +48,16 @@ public class StatisticsNew {
 		this.taxonomyResult.put("Total", new ConcurrentHashMap<>());

 		// create table for counting word occurances per taxonomies
-
-		if (this.filter.getTaxonomy().isEmpty()) {
-			for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
-				this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
-			}
-		} else {
-			for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
-				Tax taxonomy = new Tax();
-				this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
+		if (this.corpus.getTaxonomy() != null) {
+			if (this.filter.getTaxonomy().isEmpty()) {
+				for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) {
+					this.taxonomyResult.put(this.corpus.getTaxonomy().get(i), new ConcurrentHashMap<>());
+				}
+			} else {
+				for (int i = 0; i < this.filter.getTaxonomy().size(); i++) {
+					Tax taxonomy = new Tax();
+					this.taxonomyResult.put(taxonomy.getLongTaxonomyName(this.filter.getTaxonomy().get(i)), new ConcurrentHashMap<>());
+				}
 			}
 		}

@@ -209,11 +210,45 @@ public class StatisticsNew {
 			analysisProducedResults = true;
 		}

+		removeMinimalOccurrences(taxonomyResult.get("Total"), filter.getMinimalOccurrences());
+		removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy());
 		stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit))));
 		Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult);
 		return true;
 	}

+	/**
+	 * Removes lines where the number of different taxonomies is lower than the specified number (minimalTaxonomy)
+	 */
+	private void removeMinimalTaxonomy(Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult, Integer minimalTaxonomy) {
+		if (minimalTaxonomy == 1)
+			return;
+		int occurances;
+		for (MultipleHMKeys key : taxonomyResult.get("Total").keySet()){
+			occurances = 0;
+			for (String columnNameKey : taxonomyResult.keySet()){
+				if(!columnNameKey.equals("Total") && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1)
+					occurances++;
+			}
+			if(occurances < minimalTaxonomy){
+				taxonomyResult.get("Total").remove(key);
+			}
+		}
+	}
+
+	/**
+	 * Removes lines where the total number of occurrences is lower than the specified number (minimalOccurrences)
+	 */
+	private void removeMinimalOccurrences(Map<MultipleHMKeys, AtomicLong> taxonomyResultTotal, Integer minimalOccurrences) {
+		if (minimalOccurrences == 0)
+			return;
+		for (MultipleHMKeys key : taxonomyResultTotal.keySet()){
+			if(taxonomyResultTotal.get(key).intValue() < minimalOccurrences){
+				taxonomyResultTotal.remove(key);
+			}
+		}
+	}
+
 	public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
 		resultTitle = generateResultTitle();

@@ -285,7 +320,8 @@ public class StatisticsNew {
 	public void updateTaxonomyResults(MultipleHMKeys o, List<String> taxonomy) {
 		for (String key : taxonomyResult.keySet()) {
 			// first word should have the same taxonomy as others
-			if (taxonomy.contains(key) || key.equals("Total")) {
+			if (key.equals("Total") || taxonomy.contains(key)) {
+//			if (key.equals("Total") || taxonomy != null && taxonomy.contains(key)) {
 				// if taxonomy not in map and in this word
 				AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1));

@@ -389,13 +425,13 @@ public class StatisticsNew {
 		if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
 			Integer ngramLevel = filter.getNgramValue();
 			if (ngramLevel == 0)
-				info.put("Analiza:", "Črke");
+				info.put("Analiza", "Črke");
 			else if (ngramLevel == 1)
 				info.put("Analiza", "Besede");
 			else
-				info.put("Analiza:", filter.getAl().toString());
+				info.put("Analiza", filter.getAl().toString());
 		} else {
-			info.put("Analiza:", filter.getAl().toString());
+			info.put("Analiza", filter.getAl().toString());
 		}

 		if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
--- a/src/main/java/data/Word.java
+++ b/src/main/java/data/Word.java
@@ -16,6 +16,7 @@ public class Word implements Serializable {
 	private String word;
 	private String lemma;
 	private String msd;
+//	private String msd;
 	private List<String> taxonomy;
 	private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));