Added functional additional combinational filters for words

This commit is contained in:
2018-07-16 10:14:21 +02:00
parent e2ce656fc5
commit c073e12f55
8 changed files with 192 additions and 74 deletions

View File

@@ -10,6 +10,7 @@ import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import data.Filter;
import data.MultipleHMKeys;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang3.tuple.Pair;
@@ -20,22 +21,22 @@ import data.Enums.WordLevelType;
@SuppressWarnings("unchecked")
public class Export {
public static void SetToJSON(Set<Pair<String, Map<String, Long>>> set) {
public static void SetToJSON(Set<Pair<String, Map<MultipleHMKeys, Long>>> set) {
JSONArray wrapper = new JSONArray();
for (Pair<String, Map<String, Long>> p : set) {
for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
JSONArray data_wrapper = new JSONArray();
JSONObject metric = new JSONObject();
String title = p.getLeft();
Map<String, Long> map = p.getRight();
Map<MultipleHMKeys, Long> map = p.getRight();
if (map.isEmpty())
continue;
long total = Util.mapSumFrequencies(map);
for (Map.Entry<String, Long> e : map.entrySet()) {
for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
JSONObject data_entry = new JSONObject();
data_entry.put("word", e.getKey());
data_entry.put("frequency", e.getValue());
@@ -56,8 +57,8 @@ public class Export {
}
}
public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
Map<String, Map<String, AtomicLong>> taxonomyResults) {
public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResults) {
//Delimiter used in CSV file
String NEW_LINE_SEPARATOR = "\n";
List<Object> FILE_HEADER_AL = new ArrayList<Object>();
@@ -65,8 +66,8 @@ public class Export {
//Count frequencies
long num_frequencies = 0;
for (Pair<String, Map<String, Long>> p : set) {
Map<String, Long> map = p.getRight();
for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
Map<MultipleHMKeys, Long> map = p.getRight();
if (map.isEmpty())
continue;
num_frequencies = Util.mapSumFrequencies(map);
@@ -88,21 +89,48 @@ public class Export {
if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) {
headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
FILE_HEADER_AL.add("Različnica");
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse različnice");
} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) {
headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies));
FILE_HEADER_AL.add("Lema");
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse leme");
} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) {
headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies));
FILE_HEADER_AL.add("Oblikoskladenjska oznaka");
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake");
} else {
headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies));
FILE_HEADER_AL.add("Lema");
}
for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
for (MultipleHMKeys key : value.keySet()){
if(!key.getLemma().equals("")){
FILE_HEADER_AL.add("Lema");
}
if(!key.getWordType().equals("")){
FILE_HEADER_AL.add("Besedna vrsta");
}
if(!key.getMsd().equals("")){
FILE_HEADER_AL.add("Oblikoskladenjska oznaka");
}
break;
}
break;
}
if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("različnica")) {
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse različnice");
} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")) {
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse leme");
} else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) {
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse oblikoskladenjske oznake");
} else {
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add("Delež glede na vse leme");
}
@@ -122,14 +150,14 @@ public class Export {
String fileName = "";
for (Pair<String, Map<String, Long>> p : set) {
for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
String title = p.getLeft();
fileName = title.replace(": ", "-");
fileName = fileName.replace(" ", "_").concat(".csv");
fileName = resultsPath.toString().concat(File.separator).concat(fileName);
Map<String, Long> map = p.getRight();
Map<MultipleHMKeys, Long> map = p.getRight();
if (map.isEmpty())
continue;
@@ -155,9 +183,18 @@ public class Export {
//Create CSV file header
csvFilePrinter.printRecord(FILE_HEADER);
for (Map.Entry<String, Long> e : map.entrySet()) {
for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
List dataEntry = new ArrayList<>();
dataEntry.add(e.getKey());
dataEntry.add(e.getKey().getKey());
if(!e.getKey().getLemma().equals("")){
dataEntry.add(e.getKey().getLemma());
}
if(!e.getKey().getWordType().equals("")){
dataEntry.add(e.getKey().getWordType());
}
if(!e.getKey().getMsd().equals("")){
dataEntry.add(e.getKey().getMsd());
}
dataEntry.add(e.getValue().toString());
dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies));

View File

@@ -9,6 +9,7 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import data.MultipleHMKeys;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -85,11 +86,11 @@ public class Util {
* Generic map converter -> since AtomicLongs aren't as comparable.
* Converts ConcurrentHashMap<K, AtomicLong> to HashMap<K, Long>
*/
public static <K, V> Map<String, Long> atomicInt2StringAndInt(Map<K, V> map) {
Map m = new HashMap<String, Long>();
public static <K, V> Map<MultipleHMKeys, Long> atomicInt2StringAndInt(Map<K, V> map) {
Map m = new HashMap<MultipleHMKeys, Long>();
for (Map.Entry<K, V> e : map.entrySet()) {
m.put(e.getKey().toString(), ((AtomicLong) e.getValue()).longValue());
m.put(e.getKey(), ((AtomicLong) e.getValue()).longValue());
}
return m;
@@ -148,7 +149,7 @@ public class Util {
System.out.println();
}
static long mapSumFrequencies(Map<String, Long> map) {
static long mapSumFrequencies(Map<MultipleHMKeys, Long> map) {
long sum = 0;
for (long value : map.values()) {