Added fixes for the ssj500k functionality; fixed the prefix/suffix bug and some other bugs.

This commit is contained in:
2018-12-01 10:50:11 +01:00
parent 9efe3d529b
commit ca83cb023b
14 changed files with 530 additions and 162 deletions

View File

@@ -64,27 +64,27 @@ public class Export {
//Delimiter used in CSV file
String NEW_LINE_SEPARATOR = "\n";
List<Object> FILE_HEADER_AL = new ArrayList<Object>();
List<Object> FILE_HEADER_AL = new ArrayList<>();
Object[] FILE_HEADER;
//Count frequencies
long num_frequencies = 0;
for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
Map<MultipleHMKeys, Long> map = p.getRight();
if (map.isEmpty())
continue;
num_frequencies = Util.mapSumFrequencies(map);
}
// Map<String, Long> num_taxonomy_frequencies = new ConcurrentHashMap<>();
// for (String taxonomyKey : taxonomyResults.keySet()) {
// num_taxonomy_frequencies.put(taxonomyKey, (long) 0);
// for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){
// long val = num_taxonomy_frequencies.get(taxonomyKey);
// val += value.get();
// num_taxonomy_frequencies.put(taxonomyKey, val);
// }
// long num_frequencies = 0;
// for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
// Map<MultipleHMKeys, Long> map = p.getRight();
// if (map.isEmpty())
// continue;
// num_frequencies = Util.mapSumFrequencies(map);
// }
Map<Taxonomy, Long> num_selected_taxonomy_frequencies = new ConcurrentHashMap<>();
for (Taxonomy taxonomyKey : taxonomyResults.keySet()) {
num_selected_taxonomy_frequencies.put(taxonomyKey, (long) 0);
for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){
long val = num_selected_taxonomy_frequencies.get(taxonomyKey);
val += value.get();
num_selected_taxonomy_frequencies.put(taxonomyKey, val);
}
}
Map<Taxonomy, AtomicLong> num_taxonomy_frequencies = statistics.getUniGramOccurrences();
@@ -92,32 +92,37 @@ public class Export {
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
FILE_HEADER_AL.add("Izpuščene besede");
}
FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString(filter.getNgramValue()));
if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
if(filter.getNgramValue() == 0) {
FILE_HEADER_AL.add("Črkovni niz (male črke)");
} else if(filter.getNgramValue() >= 1) {
FILE_HEADER_AL.add("Lema (male črke)");
}
}
if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) {
FILE_HEADER_AL.add("Predpona");
FILE_HEADER_AL.add("Začetni del besede");
}
FILE_HEADER_AL.add("Preostali del besede");
if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) {
FILE_HEADER_AL.add("Pripona");
FILE_HEADER_AL.add("Končni del besede");
}
}
headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(statistics.getUniGramOccurrences().get(Taxonomy.TOTAL).longValue()));
headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue()));
headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), String.valueOf(num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue()));
// headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies));
for (CalculateFor otherKey : filter.getMultipleKeys()) {
FILE_HEADER_AL.add(otherKey.toHeaderString());
FILE_HEADER_AL.add(otherKey.toHeaderString(filter.getNgramValue()));
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
FILE_HEADER_AL.add("Lema (male črke)");
}
FILE_HEADER_AL.add("Skupna absolutna pogostost");
FILE_HEADER_AL.add(filter.getCalculateFor().toPercentString());
FILE_HEADER_AL.add(filter.getCalculateFor().totalAbsoluteFrequencyString(filter.getNgramValue()));
FILE_HEADER_AL.add(filter.getCalculateFor().shareOfTotalString(filter.getNgramValue()));
FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");
@@ -216,6 +221,9 @@ public class Export {
// real prefix
String rpf = "";
for(String pf : filter.getPrefixList()){
if (key.length() < pf.length()) {
continue;
}
if (pf.equals(key.substring(0, pf.length()))){
rpf = pf;
break;
@@ -225,6 +233,9 @@ public class Export {
// real suffix
String rsf = "";
for(String sf : filter.getSuffixList()){
if (key.length() < sf.length()) {
continue;
}
if (sf.equals(key.substring(key.length() - sf.length()))){
rsf = sf;
break;
@@ -268,13 +279,13 @@ public class Export {
dataEntry.add(e.getValue().toString());
dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies));
dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL)));
dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue()));
for (Taxonomy key : taxonomyResults.keySet()){
if(!key.equals(Taxonomy.TOTAL) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) {
AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
dataEntry.add(frequency.toString());
dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key).longValue()));
dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_selected_taxonomy_frequencies.get(key)));
dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key).longValue()));
// dataEntry.add(formatNumberAsPercent((double) frequency.get() / statistics.getUniGramOccurrences()));
// dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / statistics.getUniGramOccurrences()));