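// NOTE: the lines below are repository web-viewer residue that was captured together with this file;
// they are not part of the Java source and are kept only as a comment.
// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 516 lines
// 21 KiB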

package util;
import static util.Util.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import data.*;
import gui.I18N;
import gui.ValidationUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.lang3.tuple.Pair;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import data.Enums.WordLevelType;
@SuppressWarnings("unchecked")
public class Export {
// public static void SetToJSON(Set<Pair<String, Map<MultipleHMKeys, Long>>> set) {
// JSONArray wrapper = new JSONArray();
//
// for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
// JSONArray data_wrapper = new JSONArray();
// JSONObject metric = new JSONObject();
//
// String title = p.getLeft();
// Map<MultipleHMKeys, Long> map = p.getRight();
//
// if (map.isEmpty())
// continue;
//
// long total = Util.mapSumFrequencies(map);
//
// for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
// JSONObject data_entry = new JSONObject();
// data_entry.put("word", e.getKey());
// data_entry.put("frequency", e.getValue());
// data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total));
//
// data_wrapper.add(data_entry);
// }
//
// metric.put("Title", title);
// metric.put("data", data_wrapper);
// wrapper.add(metric);
// }
//
// try (FileWriter file = new FileWriter("statistics.json")) {
// file.write(wrapper.toJSONString());
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
/**
 * Exports the calculated frequencies as a semicolon-delimited, fully quoted, UTF-8 encoded CSV file.
 *
 * <p>For every pair in {@code set} whose map is not empty, a file is written to
 * {@code resultsPath}. The file name is derived from {@code statistics.generateResultTitle()}
 * (": " becomes "-", " " becomes "_", ".csv" is appended); the left element of the pair is not
 * used. Each file starts with the {@code headerInfoBlock} lines, followed by the column header
 * and one record per map entry.
 *
 * <p>Side effect: two entries (total sum and found sum for the corpus-wide taxonomy) are added to
 * {@code headerInfoBlock} before anything is written.
 *
 * <p>Errors raised while a file is being written are printed to standard output / standard error
 * and are not propagated; the export of that file is abandoned at the failing record.
 *
 * @param set             pairs of (title, frequency map) to export
 * @param resultsPath     directory the CSV file is created in
 * @param headerInfoBlock key/value lines printed above the table; extended by this method
 * @param statistics      source of the per-taxonomy results, unigram occurrences, collocability
 *                        values, corpus settings and the result title
 * @param filter          settings that decide which columns are exported
 * @return the last file name that was computed, or an empty string when {@code set} is empty
 */
public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock,
        StatisticsNew statistics, Filter filter) {
    Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResults = statistics.getTaxonomyResult();

    // Record separator used in the CSV file
    final String NEW_LINE_SEPARATOR = "\n";

    // Sum of the found frequencies per taxonomy (denominator of the share-of-total columns)
    Map<Taxonomy, Long> selectedTaxonomyFrequencies = new HashMap<>();
    for (Map.Entry<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyEntry : taxonomyResults.entrySet()) {
        long sum = 0;
        for (AtomicLong value : taxonomyEntry.getValue().values()) {
            sum += value.get();
        }
        selectedTaxonomyFrequencies.put(taxonomyEntry.getKey(), sum);
    }
    // Occurrences per taxonomy (denominator of the relative-frequency columns)
    Map<Taxonomy, AtomicLong> taxonomyFrequencies = statistics.getUniGramOccurrences();

    boolean skipgramsEnabled = !ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0;
    // The header and the rows must use the SAME condition, otherwise the columns do not line up
    // (and a partially configured filter would fail on unboxing a null length).
    boolean affixesConfigured = filter.getSuffixLength() != null && filter.getSuffixList() != null
            && filter.getPrefixLength() != null && filter.getPrefixList() != null;

    // CSV file header
    List<Object> fileHeader = new ArrayList<>();
    if (skipgramsEnabled) {
        fileHeader.add(I18N.get("exportTable.skippedWords"));
    }
    fileHeader.add(filter.getCalculateFor().toHeaderString(filter.getNgramValue()));
    if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
        if (filter.getNgramValue() == 0) {
            fileHeader.add(I18N.get("exportTable.lettersSmall"));
        } else if (filter.getNgramValue() >= 1) {
            fileHeader.add(I18N.get("exportTable.wordsSmall"));
        }
    }
    if (affixesConfigured) {
        // NOTE(review): the row side emits the prefix column in the fixed-length branch only when
        // prefixLength > 0; a filter with suffixLength > 0 AND a non-empty prefix list would get
        // one header column more than data columns - confirm that this combination cannot occur.
        if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) {
            fileHeader.add(I18N.get("exportTable.wordBeginning"));
        }
        fileHeader.add(I18N.get("exportTable.wordRest"));
        if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) {
            fileHeader.add(I18N.get("exportTable.wordEnding"));
        }
    }

    headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(taxonomyFrequencies.get(statistics.getCorpus().getTotal()).longValue()));
    headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), String.valueOf(selectedTaxonomyFrequencies.get(statistics.getCorpus().getTotal()).longValue()));

    for (CalculateFor otherKey : filter.getMultipleKeys()) {
        fileHeader.add(otherKey.toHeaderString(filter.getNgramValue()));
        // NOTE(review): the rows add the lower-cased column only when LEMMA is the FIRST
        // additional key, while this header adds it at any position - confirm which is intended.
        if (otherKey.equals(CalculateFor.LEMMA)) {
            fileHeader.add(I18N.get("exportTable.wordsSmall"));
        }
    }

    fileHeader.add(filter.getCalculateFor().totalAbsoluteFrequencyString(filter.getNgramValue()));
    fileHeader.add(filter.getCalculateFor().shareOfTotalString(filter.getNgramValue()));
    fileHeader.add(I18N.get("exportTable.totalRelativeFrequency"));

    for (Taxonomy key : taxonomyResults.keySet()) {
        if (!key.equals(statistics.getCorpus().getTotal()) && taxonomyFrequencies.containsKey(key) && taxonomyFrequencies.get(key).longValue() > 0) {
            fileHeader.add(I18N.get("exportTable.absoluteFrequency") + " [" + key.toString() + "]");
            fileHeader.add(I18N.get("exportTable.percentage") + " [" + key.toString() + "]");
            fileHeader.add(I18N.get("exportTable.relativeFrequency") + " [" + key.toString() + "]");
        }
    }

    for (Collocability c : filter.getCollocability()) {
        fileHeader.add(c.toHeaderString());
    }

    // One numbered column per MSD character; the width is taken from the longest MSD in the
    // first result map. An empty set has no first element, so it simply gets no MSD columns.
    if (filter.getWriteMsdAtTheEnd() && !set.isEmpty()) {
        int maxMsdLength = 0;
        for (MultipleHMKeys key : set.iterator().next().getRight().keySet()) {
            maxMsdLength = Math.max(maxMsdLength, key.getMsd(filter).length());
        }
        for (int i = 0; i < maxMsdLength; i++) {
            fileHeader.add(I18N.get("exportTable.msd") + String.format("%02d", i + 1));
        }
    }

    // "\n" as record delimiter; QuoteMode.ALL puts every value in quotes
    CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';').withQuoteMode(QuoteMode.ALL);

    String fileName = "";
    for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
        String title = statistics.generateResultTitle();
        fileName = title.replace(": ", "-");
        fileName = fileName.replace(" ", "_").concat(".csv");
        fileName = resultsPath.toString().concat(File.separator).concat(fileName);

        Map<MultipleHMKeys, Long> map = p.getRight();
        if (map.isEmpty()) {
            continue;
        }

        // try-with-resources closes the printer first and the writer second, even when a
        // flush or an earlier close fails.
        try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
             CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {
            // write info block
            printHeaderInfo(csvFilePrinter, headerInfoBlock);
            // write column header
            csvFilePrinter.printRecord(fileHeader);

            for (Map.Entry<MultipleHMKeys, Long> entry : map.entrySet()) {
                MultipleHMKeys rowKey = entry.getKey();
                List<Object> dataEntry = new ArrayList<>();

                if (skipgramsEnabled) {
                    dataEntry.add(rowKey.getK1());
                }
                dataEntry.add(eraseSkipgramStars(rowKey.getK1(), filter));
                if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
                    dataEntry.add(eraseSkipgramStars(rowKey.getK1().toLowerCase(), filter));
                }

                if (affixesConfigured) {
                    // the first column of the row is the word that gets split
                    appendAffixColumns(dataEntry, (String) dataEntry.get(0), filter);
                }

                // additional keys map positionally onto K2..K5
                int i = 0;
                for (CalculateFor otherKey : filter.getMultipleKeys()) {
                    switch (i) {
                        case 0:
                            dataEntry.add(eraseSkipgramStars(rowKey.getK2(), filter));
                            if (otherKey.equals(CalculateFor.LEMMA)) {
                                dataEntry.add(eraseSkipgramStars(rowKey.getK2().toLowerCase(), filter));
                            }
                            break;
                        case 1:
                            dataEntry.add(eraseSkipgramStars(rowKey.getK3(), filter));
                            break;
                        case 2:
                            dataEntry.add(eraseSkipgramStars(rowKey.getK4(), filter));
                            break;
                        case 3:
                            dataEntry.add(eraseSkipgramStars(rowKey.getK5(), filter));
                            break;
                        default:
                            break;
                    }
                    i++;
                }

                // absolute frequency, share of found total, relative frequency per million
                dataEntry.add(entry.getValue().toString());
                dataEntry.add(formatNumberAsPercent((double) entry.getValue() / selectedTaxonomyFrequencies.get(statistics.getCorpus().getTotal()), statistics.getCorpus().getPunctuation()));
                dataEntry.add(formatNumberForExport(((double) entry.getValue() * 1000000) / taxonomyFrequencies.get(statistics.getCorpus().getTotal()).longValue(), statistics.getCorpus().getPunctuation()));

                for (Taxonomy key : taxonomyResults.keySet()) {
                    if (!key.equals(statistics.getCorpus().getTotal()) && taxonomyFrequencies.containsKey(key) && taxonomyFrequencies.get(key).longValue() > 0) {
                        AtomicLong frequency = taxonomyResults.get(key).get(rowKey);
                        // a key without a counter in this taxonomy counts as 0, so the row keeps
                        // its three columns instead of aborting the export with an NPE
                        long taxonomyFrequency = frequency == null ? 0L : frequency.get();
                        dataEntry.add(Long.toString(taxonomyFrequency));
                        dataEntry.add(formatNumberAsPercent((double) taxonomyFrequency / selectedTaxonomyFrequencies.get(key), statistics.getCorpus().getPunctuation()));
                        dataEntry.add(formatNumberForExport(((double) taxonomyFrequency * 1000000) / taxonomyFrequencies.get(key).longValue(), statistics.getCorpus().getPunctuation()));
                    }
                }

                for (Collocability c : filter.getCollocability()) {
                    dataEntry.add(formatNumberForLongExport(statistics.getCollocability().get(c).get(rowKey), statistics.getCorpus().getPunctuation()));
                }

                // Write msd separated per letters at the end of each line in csv
                if (filter.getWriteMsdAtTheEnd()) {
                    String msd = rowKey.getMsd(filter);
                    String[] charArray = msd.split("(?!^)");
                    dataEntry.addAll(Arrays.asList(charArray));
                }

                csvFilePrinter.printRecord(dataEntry);
            }
        } catch (Exception ex) {
            // best effort: report the failure and continue with the next result map
            System.out.println("Error in CsvFileWriter!");
            ex.printStackTrace();
        }
    }
    return fileName;
}

/**
 * Appends the "beginning / rest / ending" columns of {@code word} to {@code dataEntry}.
 *
 * <p>When a positive prefix or suffix length is configured the word is cut at those fixed
 * lengths; otherwise the first entry of the prefix list the word starts with and the first entry
 * of the suffix list it ends with are used (an empty string when none matches).
 * Callers must have checked that all four affix settings of {@code filter} are non-null.
 */
private static void appendAffixColumns(List<Object> dataEntry, String word, Filter filter) {
    int prefixLength = filter.getPrefixLength();
    int suffixLength = filter.getSuffixLength();

    if (suffixLength > 0 || prefixLength > 0) {
        if (prefixLength > 0) {
            dataEntry.add(word.substring(0, prefixLength));
        }
        dataEntry.add(word.substring(prefixLength, word.length() - suffixLength));
        if (suffixLength > 0) {
            dataEntry.add(word.substring(word.length() - suffixLength));
        }
        return;
    }

    // real prefix
    String realPrefix = "";
    for (String prefix : filter.getPrefixList()) {
        if (word.startsWith(prefix)) {
            realPrefix = prefix;
            break;
        }
    }
    // real suffix
    String realSuffix = "";
    for (String suffix : filter.getSuffixList()) {
        if (word.endsWith(suffix)) {
            realSuffix = suffix;
            break;
        }
    }

    if (filter.getPrefixList().size() > 0) {
        dataEntry.add(realPrefix);
    }
    dataEntry.add(word.substring(realPrefix.length(), word.length() - realSuffix.length()));
    if (filter.getSuffixList().size() > 0) {
        dataEntry.add(realSuffix);
    }
}
/**
 * Returns {@code s} with every occurrence of the literal sequence "* " removed when the filter
 * has a non-empty skip value greater than zero; otherwise returns {@code s} unchanged.
 */
private static String eraseSkipgramStars(String s, Filter filter){
    boolean skipgramsEnabled = !ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0;
    return skipgramsEnabled ? s.replace("* ", "") : s;
}
// public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
// //Delimiter used in CSV file
// String NEW_LINE_SEPARATOR = "\n";
//
// //CSV file header
// Object[] FILE_HEADER = {"word", "frequency", "percent"};
//
// String fileName = "";
//
// fileName = title.replace(": ", "-");
// fileName = fileName.replace(" ", "_").concat(".csv");
//
// fileName = resultsPath.toString().concat(File.separator).concat(fileName);
//
// OutputStreamWriter fileWriter = null;
// CSVPrinter csvFilePrinter = null;
//
// //Create the CSVFormat object with "\n" as a record delimiter
// CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');
//
// try {
// //initialize FileWriter object
// fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
//
// //initialize CSVPrinter object
// csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat);
//
// // write info block
// printHeaderInfo(csvFilePrinter, headerInfoBlock);
//
// //Create CSV file header
// csvFilePrinter.printRecord(FILE_HEADER);
//
// for (Object[] resultEntry : result) {
// List dataEntry = new ArrayList<>();
// dataEntry.add(resultEntry[0]);
// dataEntry.add(resultEntry[1]);
// dataEntry.add(formatNumberAsPercent(resultEntry[2]), statistics.getCorpus().getPunctuation());
// csvFilePrinter.printRecord(dataEntry);
// }
// } catch (Exception e) {
// System.out.println("Error in CsvFileWriter!");
// e.printStackTrace();
// } finally {
// try {
// if (fileWriter != null) {
// fileWriter.flush();
// fileWriter.close();
// }
// if (csvFilePrinter != null) {
// csvFilePrinter.close();
// }
// } catch (IOException e) {
// System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
// e.printStackTrace();
// }
// }
//
// return fileName;
// }
/**
 * Writes a nested result map to a semicolon-delimited, UTF-8 encoded CSV file.
 *
 * <p>The file is created in {@code resultsPath}; its name is {@code title} with ": " replaced by
 * "-" and " " replaced by "_", plus the ".csv" extension. The file starts with the
 * {@code headerInfoBlock} lines, followed by the header {@code type;key;word;frequency} and one
 * record per innermost map entry.
 *
 * <p>Errors raised while writing or closing the file are printed to standard output / standard
 * error and are not propagated.
 *
 * @param title           result title the file name is derived from
 * @param result          type -> key -> word -> frequency
 * @param resultsPath     directory the CSV file is created in
 * @param headerInfoBlock key/value lines printed above the table
 * @return the path of the file that was (attempted to be) written
 */
public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
    // Record separator used in the CSV file
    final String NEW_LINE_SEPARATOR = "\n";
    // CSV file header
    final Object[] FILE_HEADER = {"type", "key", "word", "frequency"};

    String fileName = title.replace(": ", "-");
    fileName = fileName.replace(" ", "_").concat(".csv");
    fileName = resultsPath.toString().concat(File.separator).concat(fileName);

    // Create the CSVFormat object with "\n" as a record delimiter
    CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

    // try-with-resources closes the printer first and the writer second, even when a flush or
    // an earlier close fails.
    try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
         CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {
        // write info block
        printHeaderInfo(csvFilePrinter, headerInfoBlock);
        // write column header
        csvFilePrinter.printRecord(FILE_HEADER);

        for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
            for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
                for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
                    List<Object> values = new ArrayList<>(4);
                    values.add(typeEntry.getKey().getName());
                    values.add(keyWordEntry.getKey());
                    values.add(calculationResults.getKey());
                    values.add(calculationResults.getValue());
                    csvFilePrinter.printRecord(values);
                }
            }
        }
    } catch (Exception e) {
        // best effort: report the failure, the caller still gets the file name
        System.out.println("Error in CsvFileWriter!");
        e.printStackTrace();
    }
    return fileName;
}
/**
 * Prints the info block that precedes the table: one two-column record (key, value) per entry
 * of {@code headerInfoBlock}, in the map's iteration order, followed by two empty records.
 *
 * @throws IOException if the printer fails to write a record
 */
private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap<String, String> headerInfoBlock) throws IOException {
    for (Map.Entry<String, String> infoLine : headerInfoBlock.entrySet()) {
        List<String> keyAndValue = Arrays.asList(infoLine.getKey(), infoLine.getValue());
        csvFilePrinter.printRecord(keyAndValue);
    }

    // 2 empty lines separate the info block from the table
    List<String> emptyRecord = Collections.emptyList();
    for (int blank = 0; blank < 2; blank++) {
        csvFilePrinter.printRecord(emptyRecord);
    }
}
}