package util;

import static util.Util.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;

import data.CalculateFor;
import data.Filter;
import data.MultipleHMKeys;
import data.Enums.WordLevelType;
import gui.ValidationUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.lang3.tuple.Pair;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 * Exports calculation results (word/lemma/MSD frequency statistics) to JSON and CSV files.
 *
 * <p>All CSV output is written as UTF-8, ';'-delimited, '\n'-terminated records, prefixed by a
 * key/value metadata block (see {@link #printHeaderInfo}). Header and column labels are
 * user-facing Slovenian strings and must not be altered.</p>
 */
// json-simple's JSONObject/JSONArray are raw collections, hence the unchecked suppression.
@SuppressWarnings("unchecked")
public class Export {

	/**
	 * Serializes each (title, word-&gt;frequency) pair into a JSON array and writes it to
	 * "statistics.json" in the current working directory. Pairs with an empty map are skipped.
	 * Each data entry carries the word, its absolute frequency and its share of the pair's total.
	 * I/O errors are logged and swallowed (best-effort export).
	 */
	public static void SetToJSON(Set<Pair<String, Map<String, Long>>> set) {
		JSONArray wrapper = new JSONArray();

		for (Pair<String, Map<String, Long>> p : set) {
			JSONArray data_wrapper = new JSONArray();
			JSONObject metric = new JSONObject();

			String title = p.getLeft();
			Map<String, Long> map = p.getRight();

			if (map.isEmpty())
				continue;

			long total = Util.mapSumFrequencies(map);

			for (Map.Entry<String, Long> e : map.entrySet()) {
				JSONObject data_entry = new JSONObject();
				data_entry.put("word", e.getKey());
				data_entry.put("frequency", e.getValue());
				data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total));
				data_wrapper.add(data_entry);
			}

			metric.put("Title", title);
			metric.put("data", data_wrapper);
			wrapper.add(metric);
		}

		// Explicit UTF-8 for consistency with the CSV exporters (FileWriter would use the
		// platform charset and mangle Slovenian diacritics on non-UTF-8 systems).
		try (Writer file = new OutputStreamWriter(new FileOutputStream("statistics.json"), StandardCharsets.UTF_8)) {
			file.write(wrapper.toJSONString());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Writes one CSV file per (title, results-map) pair into {@code resultsPath}.
	 *
	 * <p>Each row contains the primary key (K1, lower-cased duplicate for lemmas), any additional
	 * keys configured via {@code filter.getMultipleKeys()} (K2..K5), absolute/percent/per-million
	 * frequencies, per-taxonomy frequencies, and — when {@code filter.getWriteMsdAtTheEnd()} —
	 * the MSD tag split into one column per character.</p>
	 *
	 * @param set              calculation results, one pair per output file; empty maps are skipped
	 * @param resultsPath      target directory for the generated files
	 * @param headerInfoBlock  metadata written at the top of every file (mutated: the total
	 *                         frequency is added under the calculate-for metadata key)
	 * @param taxonomyResults  per-taxonomy frequency maps; the "Total" key is excluded from columns
	 * @param filter           export configuration (calculate-for, skip value, multiple keys, MSD flag)
	 * @return the path of the last file written, or "" if no file was produced
	 */
	public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath,
								  LinkedHashMap<String, String> headerInfoBlock,
								  Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResults, Filter filter) {
		// Delimiter used in CSV file
		String NEW_LINE_SEPARATOR = "\n";
		List<String> FILE_HEADER_AL = new ArrayList<>();
		Object[] FILE_HEADER;

		// Count frequencies.
		// NOTE(review): this intentionally ends up with the LAST non-empty pair's sum, not a
		// grand total — presumably the set holds a single pair per export; confirm with callers.
		long num_frequencies = 0;
		for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
			Map<MultipleHMKeys, Long> map = p.getRight();
			if (map.isEmpty())
				continue;
			num_frequencies = Util.mapSumFrequencies(map);
		}

		// Sum the per-taxonomy frequencies once up front (used for headers and percent columns).
		Map<String, Long> num_taxonomy_frequencies = new ConcurrentHashMap<>();
		for (Map.Entry<String, Map<MultipleHMKeys, AtomicLong>> taxonomyEntry : taxonomyResults.entrySet()) {
			long sum = 0;
			for (AtomicLong value : taxonomyEntry.getValue().values()) {
				sum += value.get();
			}
			num_taxonomy_frequencies.put(taxonomyEntry.getKey(), sum);
		}

		// CSV file header
		if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
			FILE_HEADER_AL.add("Izpuščene besede");
		}
		FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
		if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
			FILE_HEADER_AL.add("Lema male črke");

		headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies));

		// One column per additional key; lemmas additionally get a lower-case column.
		for (CalculateFor otherKey : filter.getMultipleKeys()) {
			FILE_HEADER_AL.add(otherKey.toHeaderString());
			if (otherKey.equals(CalculateFor.LEMMA))
				FILE_HEADER_AL.add("Lema male črke");
		}

		FILE_HEADER_AL.add("Skupna absolutna pogostost");
		FILE_HEADER_AL.add(filter.getCalculateFor().toPercentString());
		FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");

		// Three columns per taxonomy branch that actually has hits ("Total" excluded).
		for (String key : taxonomyResults.keySet()) {
			if (!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
				FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
				FILE_HEADER_AL.add("Delež [" + key + "]");
				FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
			}
		}

		FILE_HEADER = FILE_HEADER_AL.toArray(new String[0]);

		String fileName = "";

		for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
			String title = p.getLeft();
			fileName = title.replace(": ", "-");
			fileName = fileName.replace(" ", "_").concat(".csv");
			fileName = resultsPath.toString().concat(File.separator).concat(fileName);

			Map<MultipleHMKeys, Long> map = p.getRight();
			if (map.isEmpty())
				continue;

			// QuoteMode.ALL: quote every field so embedded ';' or newlines cannot break records.
			CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR)
					.withDelimiter(';').withQuoteMode(QuoteMode.ALL);

			// try-with-resources guarantees flush+close even on partial failure (the printer
			// closes the underlying writer).
			try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
				 CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {

				// write info block
				printHeaderInfo(csvFilePrinter, headerInfoBlock);

				// Create CSV file header
				csvFilePrinter.printRecord(FILE_HEADER);

				for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
					List<Object> dataEntry = new ArrayList<>();

					// With skipgrams active, the raw key (stars included) is emitted first.
					if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
						dataEntry.add(e.getKey().getK1());
					}
					dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
					if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
						dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
					}

					// Additional keys map positionally onto K2..K5 of MultipleHMKeys.
					int i = 0;
					for (CalculateFor otherKey : filter.getMultipleKeys()) {
						switch (i) {
							case 0:
								dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
								if (otherKey.equals(CalculateFor.LEMMA)) {
									dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
								}
								break;
							case 1:
								dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
								break;
							case 2:
								dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
								break;
							case 3:
								dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
								break;
						}
						i++;
					}

					dataEntry.add(e.getValue().toString());
					dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
					dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000) / num_frequencies));

					for (String key : taxonomyResults.keySet()) {
						if (!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
							AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
							// Guard: an entry may be absent for this taxonomy branch; the
							// original code NPE'd here and aborted the whole file.
							long freq = frequency == null ? 0L : frequency.get();
							dataEntry.add(String.valueOf(freq));
							dataEntry.add(formatNumberAsPercent((double) freq / num_taxonomy_frequencies.get(key)));
							dataEntry.add(String.format("%.2f", ((double) freq * 1000000) / num_taxonomy_frequencies.get(key)));
						}
					}

					// Write msd separated per letters at the end of each line in csv
					if (filter.getWriteMsdAtTheEnd()) {
						String msd = "";

						if (filter.getCalculateFor().equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) {
							msd = e.getKey().getK1();
						} else if (filter.getMultipleKeys().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) {
							// Locate the MSD among the additional keys (same K2..K5 mapping as above).
							i = 0;
							for (CalculateFor otherKey : filter.getMultipleKeys()) {
								if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)) {
									switch (i) {
										case 0:
											msd = e.getKey().getK2();
											break;
										case 1:
											msd = e.getKey().getK3();
											break;
										case 2:
											msd = e.getKey().getK4();
											break;
										case 3:
											msd = e.getKey().getK5();
											break;
									}
								}
								i++;
							}
						}

						// "(?!^)" splits between every character without producing a leading "".
						String[] charArray = msd.split("(?!^)");
						dataEntry.addAll(Arrays.asList(charArray));
					}

					csvFilePrinter.printRecord(dataEntry);
				}
			} catch (Exception e) {
				System.out.println("Error in CsvFileWriter!");
				e.printStackTrace();
			}
		}

		return fileName;
	}

	/**
	 * Removes the skipgram placeholder stars ("* ") from {@code s} when skipgrams are enabled;
	 * returns the string unchanged otherwise.
	 */
	private static String eraseSkipgramStars(String s, Filter filter) {
		if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
			s = s.replace("* ", "");
		}
		return s;
	}

	/**
	 * Writes a simple word/frequency/percent table to "&lt;title&gt;.csv" in {@code resultsPath}.
	 *
	 * @param result rows of {word, frequency, percent-as-number}; the third column is formatted
	 *               as a percentage on output
	 * @return the path of the file written (even if writing failed — errors are logged only)
	 */
	public static String SetToCSV(String title, Object[][] result, File resultsPath,
								  LinkedHashMap<String, String> headerInfoBlock) {
		// Delimiter used in CSV file
		String NEW_LINE_SEPARATOR = "\n";

		// CSV file header
		Object[] FILE_HEADER = {"word", "frequency", "percent"};

		String fileName = title.replace(": ", "-");
		fileName = fileName.replace(" ", "_").concat(".csv");
		fileName = resultsPath.toString().concat(File.separator).concat(fileName);

		// Create the CSVFormat object with "\n" as a record delimiter
		CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

		try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
			 CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {

			// write info block
			printHeaderInfo(csvFilePrinter, headerInfoBlock);

			// Create CSV file header
			csvFilePrinter.printRecord(FILE_HEADER);

			for (Object[] resultEntry : result) {
				List<Object> dataEntry = new ArrayList<>();
				dataEntry.add(resultEntry[0]);
				dataEntry.add(resultEntry[1]);
				dataEntry.add(formatNumberAsPercent(resultEntry[2]));
				csvFilePrinter.printRecord(dataEntry);
			}
		} catch (Exception e) {
			System.out.println("Error in CsvFileWriter!");
			e.printStackTrace();
		}

		return fileName;
	}

	/**
	 * Flattens a type -&gt; key -&gt; word -&gt; frequency nesting into one CSV row per innermost
	 * entry ("type;key;word;frequency") and writes it to "&lt;title&gt;.csv" in {@code resultsPath}.
	 *
	 * @return the path of the file written (errors are logged only)
	 */
	public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result,
										File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
		// Delimiter used in CSV file
		String NEW_LINE_SEPARATOR = "\n";

		// CSV file header
		Object[] FILE_HEADER = {"type", "key", "word", "frequency"};

		String fileName = title.replace(": ", "-");
		fileName = fileName.replace(" ", "_").concat(".csv");
		fileName = resultsPath.toString().concat(File.separator).concat(fileName);

		// Create the CSVFormat object with "\n" as a record delimiter
		CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

		try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
			 CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {

			// write info block
			printHeaderInfo(csvFilePrinter, headerInfoBlock);

			// Create CSV file header
			csvFilePrinter.printRecord(FILE_HEADER);

			for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
				for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
					for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
						List<Object> values = new ArrayList<>();
						values.add(typeEntry.getKey().getName());
						values.add(keyWordEntry.getKey());
						values.add(calculationResults.getKey());
						values.add(calculationResults.getValue());
						csvFilePrinter.printRecord(values);
					}
				}
			}
		} catch (Exception e) {
			System.out.println("Error in CsvFileWriter!");
			e.printStackTrace();
		}

		return fileName;
	}

	/**
	 * Writes the metadata block (one key;value record per entry, insertion order preserved)
	 * followed by two empty records to separate it from the data table.
	 */
	private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap<String, String> headerInfoBlock)
			throws IOException {
		for (Map.Entry<String, String> entry : headerInfoBlock.entrySet()) {
			List<Object> values = new ArrayList<>();
			values.add(entry.getKey());
			values.add(entry.getValue());
			csvFilePrinter.printRecord(values);
		}

		// 2 empty lines
		List<Object> values = new ArrayList<>();
		csvFilePrinter.printRecord(values);
		csvFilePrinter.printRecord(values);
	}
}