package util;

import static util.Util.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;

import data.*;
import data.Enums.WordLevelType;
import gui.ValidationUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.lang3.tuple.Pair;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 * Export helpers that serialize word-frequency statistics to JSON and CSV files.
 *
 * <p>All CSV output is written as UTF-8, semicolon-delimited, with {@code "\n"}
 * record separators. Header/metadata rows are emitted before the column header
 * via {@link #printHeaderInfo(CSVPrinter, LinkedHashMap)}.
 *
 * <p>{@code @SuppressWarnings("unchecked")} is required because json-simple's
 * {@code JSONArray}/{@code JSONObject} extend raw collections.
 */
@SuppressWarnings("unchecked")
public class Export {

    /**
     * Writes every (title, word→frequency) pair in {@code set} to a single file
     * named {@code statistics.json} in the working directory (overwriting any
     * existing file). Each pair becomes one object with a "Title" and a "data"
     * array of {word, frequency, percent} entries; empty maps are skipped.
     *
     * @param set result pairs: left = result title, right = frequency map
     */
    public static void SetToJSON(Set<Pair<String, Map<MultipleHMKeys, Long>>> set) {
        JSONArray wrapper = new JSONArray();

        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
            JSONArray data_wrapper = new JSONArray();
            JSONObject metric = new JSONObject();

            String title = p.getLeft();
            Map<MultipleHMKeys, Long> map = p.getRight();

            if (map.isEmpty())
                continue;

            // Denominator for the "percent" field: sum of all frequencies in this map.
            long total = Util.mapSumFrequencies(map);

            for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
                JSONObject data_entry = new JSONObject();
                data_entry.put("word", e.getKey());
                data_entry.put("frequency", e.getValue());
                data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total));

                data_wrapper.add(data_entry);
            }

            metric.put("Title", title);
            metric.put("data", data_wrapper);
            wrapper.add(metric);
        }

        try (FileWriter file = new FileWriter("statistics.json")) {
            file.write(wrapper.toJSONString());
        } catch (IOException e) {
            // Best-effort export: failure is reported but not propagated.
            e.printStackTrace();
        }
    }

    /**
     * Writes the full statistics result set to one CSV file per (non-empty)
     * pair in {@code set} and returns the path of the last file written.
     *
     * <p>The column header is assembled dynamically from {@code filter}
     * (skipped-words column, calculate-for column, optional lowercase lemma,
     * optional prefix/stem/suffix columns, additional keys, absolute/relative
     * frequencies, collocability measures, per-taxonomy columns, and optional
     * per-character MSD columns at the end).
     *
     * @param set             result pairs: left = title (recomputed below), right = frequency map
     * @param resultsPath     directory the CSV file(s) are written into
     * @param headerInfoBlock metadata key/value rows printed before the header;
     *                        this method also adds an occurrences entry to it
     * @param statistics      source of taxonomy results, titles and totals
     * @param filter          controls which columns are produced
     * @return path of the last CSV file written ("" if no pair produced a file)
     */
    public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath,
                                  LinkedHashMap<String, String> headerInfoBlock,
                                  StatisticsNew statistics, Filter filter) {
        Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResults = statistics.getTaxonomyResult();

        //Delimiter used in CSV file
        String NEW_LINE_SEPARATOR = "\n";

        List<String> FILE_HEADER_AL = new ArrayList<>();
        Object[] FILE_HEADER;

        //Count frequencies
        long num_frequencies = 0;
        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
            Map<MultipleHMKeys, Long> map = p.getRight();
            if (map.isEmpty())
                continue;
            // NOTE(review): plain assignment (not "+="), so with more than one
            // non-empty pair only the LAST map's total is kept. Preserved as-is;
            // confirm "set" is effectively single-element before changing.
            num_frequencies = Util.mapSumFrequencies(map);
        }

        // Total frequency per taxonomy key, summed over that taxonomy's entries.
        Map<String, Long> num_taxonomy_frequencies = new ConcurrentHashMap<>();
        for (String taxonomyKey : taxonomyResults.keySet()) {
            num_taxonomy_frequencies.put(taxonomyKey, (long) 0);
            for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()) {
                long val = num_taxonomy_frequencies.get(taxonomyKey);
                val += value.get();
                num_taxonomy_frequencies.put(taxonomyKey, val);
            }
        }

        //CSV file header
        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
            FILE_HEADER_AL.add("Izpuščene besede");
        }

        FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
        if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
            FILE_HEADER_AL.add("Lema male črke");

        if (filter.getSuffixLength() != null && filter.getSuffixList() != null
                && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
            if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) {
                FILE_HEADER_AL.add("Predpona");
            }
            FILE_HEADER_AL.add("Preostali del besede");
            if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) {
                FILE_HEADER_AL.add("Pripona");
            }
        }

        // Metadata row: total unigram occurrences (denominator used for the
        // per-taxonomy share/relative-frequency columns further down).
        headerInfoBlock.put(filter.getCalculateFor().toMetadataString(),
                String.valueOf(statistics.getUniGramOccurrences()));

        for (CalculateFor otherKey : filter.getMultipleKeys()) {
            FILE_HEADER_AL.add(otherKey.toHeaderString());
            if (otherKey.equals(CalculateFor.LEMMA))
                FILE_HEADER_AL.add("Lema male črke");
        }

        FILE_HEADER_AL.add("Skupna absolutna pogostost");
        FILE_HEADER_AL.add(filter.getCalculateFor().toPercentString());
        FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)");

        if (filter.getCollocability().size() > 0) {
            for (Collocability c : filter.getCollocability()) {
                FILE_HEADER_AL.add(c.toHeaderString());
            }
        }

        // Three columns per taxonomy key; "Total" and zero-frequency keys are skipped.
        for (String key : taxonomyResults.keySet()) {
            if (!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
                FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]");
                FILE_HEADER_AL.add("Delež [" + key + "]");
                FILE_HEADER_AL.add("Relativna pogostost [" + key + "]");
            }
        }

        if (filter.getWriteMsdAtTheEnd()) {
            // One "msdNN" column per character of the longest MSD string in the result.
            String msd = "";
            int maxMsdLength = 0;
            for (MultipleHMKeys key : set.iterator().next().getRight().keySet()) {
                msd = key.getMsd(filter);
                if (msd.length() > maxMsdLength) {
                    maxMsdLength = msd.length();
                }
            }
            for (int i = 0; i < maxMsdLength; i++) {
                FILE_HEADER_AL.add("msd" + String.format("%02d", i + 1));
            }
        }

        FILE_HEADER = new String[FILE_HEADER_AL.size()];
        FILE_HEADER_AL.toArray(FILE_HEADER);

        String fileName = "";

        for (Pair<String, Map<MultipleHMKeys, Long>> p : set) {
            // The pair's own title is intentionally replaced by a freshly
            // generated result title.
            String title = p.getLeft();
            title = statistics.generateResultTitle();

            fileName = title.replace(": ", "-");
            fileName = fileName.replace(" ", "_").concat(".csv");
            fileName = resultsPath.toString().concat(File.separator).concat(fileName);

            Map<MultipleHMKeys, Long> map = p.getRight();

            if (map.isEmpty())
                continue;

            OutputStreamWriter fileWriter = null;
            CSVPrinter csvFilePrinter = null;

            //Create the CSVFormat object with "\n" as a record delimiter it puts all words in braces
            CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR)
                    .withDelimiter(';').withQuoteMode(QuoteMode.ALL);

            try {
                //initialize FileWriter object
                fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);

                //initialize CSVPrinter object
                csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat);

                // write info block
                printHeaderInfo(csvFilePrinter, headerInfoBlock);

                //Create CSV file header
                csvFilePrinter.printRecord(FILE_HEADER);

                for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
                    List<Object> dataEntry = new ArrayList<>();

                    // Skip-gram exports get the raw key (stars kept) as the first column.
                    if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
                        dataEntry.add(e.getKey().getK1());
                    }
                    dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
                    if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
                        dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
                    }

                    // NOTE(review): "||" here vs "&&" in the header block above —
                    // if only some of these four are non-null the getters below
                    // may NPE (caught by the broad catch). Preserved as-is.
                    if (filter.getSuffixLength() != null || filter.getSuffixList() != null
                            || filter.getPrefixLength() != null || filter.getPrefixList() != null) {
                        if (filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) {
                            // Split by fixed prefix/suffix lengths.
                            if (filter.getPrefixLength() > 0) {
                                dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength()));
                            }
                            dataEntry.add(((String) dataEntry.get(0)).substring(filter.getPrefixLength(),
                                    ((String) dataEntry.get(0)).length() - filter.getSuffixLength()));
                            if (filter.getSuffixLength() > 0) {
                                dataEntry.add(((String) dataEntry.get(0))
                                        .substring(((String) dataEntry.get(0)).length() - filter.getSuffixLength()));
                            }
                        } else {
                            // Split by the first matching prefix/suffix from the filter lists.
                            String key = (String) dataEntry.get(0);

                            // real prefix
                            String rpf = "";
                            for (String pf : filter.getPrefixList()) {
                                if (pf.equals(key.substring(0, pf.length()))) {
                                    rpf = pf;
                                    break;
                                }
                            }

                            // real suffix
                            String rsf = "";
                            for (String sf : filter.getSuffixList()) {
                                if (sf.equals(key.substring(key.length() - sf.length()))) {
                                    rsf = sf;
                                    break;
                                }
                            }

                            if (filter.getPrefixList().size() > 0) {
                                dataEntry.add(rpf);
                            }
                            dataEntry.add(key.substring(rpf.length(), key.length() - rsf.length()));
                            if (filter.getSuffixList().size() > 0) {
                                dataEntry.add(rsf);
                            }
                        }
                    }

                    // Additional key columns: i-th multiple key maps to K2..K5.
                    int i = 0;
                    for (CalculateFor otherKey : filter.getMultipleKeys()) {
                        switch (i) {
                            case 0:
                                if (otherKey.equals(CalculateFor.LEMMA)) {
                                    dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
                                    dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
                                } else {
                                    dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
                                }
                                break;
                            case 1:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
                                break;
                            case 2:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
                                break;
                            case 3:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
                                break;
                        }
                        i++;
                    }

                    dataEntry.add(e.getValue().toString());
                    dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies));
                    dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000) / num_frequencies));

                    for (String key : taxonomyResults.keySet()) {
                        if (!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) {
                            // NOTE(review): may be null if this entry never occurred
                            // under this taxonomy; the resulting NPE is swallowed by
                            // the broad catch below. Preserved as-is.
                            AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
                            dataEntry.add(frequency.toString());
                            // Share/relative frequency use the corpus-wide unigram
                            // count as denominator (not the per-taxonomy total).
                            dataEntry.add(formatNumberAsPercent(
                                    (double) frequency.get() / statistics.getUniGramOccurrences()));
                            dataEntry.add(String.format("%.2f",
                                    ((double) frequency.get() * 1000000) / statistics.getUniGramOccurrences()));
                        }
                    }

                    if (filter.getCollocability().size() > 0) {
                        for (Collocability c : filter.getCollocability()) {
                            dataEntry.add(statistics.getCollocability().get(c).get(e.getKey()));
                        }
                    }

                    // Write msd separated per letters at the end of each line in csv
                    if (filter.getWriteMsdAtTheEnd()) {
                        String msd = e.getKey().getMsd(filter);
                        // "(?!^)" splits between every character without a leading
                        // empty element.
                        String[] charArray = msd.split("(?!^)");
                        dataEntry.addAll(Arrays.asList(charArray));
                    }

                    csvFilePrinter.printRecord(dataEntry);
                }
            } catch (Exception e) {
                System.out.println("Error in CsvFileWriter!");
                e.printStackTrace();
            } finally {
                try {
                    if (fileWriter != null) {
                        fileWriter.flush();
                        fileWriter.close();
                    }
                    if (csvFilePrinter != null) {
                        csvFilePrinter.close();
                    }
                } catch (IOException e) {
                    System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
                    e.printStackTrace();
                }
            }
        }
        return fileName;
    }

    /**
     * Removes the skip-gram placeholder markers ({@code "* "}) from {@code s}
     * when the filter defines a positive skip value; otherwise returns the
     * string unchanged.
     */
    private static String eraseSkipgramStars(String s, Filter filter) {
        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
            s = s.replace("* ", "");
        }
        return s;
    }

    /**
     * Writes a simple word/frequency/percent table to a CSV file derived from
     * {@code title} inside {@code resultsPath}, and returns the file's path.
     *
     * @param title           used to derive the file name (": " → "-", spaces → "_")
     * @param result          rows of {word, frequency, percent-ratio}
     * @param resultsPath     output directory
     * @param headerInfoBlock metadata rows printed before the column header
     * @return path of the file written
     */
    public static String SetToCSV(String title, Object[][] result, File resultsPath,
                                  LinkedHashMap<String, String> headerInfoBlock) {
        //Delimiter used in CSV file
        String NEW_LINE_SEPARATOR = "\n";

        //CSV file header
        Object[] FILE_HEADER = {"word", "frequency", "percent"};

        String fileName = "";

        fileName = title.replace(": ", "-");
        fileName = fileName.replace(" ", "_").concat(".csv");
        fileName = resultsPath.toString().concat(File.separator).concat(fileName);

        OutputStreamWriter fileWriter = null;
        CSVPrinter csvFilePrinter = null;

        //Create the CSVFormat object with "\n" as a record delimiter
        CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

        try {
            //initialize FileWriter object
            fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);

            //initialize CSVPrinter object
            csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat);

            // write info block
            printHeaderInfo(csvFilePrinter, headerInfoBlock);

            //Create CSV file header
            csvFilePrinter.printRecord(FILE_HEADER);

            for (Object[] resultEntry : result) {
                List<Object> dataEntry = new ArrayList<>();
                dataEntry.add(resultEntry[0]);
                dataEntry.add(resultEntry[1]);
                dataEntry.add(formatNumberAsPercent(resultEntry[2]));
                csvFilePrinter.printRecord(dataEntry);
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        } finally {
            try {
                if (fileWriter != null) {
                    fileWriter.flush();
                    fileWriter.close();
                }
                if (csvFilePrinter != null) {
                    csvFilePrinter.close();
                }
            } catch (IOException e) {
                System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
                e.printStackTrace();
            }
        }
        return fileName;
    }

    /**
     * Flattens a type → key → word → frequency nested map into one CSV row per
     * innermost entry and writes it to a file derived from {@code title}.
     *
     * @param title           used to derive the file name (": " → "-", spaces → "_")
     * @param result          nested results keyed by word-level type
     * @param resultsPath     output directory
     * @param headerInfoBlock metadata rows printed before the column header
     * @return path of the file written
     */
    public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result,
                                        File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
        //Delimiter used in CSV file
        String NEW_LINE_SEPARATOR = "\n";

        //CSV file header
        Object[] FILE_HEADER = {"type", "key", "word", "frequency"};

        String fileName = "";

        fileName = title.replace(": ", "-");
        fileName = fileName.replace(" ", "_").concat(".csv");
        fileName = resultsPath.toString().concat(File.separator).concat(fileName);

        OutputStreamWriter fileWriter = null;
        CSVPrinter csvFilePrinter = null;

        //Create the CSVFormat object with "\n" as a record delimiter
        CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

        try {
            //initialize FileWriter object
            fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);

            //initialize CSVPrinter object
            csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat);

            // write info block
            printHeaderInfo(csvFilePrinter, headerInfoBlock);

            //Create CSV file header
            csvFilePrinter.printRecord(FILE_HEADER);

            for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
                for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
                    for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
                        List<Object> values = new ArrayList<>();
                        values.add(typeEntry.getKey().getName());
                        values.add(keyWordEntry.getKey());
                        values.add(calculationResults.getKey());
                        values.add(calculationResults.getValue());
                        csvFilePrinter.printRecord(values);
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        } finally {
            try {
                if (fileWriter != null) {
                    fileWriter.flush();
                    fileWriter.close();
                }
                if (csvFilePrinter != null) {
                    csvFilePrinter.close();
                }
            } catch (IOException e) {
                System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
                e.printStackTrace();
            }
        }
        return fileName;
    }

    /**
     * Prints one key;value record per entry of {@code headerInfoBlock}
     * (insertion order preserved), followed by two empty records that separate
     * the info block from the column header.
     *
     * @throws IOException if the underlying printer fails
     */
    private static void printHeaderInfo(CSVPrinter csvFilePrinter,
                                        LinkedHashMap<String, String> headerInfoBlock) throws IOException {
        for (Map.Entry<String, String> entry : headerInfoBlock.entrySet()) {
            List<Object> values = new ArrayList<>();
            values.add(entry.getKey());
            values.add(entry.getValue());
            csvFilePrinter.printRecord(values);
        }

        // 2 empty lines
        List<Object> values = new ArrayList<>();
        csvFilePrinter.printRecord(values);
        csvFilePrinter.printRecord(values);
    }
}