package util; import static util.Util.*; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicLong; import data.*; import gui.I18N; import gui.ValidationUtil; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; import org.apache.commons.csv.QuoteMode; import org.apache.commons.lang3.tuple.Pair; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import data.Enums.WordLevelType; @SuppressWarnings("unchecked") public class Export { // public static void SetToJSON(Set>> set) { // JSONArray wrapper = new JSONArray(); // // for (Pair> p : set) { // JSONArray data_wrapper = new JSONArray(); // JSONObject metric = new JSONObject(); // // String title = p.getLeft(); // Map map = p.getRight(); // // if (map.isEmpty()) // continue; // // long total = Util.mapSumFrequencies(map); // // for (Map.Entry e : map.entrySet()) { // JSONObject data_entry = new JSONObject(); // data_entry.put("word", e.getKey()); // data_entry.put("frequency", e.getValue()); // data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total)); // // data_wrapper.add(data_entry); // } // // metric.put("Title", title); // metric.put("data", data_wrapper); // wrapper.add(metric); // } // // try (FileWriter file = new FileWriter("statistics.json")) { // file.write(wrapper.toJSONString()); // } catch (IOException e) { // e.printStackTrace(); // } // } public static String SetToCSV(Set>> set, File resultsPath, LinkedHashMap headerInfoBlock, StatisticsNew statistics, Filter filter) { Map> taxonomyResults = statistics.getTaxonomyResult(); //Delimiter used in CSV file String NEW_LINE_SEPARATOR = "\n"; List FILE_HEADER_AL = new ArrayList<>(); Object[] FILE_HEADER; //Count frequencies // long num_frequencies = 0; // for (Pair> p : set) { // Map map = p.getRight(); // if (map.isEmpty()) // continue; // num_frequencies = Util.mapSumFrequencies(map); // } Map num_selected_taxonomy_frequencies = new ConcurrentHashMap<>(); for (Taxonomy taxonomyKey : taxonomyResults.keySet()) { num_selected_taxonomy_frequencies.put(taxonomyKey, (long) 0); for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){ long val = num_selected_taxonomy_frequencies.get(taxonomyKey); val += value.get(); num_selected_taxonomy_frequencies.put(taxonomyKey, val); } } Map num_taxonomy_frequencies = statistics.getUniGramOccurrences(); //CSV file header if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { FILE_HEADER_AL.add(I18N.get("exportTable.skippedWords")); } FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString(filter.getNgramValue())); if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) { if(filter.getNgramValue() == 0) { FILE_HEADER_AL.add(I18N.get("exportTable.lettersSmall")); } else if(filter.getNgramValue() >= 1) { FILE_HEADER_AL.add(I18N.get("exportTable.wordsSmall")); } } if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) { if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) { FILE_HEADER_AL.add(I18N.get("exportTable.wordBeginning")); } FILE_HEADER_AL.add(I18N.get("exportTable.wordRest")); if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) { FILE_HEADER_AL.add(I18N.get("exportTable.wordEnding")); } } headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue())); headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), String.valueOf(num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue())); // headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); for (CalculateFor otherKey : filter.getMultipleKeys()) { FILE_HEADER_AL.add(otherKey.toHeaderString(filter.getNgramValue())); if (otherKey.equals(CalculateFor.LEMMA)) FILE_HEADER_AL.add(I18N.get("exportTable.wordsSmall")); } FILE_HEADER_AL.add(filter.getCalculateFor().totalAbsoluteFrequencyString(filter.getNgramValue())); FILE_HEADER_AL.add(filter.getCalculateFor().shareOfTotalString(filter.getNgramValue())); FILE_HEADER_AL.add(I18N.get("exportTable.totalRelativeFrequency")); for (Taxonomy key : taxonomyResults.keySet()) { if(!key.equals(statistics.getCorpus().getTotal()) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { FILE_HEADER_AL.add(I18N.get("exportTable.absoluteFrequency") + " [" + key.toString() + "]"); FILE_HEADER_AL.add(I18N.get("exportTable.percentage") + " [" + key.toString() + "]"); FILE_HEADER_AL.add(I18N.get("exportTable.relativeFrequency") + " [" + key.toString() + "]"); } } if (filter.getCollocability().size() > 0){ for (Collocability c : filter.getCollocability()) { FILE_HEADER_AL.add(c.toHeaderString()); } } if (filter.getWriteMsdAtTheEnd()) { String msd = ""; int maxMsdLength = 0; for(MultipleHMKeys key : set.iterator().next().getRight().keySet()){ msd = key.getMsd(filter); if (msd.length() > maxMsdLength){ maxMsdLength = msd.length(); } } for(int i = 0; i < maxMsdLength; i++){ FILE_HEADER_AL.add(I18N.get("exportTable.msd") + String.format("%02d", i + 1)); } } FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; FILE_HEADER_AL.toArray(FILE_HEADER); String fileName = ""; for (Pair> p : set) { String title = p.getLeft(); // statistics.setTimeEnding(); title = statistics.generateResultTitle(); // statistics. fileName = title.replace(": ", "-"); fileName = fileName.replace(" ", "_").concat(".csv"); fileName = resultsPath.toString().concat(File.separator).concat(fileName); Map map = p.getRight(); if (map.isEmpty()) continue; // long total = Util.mapSumFrequencies(map); OutputStreamWriter fileWriter = null; CSVPrinter csvFilePrinter = null; //Create the CSVFormat object with "\n" as a record delimiter it puts all words in braces CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';').withQuoteMode(QuoteMode.ALL); try { //initialize FileWriter object fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); //initialize CSVPrinter object csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); // write info block printHeaderInfo(csvFilePrinter, headerInfoBlock); //Create CSV file header csvFilePrinter.printRecord(FILE_HEADER); for (Map.Entry e : map.entrySet()) { List dataEntry = new ArrayList<>(); if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { dataEntry.add(e.getKey().getK1()); } dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter)); if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){ dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter)); } if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { if(filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) { if (filter.getPrefixLength() > 0) { dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength())); } dataEntry.add(((String) dataEntry.get(0)).substring(filter.getPrefixLength(), ((String) dataEntry.get(0)).length() - filter.getSuffixLength())); if (filter.getSuffixLength() > 0) { dataEntry.add(((String) dataEntry.get(0)).substring(((String) dataEntry.get(0)).length() - filter.getSuffixLength())); } } else { String key = (String) dataEntry.get(0); // real prefix String rpf = ""; for(String pf : filter.getPrefixList()){ if (key.length() < pf.length()) { continue; } if (pf.equals(key.substring(0, pf.length()))){ rpf = pf; break; } } // real suffix String rsf = ""; for(String sf : filter.getSuffixList()){ if (key.length() < sf.length()) { continue; } if (sf.equals(key.substring(key.length() - sf.length()))){ rsf = sf; break; } } if (filter.getPrefixList().size() > 0) { dataEntry.add(rpf); } dataEntry.add(key.substring(rpf.length(), key.length() - rsf.length())); if (filter.getSuffixList().size() > 0) { dataEntry.add(rsf); } } } int i = 0; for (CalculateFor otherKey : filter.getMultipleKeys()){ switch(i){ case 0: if (otherKey.equals(CalculateFor.LEMMA)){ dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter)); } else { dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); } break; case 1: dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter)); break; case 2: dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter)); break; case 3: dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter)); break; } i++; } dataEntry.add(e.getValue().toString()); dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()), statistics.getCorpus().getPunctuation())); dataEntry.add(formatNumberForExport(((double) e.getValue() * 1000000)/num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue(), statistics.getCorpus().getPunctuation())); for (Taxonomy key : taxonomyResults.keySet()){ if(!key.equals(statistics.getCorpus().getTotal()) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); dataEntry.add(frequency.toString()); dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_selected_taxonomy_frequencies.get(key), statistics.getCorpus().getPunctuation())); dataEntry.add(formatNumberForExport(((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key).longValue(), statistics.getCorpus().getPunctuation())); // dataEntry.add(formatNumberAsPercent((double) frequency.get() / statistics.getUniGramOccurrences())); // dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / statistics.getUniGramOccurrences())); } } if (filter.getCollocability().size() > 0){ for (Collocability c : filter.getCollocability()) { dataEntry.add(formatNumberForLongExport(statistics.getCollocability().get(c).get(e.getKey()), statistics.getCorpus().getPunctuation())); } } // Write msd separated per letters at the end of each line in csv if (filter.getWriteMsdAtTheEnd()) { // String msd = ""; // // if (filter.getCalculateFor().equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ // msd = e.getKey().getK1(); // } else if (filter.getMultipleKeys().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) { // i = 0; // for (CalculateFor otherKey : filter.getMultipleKeys()){ // switch(i){ // case 0: // if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ // msd = e.getKey().getK2(); // } // break; // case 1: // if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ // msd = e.getKey().getK3(); // } // break; // case 2: // if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ // msd = e.getKey().getK4(); // } // break; // case 3: // if (otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ // msd = e.getKey().getK5(); // } // break; // } // // i++; // } // } String msd = e.getKey().getMsd(filter); String [] charArray = msd.split("(?!^)"); dataEntry.addAll(Arrays.asList(charArray)); } csvFilePrinter.printRecord(dataEntry); } } catch (Exception e) { System.out.println("Error in CsvFileWriter!"); e.printStackTrace(); } finally { try { if (fileWriter != null) { fileWriter.flush(); fileWriter.close(); } if (csvFilePrinter != null) { csvFilePrinter.close(); } } catch (IOException e) { System.out.println("Error while flushing/closing fileWriter/csvPrinter!"); e.printStackTrace(); } } } return fileName; } private static String eraseSkipgramStars(String s, Filter filter){ if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { s = s.replace("* ", ""); } return s; } // public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap headerInfoBlock) { // //Delimiter used in CSV file // String NEW_LINE_SEPARATOR = "\n"; // // //CSV file header // Object[] FILE_HEADER = {"word", "frequency", "percent"}; // // String fileName = ""; // // fileName = title.replace(": ", "-"); // fileName = fileName.replace(" ", "_").concat(".csv"); // // fileName = resultsPath.toString().concat(File.separator).concat(fileName); // // OutputStreamWriter fileWriter = null; // CSVPrinter csvFilePrinter = null; // // //Create the CSVFormat object with "\n" as a record delimiter // CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';'); // // try { // //initialize FileWriter object // fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); // // //initialize CSVPrinter object // csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); // // // write info block // printHeaderInfo(csvFilePrinter, headerInfoBlock); // // //Create CSV file header // csvFilePrinter.printRecord(FILE_HEADER); // // for (Object[] resultEntry : result) { // List dataEntry = new ArrayList<>(); // dataEntry.add(resultEntry[0]); // dataEntry.add(resultEntry[1]); // dataEntry.add(formatNumberAsPercent(resultEntry[2]), statistics.getCorpus().getPunctuation()); // csvFilePrinter.printRecord(dataEntry); // } // } catch (Exception e) { // System.out.println("Error in CsvFileWriter!"); // e.printStackTrace(); // } finally { // try { // if (fileWriter != null) { // fileWriter.flush(); // fileWriter.close(); // } // if (csvFilePrinter != null) { // csvFilePrinter.close(); // } // } catch (IOException e) { // System.out.println("Error while flushing/closing fileWriter/csvPrinter!"); // e.printStackTrace(); // } // } // // return fileName; // } public static String nestedMapToCSV(String title, Map>> result, File resultsPath, LinkedHashMap headerInfoBlock) { //Delimiter used in CSV file String NEW_LINE_SEPARATOR = "\n"; //CSV file header Object[] FILE_HEADER = {"type", "key", "word", "frequency"}; String fileName = ""; fileName = title.replace(": ", "-"); fileName = fileName.replace(" ", "_").concat(".csv"); fileName = resultsPath.toString().concat(File.separator).concat(fileName); OutputStreamWriter fileWriter = null; CSVPrinter csvFilePrinter = null; //Create the CSVFormat object with "\n" as a record delimiter CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';'); try { //initialize FileWriter object fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); //initialize CSVPrinter object csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); // write info block printHeaderInfo(csvFilePrinter, headerInfoBlock); //Create CSV file header csvFilePrinter.printRecord(FILE_HEADER); for (Map.Entry>> typeEntry : result.entrySet()) { for (Map.Entry> keyWordEntry : typeEntry.getValue().entrySet()) { for (Map.Entry calculationResults : keyWordEntry.getValue().entrySet()) { List values = new ArrayList(); values.add(typeEntry.getKey().getName()); values.add(keyWordEntry.getKey()); values.add(calculationResults.getKey()); values.add(calculationResults.getValue()); csvFilePrinter.printRecord(values); } } } } catch (Exception e) { System.out.println("Error in CsvFileWriter!"); e.printStackTrace(); } finally { try { if (fileWriter != null) { fileWriter.flush(); fileWriter.close(); } if (csvFilePrinter != null) { csvFilePrinter.close(); } } catch (IOException e) { System.out.println("Error while flushing/closing fileWriter/csvPrinter!"); e.printStackTrace(); } } return fileName; } private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap headerInfoBlock) throws IOException { for (Map.Entry entry : headerInfoBlock.entrySet()) { List values = new ArrayList(); values.add(entry.getKey()); values.add(entry.getValue()); csvFilePrinter.printRecord(values); } // 2 empty lines List values = new ArrayList(); csvFilePrinter.printRecord(values); csvFilePrinter.printRecord(values); } }