package util;

import static util.Util.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.lang3.tuple.Pair;

import data.*;
import data.Enums.WordLevelType;
import gui.I18N;
import gui.ValidationUtil;

@SuppressWarnings("unchecked")
public class Export {
    public static String SetToCSV(Set<Pair<String, Map<MultipleHMKeys, Long>>> set, File resultsPath,
                                  LinkedHashMap<String, String> headerInfoBlock,
                                  StatisticsNew statistics, Filter filter) {
        Map<Taxonomy, Map<MultipleHMKeys, AtomicLong>> taxonomyResults = statistics.getTaxonomyResult();

        // Record separator used in the output file
        String NEW_LINE_SEPARATOR = "\n";

        List<String> FILE_HEADER_AL = new ArrayList<>();
        Object[] FILE_HEADER;

        // Sum the frequencies of the selected results per taxonomy
        Map<Taxonomy, Long> num_selected_taxonomy_frequencies = new ConcurrentHashMap<>();
        for (Taxonomy taxonomyKey : taxonomyResults.keySet()) {
            num_selected_taxonomy_frequencies.put(taxonomyKey, 0L);
            for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()) {
                long val = num_selected_taxonomy_frequencies.get(taxonomyKey);
                val += value.get();
                num_selected_taxonomy_frequencies.put(taxonomyKey, val);
            }
        }

        Map<Taxonomy, AtomicLong> num_taxonomy_frequencies = statistics.getUniGramOccurrences();

        // CSV file header
        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
            FILE_HEADER_AL.add(I18N.get("exportTable.skippedWords"));
        }
        FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString(filter.getNgramValue()));
        if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) {
            if (filter.getNgramValue() == 0) {
                FILE_HEADER_AL.add(I18N.get("exportTable.lettersSmall"));
            } else if (filter.getNgramValue() >= 1) {
                FILE_HEADER_AL.add(I18N.get("exportTable.wordsSmall"));
            }
        }

        if (filter.getSuffixLength() != null && filter.getSuffixList() != null
                && filter.getPrefixLength() != null && filter.getPrefixList() != null) {
            if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) {
                FILE_HEADER_AL.add(I18N.get("exportTable.wordBeginning"));
            }
            FILE_HEADER_AL.add(I18N.get("exportTable.wordRest"));
            if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) {
                FILE_HEADER_AL.add(I18N.get("exportTable.wordEnding"));
            }
        }

        headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()),
                String.valueOf(num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue()));
        headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()),
                String.valueOf(num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue()));

        for (CalculateFor otherKey : filter.getMultipleKeys()) {
            FILE_HEADER_AL.add(otherKey.toHeaderString(filter.getNgramValue()));
            if (otherKey.equals(CalculateFor.LEMMA)) {
                FILE_HEADER_AL.add(I18N.get("exportTable.wordsSmall"));
            }
        }

        FILE_HEADER_AL.add(filter.getCalculateFor().totalAbsoluteFrequencyString(filter.getNgramValue()));
        FILE_HEADER_AL.add(filter.getCalculateFor().shareOfTotalString(filter.getNgramValue()));
        FILE_HEADER_AL.add(I18N.get("exportTable.totalRelativeFrequency"));

        // Per-taxonomy columns: absolute frequency, percentage and relative frequency
        for (Taxonomy key : taxonomyResults.keySet()) {
            if (!key.equals(statistics.getCorpus().getTotal())
                    && num_taxonomy_frequencies.containsKey(key)
                    && num_taxonomy_frequencies.get(key).longValue() > 0) {
                FILE_HEADER_AL.add(I18N.get("exportTable.absoluteFrequency") + " [" + key.toString() + "]");
                FILE_HEADER_AL.add(I18N.get("exportTable.percentage") + " [" + key.toString() + "]");
FILE_HEADER_AL.add(I18N.get("exportTable.relativeFrequency") + " [" + key.toString() + "]"); } } if (filter.getCollocability().size() > 0){ for (Collocability c : filter.getCollocability()) { FILE_HEADER_AL.add(c.toHeaderString()); } } if (filter.getWriteMsdAtTheEnd()) { String msd = ""; int maxMsdLength = 0; for(MultipleHMKeys key : set.iterator().next().getRight().keySet()){ msd = key.getMsd(filter); if (msd.length() > maxMsdLength){ maxMsdLength = msd.length(); } } for(int i = 0; i < maxMsdLength; i++){ FILE_HEADER_AL.add(I18N.get("exportTable.msd") + String.format("%02d", i + 1)); } } FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; FILE_HEADER_AL.toArray(FILE_HEADER); String fileName = ""; for (Pair> p : set) { String title = p.getLeft(); title = statistics.generateResultTitle(); fileName = title.replace(": ", "-"); fileName = fileName.replace(" ", "_").concat(".tsv"); fileName = resultsPath.toString().concat(File.separator).concat(fileName); Map map = p.getRight(); if (map.isEmpty()) continue; OutputStreamWriter fileWriter = null; CSVPrinter csvFilePrinter = null; //Create the CSVFormat object with "\n" as a record delimiter it puts all words in braces CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter('\t').withQuoteMode(QuoteMode.ALL); try { //initialize FileWriter object fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); //initialize CSVPrinter object csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat); // write info block printHeaderInfo(csvFilePrinter, headerInfoBlock); //Create CSV file header csvFilePrinter.printRecord(FILE_HEADER); for (Map.Entry e : map.entrySet()) { List dataEntry = new ArrayList<>(); if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { dataEntry.add(e.getKey().getK1()); } dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter)); if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){ dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter)); } if (filter.getSuffixLength() != null || filter.getSuffixList() != null || filter.getPrefixLength() != null || filter.getPrefixList() != null) { if(filter.getSuffixLength() > 0 || filter.getPrefixLength() > 0) { if (filter.getPrefixLength() > 0) { dataEntry.add(((String) dataEntry.get(0)).substring(0, filter.getPrefixLength())); } dataEntry.add(((String) dataEntry.get(0)).substring(filter.getPrefixLength(), ((String) dataEntry.get(0)).length() - filter.getSuffixLength())); if (filter.getSuffixLength() > 0) { dataEntry.add(((String) dataEntry.get(0)).substring(((String) dataEntry.get(0)).length() - filter.getSuffixLength())); } } else { String key = (String) dataEntry.get(0); // real prefix String rpf = ""; for(String pf : filter.getPrefixList()){ if (key.length() < pf.length()) { continue; } if (pf.equals(key.substring(0, pf.length()))){ rpf = pf; break; } } // real suffix String rsf = ""; for(String sf : filter.getSuffixList()){ if (key.length() < sf.length()) { continue; } if (sf.equals(key.substring(key.length() - sf.length()))){ rsf = sf; break; } } if (filter.getPrefixList().size() > 0) { dataEntry.add(rpf); } dataEntry.add(key.substring(rpf.length(), key.length() - rsf.length())); if (filter.getSuffixList().size() > 0) { dataEntry.add(rsf); } } } int i = 0; for (CalculateFor otherKey : filter.getMultipleKeys()){ switch(i){ case 0: if (otherKey.equals(CalculateFor.LEMMA)){ dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter)); 
                                    dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
                                } else {
                                    dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
                                }
                                break;
                            case 1:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
                                break;
                            case 2:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
                                break;
                            case 3:
                                dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
                                break;
                        }
                        i++;
                    }

                    // absolute frequency, share of total and relative frequency (per million)
                    dataEntry.add(e.getValue().toString());
                    dataEntry.add(formatNumberAsPercent(
                            (double) e.getValue() / num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()),
                            statistics.getCorpus().getPunctuation()));
                    dataEntry.add(formatNumberForExport(
                            ((double) e.getValue() * 1000000) / num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue(),
                            statistics.getCorpus().getPunctuation()));

                    for (Taxonomy key : taxonomyResults.keySet()) {
                        if (!key.equals(statistics.getCorpus().getTotal())
                                && num_taxonomy_frequencies.containsKey(key)
                                && num_taxonomy_frequencies.get(key).longValue() > 0) {
                            AtomicLong frequency = taxonomyResults.get(key).get(e.getKey());
                            dataEntry.add(frequency.toString());
                            dataEntry.add(formatNumberAsPercent(
                                    (double) frequency.get() / num_selected_taxonomy_frequencies.get(key),
                                    statistics.getCorpus().getPunctuation()));
                            dataEntry.add(formatNumberForExport(
                                    ((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key).longValue(),
                                    statistics.getCorpus().getPunctuation()));
                        }
                    }

                    if (filter.getCollocability().size() > 0) {
                        for (Collocability c : filter.getCollocability()) {
                            dataEntry.add(formatNumberForLongExport(
                                    statistics.getCollocability().get(c).get(e.getKey()),
                                    statistics.getCorpus().getPunctuation()));
                        }
                    }

                    // write the MSD one character per column at the end of each line
                    if (filter.getWriteMsdAtTheEnd()) {
                        String msd = e.getKey().getMsd(filter);
                        String[] charArray = msd.split("(?!^)");
                        dataEntry.addAll(Arrays.asList(charArray));
                    }

                    csvFilePrinter.printRecord(dataEntry);
                }
            } catch (Exception e) {
                System.out.println("Error in CsvFileWriter!");
                e.printStackTrace();
            } finally {
                try {
                    if (fileWriter != null) {
                        fileWriter.flush();
                        fileWriter.close();
                    }
                    if (csvFilePrinter != null) {
                        csvFilePrinter.close();
                    }
                } catch (IOException e) {
                    System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
                    e.printStackTrace();
                }
            }
        }
        return fileName;
    }

    private static String eraseSkipgramStars(String s, Filter filter) {
        if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
            s = s.replace("* ", "");
        }
        return s;
    }

    public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result,
                                        File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
        // Record separator used in the output file
        String NEW_LINE_SEPARATOR = "\n";

        // CSV file header
        Object[] FILE_HEADER = {"type", "key", "word", "frequency"};

        String fileName = "";

        fileName = title.replace(": ", "-");
        fileName = fileName.replace(" ", "_").concat(".csv");
        fileName = resultsPath.toString().concat(File.separator).concat(fileName);

        OutputStreamWriter fileWriter = null;
        CSVPrinter csvFilePrinter = null;

        // Create the CSVFormat object with "\n" as the record separator and a tab delimiter
        CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter('\t');

        try {
            // initialize the writer and the CSVPrinter
            fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
            csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat);

            // write info block
            printHeaderInfo(csvFilePrinter, headerInfoBlock);

            // create the file header
            csvFilePrinter.printRecord(FILE_HEADER);

            for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
                for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
                    for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
                        List<Object> values = new ArrayList<>();
                        values.add(typeEntry.getKey().getName());
                        values.add(keyWordEntry.getKey());
                        values.add(calculationResults.getKey());
                        values.add(calculationResults.getValue());
                        csvFilePrinter.printRecord(values);
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        } finally {
            try {
                if (fileWriter != null) {
                    fileWriter.flush();
                    fileWriter.close();
                }
                if (csvFilePrinter != null) {
                    csvFilePrinter.close();
                }
            } catch (IOException e) {
                System.out.println("Error while flushing/closing fileWriter/csvPrinter!");
                e.printStackTrace();
            }
        }
        return fileName;
    }

    private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap<String, String> headerInfoBlock) throws IOException {
        for (Map.Entry<String, String> entry : headerInfoBlock.entrySet()) {
            List<String> values = new ArrayList<>();
            values.add(entry.getKey());
            values.add(entry.getValue());
            csvFilePrinter.printRecord(values);
        }
        // 2 empty lines between the info block and the table
        List<String> values = new ArrayList<>();
        csvFilePrinter.printRecord(values);
        csvFilePrinter.printRecord(values);
    }
}
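
// ---------------------------------------------------------------------------
// Minimal usage sketch (illustration only, not part of the exporter itself).
// It assumes a StatisticsNew and Filter populated by a completed analysis run
// elsewhere in the pipeline; their construction is project-specific and
// elided here. The class and method names below (ExportUsageSketch,
// writeResults) are hypothetical.
// ---------------------------------------------------------------------------
class ExportUsageSketch {
    static void writeResults(StatisticsNew statistics, Filter filter, File resultsDir,
                             Set<Pair<String, Map<MultipleHMKeys, Long>>> results) {
        // Key/value pairs echoed at the top of the file before the table header;
        // SetToCSV also appends the total and found sums to this block.
        LinkedHashMap<String, String> headerInfoBlock = new LinkedHashMap<>();

        // Writes one tab-separated .tsv file per result set and returns the last path written.
        String writtenFile = Export.SetToCSV(results, resultsDir, headerInfoBlock, statistics, filter);
        System.out.println("Results written to: " + writtenFile);
    }
}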