diff --git a/.gitignore b/.gitignore index f794c26..16016a7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Created by .ignore support plugin (hsz.mobi) ### Maven template target/ +corpus_analyzer_jar/ pom.xml.tag pom.xml.releaseBackup pom.xml.versionsBackup diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index 429b5a6..6bcfc3c 100644 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -46,7 +46,7 @@ public class Ngrams { } // UPDATE TAXONOMY HERE!!! - stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate); + stats.updateTaxonomyResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()), ngramCandidate.get(0).getTaxonomy()); stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); } } @@ -114,6 +114,7 @@ public class Ngrams { private static void generateNgramLetterCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { for (Word w : s.getWords()) { + List taxonomy = w.getTaxonomy(); String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv()); // skip this iteration if: @@ -128,6 +129,10 @@ public class Ngrams { for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) { // TODO: locila? + stats.updateTaxonomyResults(word.substring(i, i + stats.getFilter().getStringLength()), taxonomy); +// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); + + stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength())); } } diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java new file mode 100644 index 0000000..61e06a0 --- /dev/null +++ b/src/main/java/data/MultipleHMKeys.java @@ -0,0 +1,92 @@
+package data;
+
+import java.util.Objects;
+
+/**
+ * Immutable composite key made of up to three strings, for maps whose entries
+ * are keyed by several word attributes at once (e.g. lemma and msd together)
+ * rather than by a single one.
+ *
+ * Components that are not supplied are stored as {@code null}.
+ * {@link #equals(Object)} and {@link #hashCode()} are null-safe, so instances
+ * created by any of the constructors can be used as hash map keys.
+ */
+final class MultipleHMKeys {
+    private final String key1, key2, key3;
+
+    /**
+     * Creates a key with one component; {@code key2} and {@code key3} are {@code null}.
+     *
+     * @param key1 first component
+     */
+    public MultipleHMKeys(String key1) {
+        this(key1, null, null);
+    }
+
+    /**
+     * Creates a key with two components; {@code key3} is {@code null}.
+     *
+     * @param key1 first component
+     * @param key2 second component
+     */
+    public MultipleHMKeys(String key1, String key2) {
+        this(key1, key2, null);
+    }
+
+    /**
+     * Creates a key with three components.
+     *
+     * @param key1 first component
+     * @param key2 second component
+     * @param key3 third component
+     */
+    public MultipleHMKeys(String key1, String key2, String key3) {
+        this.key1 = key1;
+        this.key2 = key2;
+        this.key3 = key3;
+    }
+
+    /** @return the first component */
+    public String getKey1() {
+        return key1;
+    }
+
+    /** @return the second component, or {@code null} if none was supplied */
+    public String getKey2() {
+        return key2;
+    }
+
+    /** @return the third component, or {@code null} if none was supplied */
+    public String getKey3() {
+        return key3;
+    }
+
+    /**
+     * Combines all three components into an order-sensitive hash.
+     * {@code null} components are allowed; calling {@code hashCode()} on each
+     * component directly would throw for keys built with the one- or
+     * two-argument constructors.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(key1, key2, key3);
+    }
+
+    /**
+     * Two keys are equal when all three components are pairwise equal;
+     * a {@code null} component only matches another {@code null}.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (!(obj instanceof MultipleHMKeys)) {
+            return false;
+        }
+        MultipleHMKeys other = (MultipleHMKeys) obj;
+        return Objects.equals(key1, other.key1)
+                && Objects.equals(key2, other.key2)
+                && Objects.equals(key3, other.key3);
+    }
+}
diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index 6c27265..4da3226 100644 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -45,6 +45,7 @@ public class StatisticsNew { this.corpus = corpus; this.filter = filter; this.taxonomyResult = new ConcurrentHashMap<>(); + this.taxonomyResult.put("Total", new ConcurrentHashMap<>()); // create table for counting word occurances per taxonomies @@ -97,13 +98,18 @@ public class StatisticsNew { if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { Integer ngramLevel = filter.getNgramValue(); if(ngramLevel == 0) { - sb.append("Crke"). 
- append(separator) - .append(corpus.getCorpusType().toString()) + sb.append(corpus.getCorpusType().toString()) + .append(separator) + .append("crke") + .append(separator) + .append(filter.getCalculateFor()) .append(separator); } else if(ngramLevel == 1) { - sb.append("Besede").append(separator) - .append(corpus.getCorpusType().toString()) + sb.append(corpus.getCorpusType().toString()) + .append(separator) + .append("besede") + .append(separator) + .append(filter.getCalculateFor()) .append(separator); } else { @@ -196,14 +202,14 @@ public class StatisticsNew { } // if no results and nothing to save, return false - if (!(result.size() > 0)) { + if (!(taxonomyResult.get("Total").size() > 0)) { analysisProducedResults = false; return false; } else { analysisProducedResults = true; } - stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit)))); + stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit)))); Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult); return true; } @@ -275,10 +281,10 @@ public class StatisticsNew { return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit); } - public void updateTaxonomyResults(String o, List ngramCandidate) { + public void updateTaxonomyResults(String o, List taxonomy) { for (String key : taxonomyResult.keySet()) { // first word should have the same taxonomy as others - if (ngramCandidate.get(0).getTaxonomy().contains(key)) { + if (taxonomy.contains(key) || key.equals("Total")) { // if taxonomy not in map and in this word AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1)); diff --git a/src/main/java/data/Word.java b/src/main/java/data/Word.java index 5cff321..bd6b3ee 100644 --- a/src/main/java/data/Word.java +++ b/src/main/java/data/Word.java @@ -55,7 +55,8 @@ public class Word implements Serializable { //private char besedna_vrsta; public Word(String word, String lemma, 
String msd, List taxonomy) { this.lemma = lemma; - this.msd = normalizeMsd(msd); +// this.msd = normalizeMsd(msd); + this.msd = msd; this.taxonomy = taxonomy; // veliko zacetnico ohranimo samo za lastna imena diff --git a/src/main/java/META-INF/MANIFEST.MF b/src/main/java/man/META-INF/MANIFEST.MF similarity index 94% rename from src/main/java/META-INF/MANIFEST.MF rename to src/main/java/man/META-INF/MANIFEST.MF index d2c5d1f..95df4c8 100644 --- a/src/main/java/META-INF/MANIFEST.MF +++ b/src/main/java/man/META-INF/MANIFEST.MF @@ -1,3 +1,3 @@ -Manifest-Version: 1.0 -Main-Class: gui.GUIController - +Manifest-Version: 1.0 +Main-Class: gui.GUIController + diff --git a/src/main/java/manifest/META-INF/MANIFEST.MF b/src/main/java/manifest/META-INF/MANIFEST.MF deleted file mode 100644 index d2c5d1f..0000000 --- a/src/main/java/manifest/META-INF/MANIFEST.MF +++ /dev/null @@ -1,3 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: gui.GUIController - diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 1627312..856018a 100644 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -108,9 +108,11 @@ public class Export { } FILE_HEADER_AL.add("Skupna relativna pogostost"); for (String key : taxonomyResults.keySet()) { - FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); - FILE_HEADER_AL.add("Delež [" + key + "]"); - FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); + if(!key.equals("Total")) { + FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); + FILE_HEADER_AL.add("Delež [" + key + "]"); + FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); + } } FILE_HEADER = new String[ FILE_HEADER_AL.size() ]; FILE_HEADER_AL.toArray(FILE_HEADER); @@ -160,11 +162,12 @@ public class Export { dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies)); for (String key : taxonomyResults.keySet()){ - AtomicLong frequency = 
taxonomyResults.get(key).get(e.getKey()); - dataEntry.add(frequency.toString()); - dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); - dataEntry.add(String.format("%.2f", ((double) frequency.get() * 10000) / num_taxonomy_frequencies.get(key))); - + if(!key.equals("Total")) { + AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); + dataEntry.add(frequency.toString()); + dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); + dataEntry.add(String.format("%.2f", ((double) frequency.get() * 10000) / num_taxonomy_frequencies.get(key))); + } } csvFilePrinter.printRecord(dataEntry);