From 9b5fa4616be39023f744c88dffbf6905d51eeaa3 Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 9 Aug 2018 09:21:06 +0200 Subject: [PATCH] Added some performance measures --- .gitignore | 1 + src/main/java/alg/XML_processing.java | 121 +++++++++++---- .../alg/inflectedJOS/InflectedJOSCount.java | 6 +- src/main/java/alg/ngram/Ngrams.java | 96 +++++++++--- src/main/java/alg/word/WordCount.java | 146 +++++++++--------- src/main/java/data/MultipleHMKeys.java | 49 +----- src/main/java/data/MultipleHMKeys1.java | 44 ++++++ src/main/java/data/MultipleHMKeys2.java | 49 ++++++ src/main/java/data/MultipleHMKeys3.java | 48 ++++++ src/main/java/data/MultipleHMKeys4.java | 50 ++++++ src/main/java/data/MultipleHMKeys5.java | 52 +++++++ src/main/java/data/Sentence.java | 24 +-- src/main/java/data/StatisticsNew.java | 6 +- src/main/java/data/Word.java | 55 ++++--- src/main/java/util/Export.java | 71 ++++++--- src/main/java/util/Util.java | 2 +- .../resources/gui/CharacterAnalysisTab.fxml | 16 +- src/main/resources/gui/CorpusTab.fxml | 2 +- .../resources/gui/OneWordAnalysisTab.fxml | 33 ++-- .../resources/gui/StringAnalysisTabNew2.fxml | 114 +++++++------- src/main/resources/gui/WordFormationTab.fxml | 12 +- src/main/resources/gui/WordLevelTab.fxml | 10 +- src/test/java/Common.java | 76 ++++----- src/test/java/NgramTests.java | 30 ++-- 24 files changed, 734 insertions(+), 379 deletions(-) create mode 100755 src/main/java/data/MultipleHMKeys1.java create mode 100755 src/main/java/data/MultipleHMKeys2.java create mode 100755 src/main/java/data/MultipleHMKeys3.java create mode 100755 src/main/java/data/MultipleHMKeys4.java create mode 100755 src/main/java/data/MultipleHMKeys5.java diff --git a/.gitignore b/.gitignore index 16016a7..de9fa21 100755 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Created by .ignore support plugin (hsz.mobi) ### Maven template +src/main/resources/META-INF/ target/ corpus_analyzer_jar/ pom.xml.tag diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 4152982..79b82ff 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -5,6 +5,7 @@ import static data.Enums.solar.SolarFilters.*; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ForkJoinPool; import javax.xml.namespace.QName; @@ -261,7 +262,7 @@ public class XML_processing { if (c3Content.equals(".") && includeThisBlock) { // add sentence to corpus - corpus.add(new Sentence(stavek)); + corpus.add(new Sentence(stavek, null)); // and start a new one stavek = new ArrayList<>(); @@ -293,7 +294,7 @@ public class XML_processing { // "word" node value if (in_word) { - stavek.add(new Word(characters.getData(), lemma, msd, null)); + stavek.add(new Word(characters.getData(), lemma, msd)); in_word = false; } else if(inPunctuation){ String punctuation = ","; @@ -543,7 +544,7 @@ public class XML_processing { // "word" node value if (inWord) { String word = characters.getData(); - sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong)); + sentence.add(new Word(word, lemma, msd)); inWord = false; } if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { @@ -588,7 +589,7 @@ public class XML_processing { sentence = runFilters(sentence, stats.getFilter()); if (!ValidationUtil.isEmpty(sentence)) { - corpus.add(new Sentence(sentence)); + corpus.add(new Sentence(sentence, currentFiletaxonomyLong)); } // and start a new one @@ -655,6 +656,7 @@ public class XML_processing { boolean inPunctuation = false; boolean inOrthDiv = false; boolean computeForOrth = stats.getCorpus().isGosOrthMode(); + boolean inSeparatedWord = false; ArrayList currentFiletaxonomy = new ArrayList<>(); ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; @@ -662,7 +664,10 @@ public class XML_processing { List sentence = new ArrayList<>(); List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it + Map> GOSCorpusHM = new ConcurrentHashMap<>(); + String GOSCorpusHMKey = ""; String sentenceDelimiter = "seg"; + int wordIndex = 0; String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm @@ -674,6 +679,8 @@ public class XML_processing { XMLInputFactory factory = XMLInputFactory.newInstance(); eventReader = factory.createXMLEventReader(new FileInputStream(path)); + // created hashmap to combine words with normalized words + while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", ""))); @@ -711,7 +718,9 @@ public class XML_processing { // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); // } - } + } else if (atts.containsKey("type") && atts.get("type").equals("separated")) { + inSeparatedWord = true; + } // } } @@ -730,49 +739,107 @@ public class XML_processing { } } else if (qName.equalsIgnoreCase("div")) { gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); + } else if (qName.equalsIgnoreCase("seg")) { + HashMap atts = extractAttributes(startElement); + + if (atts.keySet().contains("id")) { + if (inOrthDiv) { + GOSCorpusHMKey = atts.get("id") + ".norm"; + } else { + GOSCorpusHMKey = atts.get("id"); + } + } else { + System.out.println("No attribute \"id\""); + } } break; case XMLStreamConstants.CHARACTERS: // "word" node value if (inWord) { - Characters characters = event.asCharacters(); - if (gosType.equals("norm") && msd != null) { - sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong)); +// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){ +// System.out.println(wordIndex); +// } + // if algorithm is in orthodox part add new word to sentence + if (inOrthDiv){ +// GOSCorpusHM.put(GOSCorpusHMKey, sentence); + String word = ""; + Characters characters = event.asCharacters(); + sentence.add(new Word(characters.getData(), "", "")); + // if algorithm is in normalized part find orthodox word and add other info to it } else { - sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong)); + Characters characters = event.asCharacters(); +// System.out.println(wordIndex); +// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex); + if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) { + Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex); + currentWord.setLemma(lemma); + currentWord.setMsd(msd); + currentWord.setNormalizedWord(characters.getData()); + + wordIndex += 1; + + // when a word is separated from one to many we have to create these duplicates + if (inSeparatedWord){ + GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", "")); + } + } //else { +// System.out.println("Error"); +// } + } - inWord = false; } break; case XMLStreamConstants.END_ELEMENT: EndElement endElement = event.asEndElement(); + if (endElement.getName().getLocalPart().equals("w")) { + if (inWord){ + inWord = false; + } else if(inSeparatedWord) { + // when there are no separated words left we have to delete last aditional duplicate + GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex); + + inSeparatedWord = false; + } + } + // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { - // add sentence to corpus if it passes filters - boolean saveSentence = computeForOrth == inOrthDiv; + if (inOrthDiv){ + // add sentence to corpus + GOSCorpusHM.put(GOSCorpusHMKey, sentence); + } else { - if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) { - sentence = runFilters(sentence, stats.getFilter()); - corpus.add(new Sentence(sentence)); + + sentence = GOSCorpusHM.remove(GOSCorpusHMKey); + // add sentence to corpus if it passes filters + if (includeFile && !ValidationUtil.isEmpty(sentence)) { + sentence = runFilters(sentence, stats.getFilter()); + corpus.add(new Sentence(sentence, currentFiletaxonomyLong)); + } + + wordIndex = 0; + + + + /* Invoke Fork-Join when we reach maximum limit of + * sentences (because we can't read everything to + * memory) or we reach the end of the file. + */ + if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { + fj(corpus, stats); + // empty the current corpus, since we don't need + // the data anymore + corpus.clear(); + } } + // start a new sentence + sentence = new ArrayList<>(); - // and start a new one - sentence = new ArrayList<>(); - /* Invoke Fork-Join when we reach maximum limit of - * sentences (because we can't read everything to - * memory) or we reach the end of the file. - */ - if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { - fj(corpus, stats); - // empty the current corpus, since we don't need - // the data anymore - corpus.clear(); - } } else if (endElement.getName().getLocalPart().equals("teiHeader")) { // before proceeding to read this file, make sure that taxonomy filters are a match if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { diff --git a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java index 7d40a68..c05b27c 100755 --- a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java +++ b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java @@ -122,9 +122,9 @@ public class InflectedJOSCount { static void calculateForAll(List corpus, Statistics stats, String taxonomy) { for (Sentence s : corpus) { // disregard if wrong taxonomy - if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { - continue; - } +// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { +// continue; +// } for (Word word : s.getWords()) { // skip if current word is not inflected diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index 6f6f218..1e9ac94 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -3,9 +3,11 @@ package alg.ngram; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; +import com.sun.xml.internal.bind.v2.runtime.reflect.Lister; import data.*; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; @@ -28,6 +30,9 @@ public class Ngrams { } public static void generateNgramCandidates(List corpus, StatisticsNew stats) { + // preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys + ArrayList otherKeys = stats.getFilter().getMultipleKeys(); + for (Sentence s : corpus) { // skip sentences shorter than specified ngram length if (s.getWords().size() < stats.getFilter().getNgramValue()) { @@ -46,29 +51,62 @@ public class Ngrams { String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor()); // if last letter is ',' erase it - key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; -// String key = "aaaaaaaaaaaaaaaaaaaaaaa"; - String lemma = ""; - String wordType = ""; - String msd = ""; - for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){ - if(otherKey.toString().equals("lema")){ -// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - lemma = wordToString(ngramCandidate, otherKey); - } else if(otherKey.toString().equals("besedna vrsta")){ - wordType = wordToString(ngramCandidate, otherKey).substring(0, 1); - } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){ - msd = wordToString(ngramCandidate, otherKey); - } +// if (key.equals("")){ +// String test = key; +// } + +// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; + + MultipleHMKeys multipleKeys; + + // create MultipleHMKeys for different amount of other keys + switch (otherKeys.size()) { + case 0: + multipleKeys = new MultipleHMKeys1(key); + break; + case 1: + multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0))); + break; + case 2: + multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)), + wordToString(ngramCandidate, otherKeys.get(1))); + break; + case 3: + multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)), + wordToString(ngramCandidate, otherKeys.get(1)), + wordToString(ngramCandidate, otherKeys.get(2))); + break; + case 4: + multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)), + wordToString(ngramCandidate, otherKeys.get(1)), + wordToString(ngramCandidate, otherKeys.get(2)), + wordToString(ngramCandidate, otherKeys.get(3))); + break; + default: + multipleKeys = null; } +// String lemma = ""; +// String wordType = ""; +// String msd = ""; +// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){ +// if(otherKey.toString().equals("lema")){ +// lemma = wordToString(ngramCandidate, otherKey); +// } else if(otherKey.toString().equals("besedna vrsta")){ +// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1); +// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){ +// msd = wordToString(ngramCandidate, otherKey); +// } +// } +// +// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd); + - MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd); // UPDATE TAXONOMY HERE!!! - stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy()); + stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy()); // stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); } } @@ -102,26 +140,31 @@ public class Ngrams { .stream() .map(Word::getLemma) .collect(Collectors.toList())); - break; + return StringUtils.join(candidate, " "); case WORD: candidate.addAll(ngramCandidate .stream() .map(Word::getWord) .collect(Collectors.toList())); - break; + return StringUtils.join(candidate, " "); case MORPHOSYNTACTIC_SPECS: case MORPHOSYNTACTIC_PROPERTY: candidate.addAll(ngramCandidate .stream() .map(Word::getMsd) .collect(Collectors.toList())); - break; + return StringUtils.join(candidate, " "); case WORD_TYPE: candidate.addAll(ngramCandidate .stream() .map(w -> Character.toString(w.getMsd().charAt(0))) .collect(Collectors.toList())); - break; +// candidate.addAll(ngramCandidate +// .stream() +// .map(w -> Character.toString(w.getMsd().charAt(0))) +// .collect(Collectors.toList())); +// .substring(0, 1) + return StringUtils.join(candidate, " "); } return StringUtils.join(candidate, " "); @@ -136,7 +179,7 @@ public class Ngrams { private static void generateNgramLetterCandidates(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { for (Word w : s.getWords()) { - List taxonomy = w.getTaxonomy(); + List taxonomy = s.getTaxonomy(); String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv()); // skip this iteration if: @@ -152,7 +195,7 @@ public class Ngrams { for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) { // TODO: locila? - MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength())); + MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength())); stats.updateTaxonomyResults(multipleKeys, taxonomy); // stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor())); @@ -183,8 +226,7 @@ public class Ngrams { String punctuation = ","; return new Word(sentence.get(i).getWord() + punctuation, sentence.get(i).getLemma() + punctuation, - sentence.get(i).getMsd() + punctuation, - sentence.get(i).getTaxonomy()); + sentence.get(i).getMsd() + punctuation); } } return sentence.get(i); @@ -204,6 +246,10 @@ public class Ngrams { for (Sentence s : corpus) { List sentence = s.getWords(); + if (sentence == null){ + continue; + } + for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram if (ngram == 2 && j < sentence.size()) { @@ -260,7 +306,7 @@ public class Ngrams { if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) { String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; - stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""), + stats.updateTaxonomyResults(new MultipleHMKeys1(key), stats.getCorpus().getTaxonomy()); } } diff --git a/src/main/java/alg/word/WordCount.java b/src/main/java/alg/word/WordCount.java index 9eac4b0..31a37d3 100755 --- a/src/main/java/alg/word/WordCount.java +++ b/src/main/java/alg/word/WordCount.java @@ -89,79 +89,79 @@ class WordCount { } } - private static void calculateForTaxonomyAndJosType(List corpus, Statistics stats) { - for (Sentence s : corpus) { - if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { - List sentence = new ArrayList<>(s.getWords().size()); - List filteredWords = new ArrayList<>(); +// private static void calculateForTaxonomyAndJosType(List corpus, Statistics stats) { +// for (Sentence s : corpus) { +// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { +// List sentence = new ArrayList<>(s.getWords().size()); +// List filteredWords = new ArrayList<>(); +// +// for (Word word : s.getWords()) { +// if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) { +// filteredWords.add(word); +// } +// } +// +// if (stats.getCf() == CalculateFor.LEMMA) { +// sentence.addAll(filteredWords +// .stream() +// .map(Word::getLemma) +// .collect(Collectors.toList())); +// } else if (stats.getCf() == CalculateFor.WORD) { +// sentence.addAll(filteredWords +// .stream() +// .map(Word::getWord) +// .collect(Collectors.toList())); +// } +// +// for (String word : sentence) { +// Common.updateMap(stats.result, word); +// } +// } +// } +// } - for (Word word : s.getWords()) { - if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) { - filteredWords.add(word); - } - } +// private static void calculateForTaxonomy(List corpus, Statistics stats) { +// for (Sentence s : corpus) { +// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { +// List sentence = new ArrayList<>(s.getWords().size()); +// +// if (stats.getCf() == CalculateFor.LEMMA) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getLemma) +// .collect(Collectors.toList())); +// } else if (stats.getCf() == CalculateFor.WORD) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getWord) +// .collect(Collectors.toList())); +// } +// +// for (String word : sentence) { +// Common.updateMap(stats.result, word); +// } +// } +// } +// } - if (stats.getCf() == CalculateFor.LEMMA) { - sentence.addAll(filteredWords - .stream() - .map(Word::getLemma) - .collect(Collectors.toList())); - } else if (stats.getCf() == CalculateFor.WORD) { - sentence.addAll(filteredWords - .stream() - .map(Word::getWord) - .collect(Collectors.toList())); - } - - for (String word : sentence) { - Common.updateMap(stats.result, word); - } - } - } - } - - private static void calculateForTaxonomy(List corpus, Statistics stats) { - for (Sentence s : corpus) { - if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { - List sentence = new ArrayList<>(s.getWords().size()); - - if (stats.getCf() == CalculateFor.LEMMA) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getLemma) - .collect(Collectors.toList())); - } else if (stats.getCf() == CalculateFor.WORD) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getWord) - .collect(Collectors.toList())); - } - - for (String word : sentence) { - Common.updateMap(stats.result, word); - } - } - } - } - - static void calculateForAll(List corpus, Statistics stats) { - boolean taxonomyIsSet = stats.isTaxonomySet(); - boolean JosTypeIsSet = stats.isJOSTypeSet(); - - // branching because even though the only difference is an if or two && - // O(if) = 1, the amount of ifs adds up and this saves some time - if (taxonomyIsSet && JosTypeIsSet) { - calculateForTaxonomyAndJosType(corpus, stats); - } else if (taxonomyIsSet && !JosTypeIsSet) { - calculateForTaxonomy(corpus, stats); - } else if (!taxonomyIsSet && JosTypeIsSet) { - calculateForJosType(corpus, stats); - } else { - if (stats.isVcc()) { - calculateVCC(corpus, stats); - } else { - calculateNoFilter(corpus, stats); - } - } - } +// static void calculateForAll(List corpus, Statistics stats) { +// boolean taxonomyIsSet = stats.isTaxonomySet(); +// boolean JosTypeIsSet = stats.isJOSTypeSet(); +// +// // branching because even though the only difference is an if or two && +// // O(if) = 1, the amount of ifs adds up and this saves some time +// if (taxonomyIsSet && JosTypeIsSet) { +// calculateForTaxonomyAndJosType(corpus, stats); +// } else if (taxonomyIsSet && !JosTypeIsSet) { +// calculateForTaxonomy(corpus, stats); +// } else if (!taxonomyIsSet && JosTypeIsSet) { +// calculateForJosType(corpus, stats); +// } else { +// if (stats.isVcc()) { +// calculateVCC(corpus, stats); +// } else { +// calculateNoFilter(corpus, stats); +// } +// } +// } } \ No newline at end of file diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java index 910611c..be93d1b 100755 --- a/src/main/java/data/MultipleHMKeys.java +++ b/src/main/java/data/MultipleHMKeys.java @@ -5,49 +5,16 @@ import java.util.Objects; /* Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. */ -public final class MultipleHMKeys { - private final String key, lemma, wordType, msd; - private MultipleHMKeys actual_obj; - public MultipleHMKeys(String key) { - this.key = key; - this.lemma = ""; - this.wordType = ""; - this.msd = ""; - } - - public MultipleHMKeys(String key, String lemma, String wordType, String msd) { - this.key = key; - this.lemma = lemma; - this.wordType = wordType; - this.msd = msd; - } - - public String getKey() { - return key; - } - - public String getLemma() { - return lemma; - } - - public String getWordType() { - return wordType; - } - - public String getMsd() { - return msd; - } +public interface MultipleHMKeys { + String getK1(); + String getK2(); + String getK3(); + String getK4(); + String getK5(); @Override - public int hashCode() { - return Objects.hash(key, lemma, wordType, msd); - } + int hashCode(); @Override - public boolean equals(Object obj) { - return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key.equals(key) - && ((MultipleHMKeys) obj).lemma.equals(lemma) - && ((MultipleHMKeys) obj).wordType.equals(wordType) - && ((MultipleHMKeys) obj).msd.equals(msd); - } + boolean equals(Object obj); } diff --git a/src/main/java/data/MultipleHMKeys1.java b/src/main/java/data/MultipleHMKeys1.java new file mode 100755 index 0000000..4698e3c --- /dev/null +++ b/src/main/java/data/MultipleHMKeys1.java @@ -0,0 +1,44 @@ +package data; + +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public final class MultipleHMKeys1 implements MultipleHMKeys { + private final String k1; + + public MultipleHMKeys1(String k1) { + this.k1 = k1; + } + + public String getK1() { + return k1; + } + + public String getK2() { + return null; + } + + public String getK3() { + return null; + } + + public String getK4() { + return null; + } + + public String getK5() { + return null; + } + + @Override + public int hashCode() { + return k1.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof MultipleHMKeys1) && ((MultipleHMKeys1) obj).k1.equals(k1); + } +} diff --git a/src/main/java/data/MultipleHMKeys2.java b/src/main/java/data/MultipleHMKeys2.java new file mode 100755 index 0000000..91388d5 --- /dev/null +++ b/src/main/java/data/MultipleHMKeys2.java @@ -0,0 +1,49 @@ +package data; + +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public final class MultipleHMKeys2 implements MultipleHMKeys { + private final String k1, k2; + + public MultipleHMKeys2(String k1, String k2) { + this.k1 = k1; + this.k2 = k2; + } + + public String getK1() { + return k1; + } + + public String getK2() { + return k2; + } + + public String getK3() { + return null; + } + + public String getK4() { + return null; + } + + public String getK5() { + return null; + } + + @Override + public int hashCode() { + return Objects.hash(k1, k2); +// return key.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof MultipleHMKeys2) && ((MultipleHMKeys2) obj).k1.equals(k1) + && ((MultipleHMKeys2) obj).k2.equals(k2); + +// return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key.equals(key); + } +} diff --git a/src/main/java/data/MultipleHMKeys3.java b/src/main/java/data/MultipleHMKeys3.java new file mode 100755 index 0000000..5783ef3 --- /dev/null +++ b/src/main/java/data/MultipleHMKeys3.java @@ -0,0 +1,48 @@ +package data; + +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public final class MultipleHMKeys3 implements MultipleHMKeys { + private final String k1, k2, k3; + + public MultipleHMKeys3(String k1, String k2, String k3) { + this.k1 = k1; + this.k2 = k2; + this.k3 = k3; + } + + public String getK1() { + return k1; + } + + public String getK2() { + return k2; + } + + public String getK3() { + return k3; + } + + public String getK4() { + return null; + } + + public String getK5() { + return null; + } + + @Override + public int hashCode() { + return Objects.hash(k1, k2, k3); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof MultipleHMKeys3) && ((MultipleHMKeys3) obj).k1.equals(k1) + && ((MultipleHMKeys3) obj).k2.equals(k2) + && ((MultipleHMKeys3) obj).k3.equals(k3); + } +} diff --git a/src/main/java/data/MultipleHMKeys4.java b/src/main/java/data/MultipleHMKeys4.java new file mode 100755 index 0000000..46e8d73 --- /dev/null +++ b/src/main/java/data/MultipleHMKeys4.java @@ -0,0 +1,50 @@ +package data; + +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public final class MultipleHMKeys4 implements MultipleHMKeys { + private final String k1, k2, k3, k4; + + public MultipleHMKeys4(String k1, String k2, String k3, String k4) { + this.k1 = k1; + this.k2 = k2; + this.k3 = k3; + this.k4 = k4; + } + + public String getK1() { + return k1; + } + + public String getK2() { + return k2; + } + + public String getK3() { + return k3; + } + + public String getK4() { + return k4; + } + + public String getK5() { + return null; + } + + @Override + public int hashCode() { + return Objects.hash(k1, k2, k3, k4); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof MultipleHMKeys4) && ((MultipleHMKeys4) obj).k1.equals(k1) + && ((MultipleHMKeys4) obj).k2.equals(k2) + && ((MultipleHMKeys4) obj).k3.equals(k3) + && ((MultipleHMKeys4) obj).k4.equals(k4); + } +} diff --git a/src/main/java/data/MultipleHMKeys5.java b/src/main/java/data/MultipleHMKeys5.java new file mode 100755 index 0000000..11d36df --- /dev/null +++ b/src/main/java/data/MultipleHMKeys5.java @@ -0,0 +1,52 @@ +package data; + +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public final class MultipleHMKeys5 implements MultipleHMKeys { + private final String k1, k2, k3, k4, k5; + + public MultipleHMKeys5(String k1, String k2, String k3, String k4, String k5) { + this.k1 = k1; + this.k2 = k2; + this.k3 = k3; + this.k4 = k4; + this.k5 = k5; + } + + public String getK1() { + return k1; + } + + public String getK2() { + return k2; + } + + public String getK3() { + return k3; + } + + public String getK4() { + return k4; + } + + public String getK5() { + return k5; + } + + @Override + public int hashCode() { + return Objects.hash(k1, k2, k3, k4, k5); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof MultipleHMKeys5) && ((MultipleHMKeys5) obj).k1.equals(k1) + && ((MultipleHMKeys5) obj).k2.equals(k2) + && ((MultipleHMKeys5) obj).k3.equals(k3) + && ((MultipleHMKeys5) obj).k4.equals(k4) + && ((MultipleHMKeys5) obj).k5.equals(k5); + } +} diff --git a/src/main/java/data/Sentence.java b/src/main/java/data/Sentence.java index 00a1a39..5213bb4 100755 --- a/src/main/java/data/Sentence.java +++ b/src/main/java/data/Sentence.java @@ -7,30 +7,30 @@ public class Sentence { private List words; - private String taksonomija; + private List taxonomy; // GOS private String type; private Map properties; - public Sentence(List words, String taksonomija) { + public Sentence(List words, List taxonomy) { this.words = words; - this.taksonomija = taksonomija; + this.taxonomy = taxonomy; } - public Sentence(List words) { - this.words = words; - } +// public Sentence(List words) { +// this.words = words; +// } - public Sentence(List words, String taksonomija, Map properties) { + public Sentence(List words, List taxonomy, Map properties) { this.words = words; - this.taksonomija = taksonomija; + this.taxonomy = taxonomy; this.properties = properties; } - public Sentence(List words, String taksonomija, String type) { + public Sentence(List words, List taxonomy, String type) { this.words = words; - this.taksonomija = taksonomija; + this.taxonomy = taxonomy; this.type = type; } @@ -38,8 +38,8 @@ public class Sentence { return words; } - public String getTaxonomy() { - return taksonomija; + public List getTaxonomy() { + return taxonomy; } public List getSublist(int indexFrom, int indexTo) { diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index 4963979..5949fd7 100755 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -213,7 +213,7 @@ public class StatisticsNew { removeMinimalOccurrences(taxonomyResult.get("Total"), filter.getMinimalOccurrences()); removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy()); stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get("Total"), Util.getValidInt(limit)))); - Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult); + Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), taxonomyResult, filter); return true; } @@ -376,7 +376,7 @@ public class StatisticsNew { } public void updateResultsNestedSuffix(String key, String stringValue) { - MultipleHMKeys mkStringValue = new MultipleHMKeys(stringValue); + MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue); if (resultNestedSuffix.containsKey(key)) { // if not in map @@ -397,7 +397,7 @@ public class StatisticsNew { } public void updateResultsNestedPrefix(String key, String stringValue) { - MultipleHMKeys mkStringValue = new MultipleHMKeys(stringValue); + MultipleHMKeys mkStringValue = new MultipleHMKeys1(stringValue); if (resultNestedPrefix.containsKey(key)) { // if not in map diff --git a/src/main/java/data/Word.java b/src/main/java/data/Word.java index 0cafe84..0fc115f 100755 --- a/src/main/java/data/Word.java +++ b/src/main/java/data/Word.java @@ -16,8 +16,7 @@ public class Word implements Serializable { private String word; private String lemma; private String msd; -// private String msd; - private List taxonomy; + private String normalizedWord; private final HashSet VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u')); /** @@ -41,7 +40,8 @@ public class Word implements Serializable { //private char besedna_vrsta; public Word(String word, String lemma, String msd) { this.lemma = lemma; - this.msd = normalizeMsd(msd); + this.msd = msd; //normalizeMsd(msd); + this.normalizedWord = ""; // veliko zacetnico ohranimo samo za lastna imena if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S' @@ -53,12 +53,11 @@ public class Word implements Serializable { } } - //private char besedna_vrsta; - public Word(String word, String lemma, String msd, List taxonomy) { + public Word(String word, String lemma, String msd, String normalizedWord) { this.lemma = lemma; // this.msd = normalizeMsd(msd); this.msd = msd; - this.taxonomy = taxonomy; + this.normalizedWord = normalizedWord; // veliko zacetnico ohranimo samo za lastna imena if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S' @@ -73,21 +72,21 @@ public class Word implements Serializable { public Word() { } - /** - * Appends a number of '-' to msds which are not properly sized. - * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd) - * - * @param msdInput - * - * @return - */ - private String normalizeMsd(String msdInput) { - if (ValidationUtil.isEmpty(msdInput)) { - return ""; - } else { - return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER); - } - } +// /** +// * Appends a number of '-' to msds which are not properly sized. +// * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd) +// * +// * @param msdInput +// * +// * @return +// */ +// private String normalizeMsd(String msdInput) { +// if (ValidationUtil.isEmpty(msdInput)) { +// return ""; +// } else { +// return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER); +// } +// } public Word(String word) { this.word = word; @@ -119,10 +118,6 @@ public class Word implements Serializable { this.word = word; } - public List getTaxonomy() { - return taxonomy; - } - public String getLemma() { return lemma; } @@ -139,6 +134,14 @@ public class Word implements Serializable { this.msd = msd; } + public String getNormalizedWord() { + return normalizedWord; + } + + public void setNormalizedWord(String normalizedWord) { + this.normalizedWord = normalizedWord; + } + public String toString() { StringBuilder sb = new StringBuilder(); @@ -150,6 +153,8 @@ public class Word implements Serializable { .append("\n") .append("msd:\t") .append(getMsd()) + .append("normalized word:\t") + .append(getNormalizedWord()) .append("\n"); return sb.toString(); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index fa3dfb5..00a8ef3 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -9,6 +9,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicLong; +import data.CalculateFor; import data.Filter; import data.MultipleHMKeys; import org.apache.commons.csv.CSVFormat; @@ -59,7 +60,7 @@ public class Export { } public static String SetToCSV(Set>> set, File resultsPath, LinkedHashMap headerInfoBlock, - Map> taxonomyResults) { + Map> taxonomyResults, Filter filter) { //Delimiter used in CSV file String NEW_LINE_SEPARATOR = "\n"; List FILE_HEADER_AL = new ArrayList(); @@ -98,8 +99,10 @@ public class Export { headerInfoBlock.put("Skupna vsota vseh lem:", String.valueOf(num_frequencies)); if (headerInfoBlock.get("Analiza").equals("Besede")){ FILE_HEADER_AL.add("Lema"); + FILE_HEADER_AL.add("Lema male črke"); } else if (headerInfoBlock.get("Analiza").equals("Besedni nizi")) { FILE_HEADER_AL.add("Leme"); + FILE_HEADER_AL.add("Leme male črke"); } } else if (headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("oblikoskladenjska oznaka")) { headerInfoBlock.put("Skupna vsota vseh oblikoskladenjskih oznak:", String.valueOf(num_frequencies)); @@ -111,25 +114,26 @@ public class Export { } else { headerInfoBlock.put("Skupna vsota vseh različnic:", String.valueOf(num_frequencies)); FILE_HEADER_AL.add("Lema"); + FILE_HEADER_AL.add("Lema male črke"); } - for (Map value : taxonomyResults.values()) { - for (MultipleHMKeys key : value.keySet()){ - if(!key.getLemma().equals("")){ +// for (Map value : taxonomyResults.values()) { + for (CalculateFor otherKey : filter.getMultipleKeys()){ + if(otherKey.equals(CalculateFor.LEMMA)){ FILE_HEADER_AL.add("Lema"); + FILE_HEADER_AL.add("Lema male črke"); } - if(!key.getWordType().equals("")){ + if(otherKey.equals(CalculateFor.WORD_TYPE)){ FILE_HEADER_AL.add("Besedna vrsta"); } - if(!key.getMsd().equals("")){ + if(otherKey.equals(CalculateFor.MORPHOSYNTACTIC_SPECS)){ FILE_HEADER_AL.add("Oblikoskladenjska oznaka"); } - break; } - break; - } +// break; +// } @@ -198,16 +202,47 @@ public class Export { for (Map.Entry e : map.entrySet()) { List dataEntry = new ArrayList<>(); - dataEntry.add(e.getKey().getKey()); - if(!e.getKey().getLemma().equals("")){ - dataEntry.add(e.getKey().getLemma()); - } - if(!e.getKey().getWordType().equals("")){ - dataEntry.add(e.getKey().getWordType()); - } - if(!e.getKey().getMsd().equals("")){ - dataEntry.add(e.getKey().getMsd()); + dataEntry.add(e.getKey().getK1()); + if (headerInfoBlock.containsKey("Analiza") && (headerInfoBlock.get("Analiza").equals("Besede") || headerInfoBlock.get("Analiza").equals("Besedni nizi")) && + headerInfoBlock.containsKey("Izračunaj za:") && headerInfoBlock.get("Izračunaj za:").equals("lema")){ + dataEntry.add(e.getKey().getK1().toLowerCase()); + } + + int i = 0; + for (CalculateFor otherKey : filter.getMultipleKeys()){ + switch(i){ + case 0: + if (otherKey.equals(CalculateFor.LEMMA)){ + dataEntry.add(e.getKey().getK2()); + dataEntry.add(e.getKey().getK2().toLowerCase()); + } else { + dataEntry.add(e.getKey().getK2()); + } + break; + case 1: + dataEntry.add(e.getKey().getK3()); + break; + case 2: + dataEntry.add(e.getKey().getK4()); + break; + case 3: + dataEntry.add(e.getKey().getK5()); + break; + } + + i++; } + +// if(!e.getKey().getLemma().equals("")){ +// dataEntry.add(e.getKey().getLemma()); +// dataEntry.add(e.getKey().getLemma().toLowerCase()); +// } +// if(!e.getKey().getWordType().equals("")){ +// dataEntry.add(e.getKey().getWordType()); +// } +// if(!e.getKey().getMsd().equals("")){ +// dataEntry.add(e.getKey().getMsd()); +// } dataEntry.add(e.getValue().toString()); dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); dataEntry.add(String.format("%.2f", ((double) e.getValue() * 10000)/num_frequencies)); diff --git a/src/main/java/util/Util.java b/src/main/java/util/Util.java index ff6f24a..d44671e 100755 --- a/src/main/java/util/Util.java +++ b/src/main/java/util/Util.java @@ -55,7 +55,7 @@ public class Util { } public static String formatNumberAsPercent(Object o) { - return MessageFormat.format("{0,number,#.###%}", o); + return MessageFormat.format("{0,number,#.### %}", o).replace('.', ','); } private static boolean isInstanceOfInteger(Object o) { diff --git a/src/main/resources/gui/CharacterAnalysisTab.fxml b/src/main/resources/gui/CharacterAnalysisTab.fxml index a3e017a..4f16e10 100755 --- a/src/main/resources/gui/CharacterAnalysisTab.fxml +++ b/src/main/resources/gui/CharacterAnalysisTab.fxml @@ -16,7 +16,7 @@