diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 6779d23..366aebc 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -313,6 +313,17 @@ public class XML_processing { } if (c3Content.equals(".") && includeThisBlock) { + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : stavek){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>()); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>()); + } + // add sentence to corpus corpus.add(new Sentence(stavek, null)); // and start a new one @@ -637,8 +648,16 @@ public class XML_processing { // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { // count all UniGramOccurrences in sentence for statistics - stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); - + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); @@ -713,6 +732,7 @@ public class XML_processing { public static boolean readXMLSSJ500K(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; + boolean taxonomyMatch = true; ArrayList currentFiletaxonomy = new ArrayList<>(); // ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; @@ -759,10 +779,14 @@ public class XML_processing { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", "")); currentFiletaxonomy.add(currentFiletaxonomyElement); - Tax taxonomy = new Tax(); +// Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } - } + } else if (qName.equals("bibl")) { + // before proceeding to read this file, make sure that taxonomy filters are a match + taxonomyMatch = true; + + } break; case XMLStreamConstants.CHARACTERS: @@ -789,10 +813,21 @@ public class XML_processing { // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } + // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); - if (!ValidationUtil.isEmpty(sentence)) { + if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { corpus.add(new Sentence(sentence, currentFiletaxonomy)); } @@ -821,7 +856,20 @@ public class XML_processing { currentFiletaxonomy = new ArrayList<>(); // currentFiletaxonomyLong = new ArrayList<>(); - } + } else if (endElement.getName().getLocalPart().equals("bibl")) { + // before proceeding to read this file, make sure that taxonomy filters are a match + + if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { + currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection + + if (currentFiletaxonomy.isEmpty()) { + // taxonomies don't match so stop +// return false; + taxonomyMatch = false; +// System.out.println("TEST"); + } + } + } break; } @@ -925,7 +973,7 @@ public class XML_processing { // keep only taxonomy properties Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue())); currentFiletaxonomy.add(currentFiletaxonomyElement); - Tax taxonomy = new Tax(); +// Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } } else if (qName.equalsIgnoreCase("div")) { @@ -1007,6 +1055,17 @@ public class XML_processing { sentence = GOSCorpusHM.remove(GOSCorpusHMKey); + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } + // add sentence to corpus if it passes filters if (includeFile && !ValidationUtil.isEmpty(sentence)) { sentence = runFilters(sentence, stats.getFilter()); @@ -1040,7 +1099,7 @@ public class XML_processing { // disregard this entry if taxonomies don't match includeFile = !currentFiletaxonomy.isEmpty(); - currentFiletaxonomy = new ArrayList<>(); +// currentFiletaxonomy = new ArrayList<>(); } } diff --git a/src/main/java/data/CalculateFor.java b/src/main/java/data/CalculateFor.java index 5ff938e..17fe196 100755 --- a/src/main/java/data/CalculateFor.java +++ b/src/main/java/data/CalculateFor.java @@ -45,72 +45,218 @@ public enum CalculateFor { return null; } - public String toMetadataString() { + public String totalSumString(int ngram) { + if (ngram == 0) { + switch (this) { + case WORD: + return "Skupna vsota vseh črkovnih nizov različnic:"; + case NORMALIZED_WORD: + return "Skupna vsota vseh črkovnih nizov normaliziranih različnic:"; + case LEMMA: + return "Skupna vsota vseh črkovnih nizov lem:"; + case MORPHOSYNTACTIC_SPECS: + return "Skupna vsota vseh črkovnih nizov oblikoskladenjskih oznak:"; + case MORPHOSYNTACTIC_PROPERTY: + return "Skupna vsota vseh črkovnih nizov oblikoskladenjskih lastnosti:"; + case WORD_TYPE: + return "Skupna vsota vseh črkovnih nizov besednih vrst:"; + case DIST_WORDS: + return "Skupna vsota vseh črkovnih nizov različnic:"; + case DIST_LEMMAS: + return "Skupna vsota vseh črkovnih nizov lem:"; + default: + return null; + } + } else if (ngram >= 1) { + switch (this) { + case WORD: + return "Skupna vsota vseh različnic:"; + case NORMALIZED_WORD: + return "Skupna vsota vseh normaliziranih različnic:"; + case LEMMA: + return "Skupna vsota vseh lem:"; + case MORPHOSYNTACTIC_SPECS: + return "Skupna vsota vseh oblikoskladenjskih oznak:"; + case MORPHOSYNTACTIC_PROPERTY: + return "Skupna vsota vseh oblikoskladenjskih lastnosti:"; + case WORD_TYPE: + return "Skupna vsota vseh besednih vrst:"; + case DIST_WORDS: + return "Skupna vsota vseh različnic:"; + case DIST_LEMMAS: + return "Skupna vsota vseh lem:"; + default: + return null; + } + } + return null; + } + + public String foundSumString(int ngram) { + if (ngram == 0) { + switch (this) { + case WORD: + return "Skupna vsota vseh najdenih črkovnih nizov različnic:"; + case NORMALIZED_WORD: + return "Skupna vsota vseh najdenih črkovnih nizov normaliziranih različnic:"; + case LEMMA: + return "Skupna vsota vseh najdenih črkovnih nizov lem:"; + case MORPHOSYNTACTIC_SPECS: + return "Skupna vsota vseh najdenih črkovnih nizov oblikoskladenjskih oznak:"; + case MORPHOSYNTACTIC_PROPERTY: + return "Skupna vsota vseh najdenih črkovnih nizov oblikoskladenjskih lastnosti:"; + case WORD_TYPE: + return "Skupna vsota vseh najdenih črkovnih nizov besednih vrst:"; + case DIST_WORDS: + return "Skupna vsota vseh najdenih črkovnih nizov različnic:"; + case DIST_LEMMAS: + return "Skupna vsota vseh najdenih črkovnih nizov lem:"; + default: + return null; + } + } else if (ngram >= 1) { + switch (this) { + case WORD: + return "Skupna vsota vseh najdenih različnic:"; + case NORMALIZED_WORD: + return "Skupna vsota vseh najdenih normaliziranih različnic:"; + case LEMMA: + return "Skupna vsota vseh najdenih lem:"; + case MORPHOSYNTACTIC_SPECS: + return "Skupna vsota vseh najdenih oblikoskladenjskih oznak:"; + case MORPHOSYNTACTIC_PROPERTY: + return "Skupna vsota vseh najdenih oblikoskladenjskih lastnosti:"; + case WORD_TYPE: + return "Skupna vsota vseh najdenih besednih vrst:"; + case DIST_WORDS: + return "Skupna vsota vseh najdenih različnic:"; + case DIST_LEMMAS: + return "Skupna vsota vseh najdenih lem:"; + default: + return null; + } + } + + return null; + } + + public String totalAbsoluteFrequencyString(int ngram) { + if (ngram == 0) { + return "Skupna absolutna pogostost črkovnega niza"; + } switch(this){ case WORD: - return "Skupna vsota vseh različnic:"; + return "Skupna absolutna pogostost različnice"; case NORMALIZED_WORD: - return "Skupna vsota vseh normaliziranih različnic:"; + return "Skupna absolutna pogostost normalizirane različnice"; case LEMMA: - return "Skupna vsota vseh lem:"; + return "Skupna absolutna pogostost leme"; case MORPHOSYNTACTIC_SPECS: - return "Skupna vsota vseh oblikoskladenjskih oznak:"; + return "Skupna absolutna pogostost oblikoskladenjske oznake"; case MORPHOSYNTACTIC_PROPERTY: - return "Skupna vsota vseh oblikoskladenjskih lastnosti:"; + return "Skupna absolutna pogostost oblikoskladenjske lastnosti"; case WORD_TYPE: - return "Skupna vsota vseh besednih vrst:"; + return "Skupna absolutna pogostost besedne vrste"; case DIST_WORDS: - return "Skupna vsota vseh različnic:"; + return "Skupna absolutna pogostost različnice"; case DIST_LEMMAS: - return "Skupna vsota vseh lem:"; + return "Skupna absolutna pogostost leme"; default: return null; } } - public String toHeaderString() { + public String shareOfTotalString(int ngram) { + if (ngram == 0) { + return "Delež glede na skupno vsoto vseh najdenih črkovnih nizov"; + } switch(this){ case WORD: - return "Različnica"; + return "Delež glede na vse najdene različnice"; case NORMALIZED_WORD: - return "Normalizirana različnica"; + return "Delež glede na vse najdene normalizirane različnice"; case LEMMA: - return "Lema"; + return "Delež glede na vse najdene leme"; case MORPHOSYNTACTIC_SPECS: - return "Oblikoskladenjska oznaka"; + return "Delež glede na vse najdene oblikoskladenjske oznake"; case MORPHOSYNTACTIC_PROPERTY: - return "Oblikoskladenjska lastnost"; + return "Delež glede na vse najdene oblikoskladenjske lastnosti"; case WORD_TYPE: - return "Besedna vrsta"; + return "Delež glede na vse najdene besedne vrste"; case DIST_WORDS: - return "Različnica"; + return "Delež glede na vse najdene različnice"; case DIST_LEMMAS: - return "Lema"; + return "Delež glede na vse najdene leme"; + default: + return null; + } + } + + public String toHeaderString(int ngram) { + if (ngram == 0) { + return "Črkovni niz"; + } else if (ngram == 1) { + switch (this) { + case WORD: + return "Različnica"; + case NORMALIZED_WORD: + return "Normalizirana različnica"; + case LEMMA: + return "Lema"; + case MORPHOSYNTACTIC_SPECS: + return "Oblikoskladenjska oznaka"; + case MORPHOSYNTACTIC_PROPERTY: + return "Oblikoskladenjska lastnost"; + case WORD_TYPE: + return "Besedna vrsta"; + case DIST_WORDS: + return "Različnica"; + case DIST_LEMMAS: + return "Lema"; + default: + return null; + } + } + switch (this) { + case WORD: + case DIST_WORDS: + return "Različnica niza"; + case NORMALIZED_WORD: + return "Normalizirana različnica niza"; + case LEMMA: + case DIST_LEMMAS: + return "Lema niza"; + case MORPHOSYNTACTIC_SPECS: + return "Oblikoskladenjska oznaka niza"; + case MORPHOSYNTACTIC_PROPERTY: + return "Oblikoskladenjska lastnost niza"; + case WORD_TYPE: + return "Besedna vrsta niza"; default: return null; } } - public String toPercentString() { - switch(this){ - case WORD: - return "Delež glede na vse različnice"; - case NORMALIZED_WORD: - return "Delež glede na vse normalizirane različnice"; - case LEMMA: - return "Delež glede na vse leme"; - case MORPHOSYNTACTIC_SPECS: - return "Delež glede na vse oblikoskladenjske oznake"; - case MORPHOSYNTACTIC_PROPERTY: - return "Delež glede na vse oblikoskladenjske lastnosti"; - case WORD_TYPE: - return "Delež glede na vse besedne vrste"; - case DIST_WORDS: - return "Delež glede na vse različnice"; - case DIST_LEMMAS: - return "Delež glede na vse leme"; - default: - return null; - } - } +// public String toPercentString() { +// switch(this){ +// case WORD: +// return "Delež glede na vse različnice"; +// case NORMALIZED_WORD: +// return "Delež glede na vse normalizirane različnice"; +// case LEMMA: +// return "Delež glede na vse leme"; +// case MORPHOSYNTACTIC_SPECS: +// return "Delež glede na vse oblikoskladenjske oznake"; +// case MORPHOSYNTACTIC_PROPERTY: +// return "Delež glede na vse oblikoskladenjske lastnosti"; +// case WORD_TYPE: +// return "Delež glede na vse besedne vrste"; +// case DIST_WORDS: +// return "Delež glede na vse različnice"; +// case DIST_LEMMAS: +// return "Delež glede na vse leme"; +// default: +// return null; +// } +// } } diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index 266dc55..fec58a7 100755 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -10,6 +10,7 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; @@ -59,7 +60,7 @@ public class StatisticsNew { if (this.corpus.getTaxonomy() != null && filter.getDisplayTaxonomy()) { if (this.filter.getTaxonomy().isEmpty()) { for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) { - this.taxonomyResult.put(Taxonomy.factory(this.corpus.getTaxonomy().get(i)), new ConcurrentHashMap<>()); + this.taxonomyResult.put(Taxonomy.factoryLongName(this.corpus.getTaxonomy().get(i)), new ConcurrentHashMap<>()); } } else { for (int i = 0; i < this.filter.getTaxonomy().size(); i++) { @@ -236,7 +237,7 @@ public class StatisticsNew { analysisProducedResults = true; } - removeMinimalOccurrences(taxonomyResult.get(Taxonomy.TOTAL), filter.getMinimalOccurrences()); + removeMinimalOccurrences(filter.getMinimalOccurrences()); removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy()); stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get(Taxonomy.TOTAL), Util.getValidInt(limit)))); Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter); @@ -265,12 +266,14 @@ public class StatisticsNew { /** * Removes lines where total number of occurrences is lower than specified number (minimalOccurrences) */ - private void removeMinimalOccurrences(Map taxonomyResultTotal, Integer minimalOccurrences) { + private void removeMinimalOccurrences(Integer minimalOccurrences) { if (minimalOccurrences == 0) return; - for (MultipleHMKeys key : taxonomyResultTotal.keySet()){ - if(taxonomyResultTotal.get(key).intValue() < minimalOccurrences){ - taxonomyResultTotal.remove(key); + for (MultipleHMKeys key : taxonomyResult.get(Taxonomy.TOTAL).keySet()){ + if(taxonomyResult.get(Taxonomy.TOTAL).get(key).intValue() < minimalOccurrences){ + for (Taxonomy t : taxonomyResult.keySet()){ + taxonomyResult.get(t).remove(key); + } } } } @@ -498,15 +501,17 @@ public class StatisticsNew { info.put("Izračunaj za:", filter.getCalculateFor().toString()); // also write - if (filter.getMultipleKeys().size() > 0){ + if (ngramLevel > 0) { + if (filter.getMultipleKeys().size() > 0) { - StringBuilder mk = new StringBuilder(); - for (CalculateFor s : filter.getMultipleKeys()) { - mk.append(s.toString()).append("; "); + StringBuilder mk = new StringBuilder(); + for (CalculateFor s : filter.getMultipleKeys()) { + mk.append(s.toString()).append("; "); + } + info.put("Upoštevaj tudi: ", String.join("; ", mk.substring(0, mk.length() - 2))); + } else { + info.put("Upoštevaj tudi: ", ""); } - info.put("Izpiši tudi: ", String.join("; ", mk.substring(0, mk.length() - 2))); - } else { - info.put("Izpiši tudi: ", ""); } // data limitations @@ -535,14 +540,16 @@ public class StatisticsNew { } // also write - n - gram > 1 - if (ngramLevel > 1 && filter.getCollocability().size() > 0){ - StringBuilder mk = new StringBuilder(); - for (Collocability s : filter.getCollocability()) { - mk.append(s.toString()).append("; "); + if(ngramLevel > 1) { + if (filter.getCollocability().size() > 0) { + StringBuilder mk = new StringBuilder(); + for (Collocability s : filter.getCollocability()) { + mk.append(s.toString()).append("; "); + } + info.put("Kolokabilnost: ", String.join("; ", mk.substring(0, mk.length() - 2))); + } else { + info.put("Kolokabilnost: ", ""); } - info.put("Kolokabilnost: ", String.join("; ", mk.substring(0, mk.length() - 2))); - } else { - info.put("Kolokabilnost: ", ""); } // fragmented MSD - n-gram = 1 @@ -580,13 +587,48 @@ public class StatisticsNew { // } info.put("Taksonomija: ", ""); - if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) || filter.getDisplayTaxonomy()) { ArrayList tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy()); - String sep = ""; + if (filter.getDisplayTaxonomy() && tax.size() == 0) { + + +// ArrayList intList = (new ArrayList<>(taxonomyResult.keySet()).stream() +// .forEach(x -> {x.toString();})); +// ArrayList taxonomyString = new ArrayList<>(); +// for (Taxonomy t : taxonomyResult.keySet()){ +// taxonomyString.add(t.toString()); +// } +// ObservableList taxonomyObservableString = Tax.getTaxonomyForComboBox(corpus.getCorpusType(), new HashSet<>(taxonomyString)); +// ArrayList sortedTaxonomyString = new ArrayList<>(); +// for (String t : taxonomyObservableString){ +// sortedTaxonomyString.add(t); +// } +// getTaxonomyForTaxonomyResult + tax = Tax.getTaxonomyForTaxonomyResult(corpus.getCorpusType(), taxonomyResult.keySet()); + } + +// String sep = ""; for (String s : tax) { - info.put(sep = sep + " ", s); + + if (s == null) { + continue; + } + +// info.put(sep = sep + " ", s); + if (uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s)) == null) { + info.put(s, ""); + continue; + } + int n = uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s)).intValue(); + if (n == 0) { + info.put(s, ""); + } else { + info.put(s, String.valueOf(n)); + } + } + } info.put("Min. št. pojavitev: ", String.valueOf(filter.getMinimalOccurrences())); diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index f48e301..58191c8 100755 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -135,6 +135,60 @@ public class Tax { return FXCollections.observableArrayList(taxForCombo); } + /** + * Returns taxonomy names only for items found in headers + */ + public static ArrayList getTaxonomyForTaxonomyResult(CorpusType corpusType, Set foundTax) { + LinkedHashMap tax = new LinkedHashMap<>(); + Set foundTaxHS= new HashSet<>(foundTax); + + if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { + tax = GIGAFIDA_TAXONOMY; + } else if (corpusType == CorpusType.GOS) { + tax = GOS_TAXONOMY; + } + + ArrayList taxForCombo = new ArrayList<>(); + + // adds parents taxonomy as well + Set genFoundTax = new HashSet<>(); + + for(Taxonomy e : foundTaxHS){ + String[] elList = e.toString().split("\\."); + for(int i = 1; i < elList.length - 1; i++){ + Taxonomy candidate = Taxonomy.factory(String.join(".", Arrays.copyOfRange(elList, 0, elList.length - i))); + genFoundTax.add(candidate); + } + } + + + + +// ArrayList taxonomyString = new ArrayList<>(); +// for (Taxonomy t : taxonomyResult.keySet()){ +// taxonomyString.add(t.toString()); +// } +// ObservableList taxonomyObservableString = Tax.getTaxonomyForComboBox(corpus.getCorpusType(), new HashSet<>(taxonomyString)); +// ArrayList sortedTaxonomyString = new ArrayList<>(); +// for (String t : taxonomyObservableString){ +// sortedTaxonomyString.add(t); +// } + + + + + foundTaxHS.addAll(genFoundTax); + + // assures same relative order + for (String t : tax.keySet()) { + if (foundTaxHS.contains(Taxonomy.factory(t))) { + taxForCombo.add(tax.get(t)); + } + } + + return taxForCombo; + } + public static HashSet getCorpusTypesWithTaxonomy() { return corpusTypesWithTaxonomy; } @@ -204,7 +258,7 @@ public class Tax { public static ArrayList getTaxonomyForInfo(CorpusType corpusType, ArrayList taxonomy) { LinkedHashMap tax = new LinkedHashMap<>(); - if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) { + if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; diff --git a/src/main/java/data/Taxonomy.java b/src/main/java/data/Taxonomy.java index 21663e3..86f0bd6 100755 --- a/src/main/java/data/Taxonomy.java +++ b/src/main/java/data/Taxonomy.java @@ -502,6 +502,7 @@ public enum Taxonomy { r.add(SITUACIJA_TELEVIZIJA); } else if(disjointTaxonomy.equals(KANAL)){ r.add(KANAL_OSEBNI_STIK); + r.add(KANAL_TELEFON); r.add(KANAL_RADIO); r.add(KANAL_TELEVIZIJA); } else if(disjointTaxonomy.equals(SSJ_TISK)){ @@ -646,6 +647,9 @@ public enum Taxonomy { connections.put(SSJ_KNJIZNO, SSJ_TISK); connections.put(SSJ_LEPOSLOVNO, SSJ_KNJIZNO); connections.put(SSJ_STROKOVNO, SSJ_KNJIZNO); + connections.put(SSJ_PERIODICNO, SSJ_TISK); + connections.put(SSJ_CASOPIS, SSJ_PERIODICNO); + connections.put(SSJ_REVIJA, SSJ_PERIODICNO); connections.put(SSJ_DRUGO, SSJ_TISK); connections.put(FT_P_GOVORNI, FT_P_PRENOSNIK); diff --git a/src/main/java/gui/CharacterAnalysisTab.java b/src/main/java/gui/CharacterAnalysisTab.java index b6481c0..60a52e2 100755 --- a/src/main/java/gui/CharacterAnalysisTab.java +++ b/src/main/java/gui/CharacterAnalysisTab.java @@ -181,6 +181,7 @@ public class CharacterAnalysisTab { // taxonomy if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + taxonomyCCB.setDisable(false); taxonomyCCB.getItems().removeAll(); taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); taxonomyCCB.getCheckModel().getCheckedItems().addListener(new ListChangeListener() { @@ -217,21 +218,27 @@ public class CharacterAnalysisTab { } displayTaxonomy = false; + displayTaxonomyChB.setSelected(false); // set - displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { - displayTaxonomy = newValue; - if(displayTaxonomy){ - minimalTaxonomyTF.setDisable(false); - } else { - minimalTaxonomyTF.setDisable(true); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + displayTaxonomyChB.setDisable(false); + displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { + displayTaxonomy = newValue; + if (displayTaxonomy) { + minimalTaxonomyTF.setDisable(false); + } else { + minimalTaxonomyTF.setDisable(true); - minimalTaxonomyTF.setText("1"); - minimalTaxonomy = 1; - } + minimalTaxonomyTF.setText("1"); + minimalTaxonomy = 1; + } - logger.info("display taxonomy: ", displayTaxonomy); - }); - displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + logger.info("display taxonomy: ", displayTaxonomy); + }); + displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + } else { + displayTaxonomyChB.setDisable(true); + } // cvv calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> { diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java index 4e3fa82..49580fc 100755 --- a/src/main/java/gui/OneWordAnalysisTab.java +++ b/src/main/java/gui/OneWordAnalysisTab.java @@ -14,6 +14,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.controlsfx.control.CheckComboBox; +import org.controlsfx.control.IndexedCheckModel; import java.io.File; import java.io.UnsupportedEncodingException; @@ -220,14 +221,16 @@ public class OneWordAnalysisTab { // taxonomy if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + taxonomyCCB.setDisable(false); taxonomyCCB.getItems().removeAll(); taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - taxonomyCCB.getCheckModel().getCheckedItems().addListener(new ListChangeListener() { - boolean changing = true; + + ListChangeListener listener = new ListChangeListener() { + public boolean changing = true; @Override - public void onChanged(ListChangeListener.Change c){ - if(changing) { + public void onChanged(Change c) { + if (changing) { ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); @@ -249,27 +252,41 @@ public class OneWordAnalysisTab { logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); } } - }); + }; + +// taxonomyCCB.getCheckModel().getCheckedItems().removeListener(listener); +// System.out.println("THIS WORKS!!!!"); taxonomyCCB.getCheckModel().clearChecks(); +// System.out.println("YES???"); + taxonomyCCB.getCheckModel().getCheckedItems().addListener(listener); +// taxonomyCCB.setCheckModel(null); + +// taxonomyCCB.getCheckModel().clearChecks(); } else { taxonomyCCB.setDisable(true); } displayTaxonomy = false; + displayTaxonomyChB.setSelected(false); // set - displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { - displayTaxonomy = newValue; - if(displayTaxonomy){ - minimalTaxonomyTF.setDisable(false); - } else { - minimalTaxonomyTF.setDisable(true); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + displayTaxonomyChB.setDisable(false); + displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { + displayTaxonomy = newValue; + if (displayTaxonomy) { + minimalTaxonomyTF.setDisable(false); + } else { + minimalTaxonomyTF.setDisable(true); - minimalTaxonomyTF.setText("1"); - minimalTaxonomy = 1; - } - logger.info("display taxonomy: ", displayTaxonomy); - }); - displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + minimalTaxonomyTF.setText("1"); + minimalTaxonomy = 1; + } + logger.info("display taxonomy: ", displayTaxonomy); + }); + displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + } else { + displayTaxonomyChB.setDisable(true); + } writeMsdAtTheEnd = false; writeMsdAtTheEndChB.setDisable(true); diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index c58f754..f5e9d30 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -198,20 +198,27 @@ public class StringAnalysisTabNew2 { notePunctuationsChB.setTooltip(new Tooltip(TOOLTIP_readNotePunctuationsChB)); displayTaxonomy = false; + displayTaxonomyChB.setSelected(false); // set - displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { - displayTaxonomy = newValue; - if(displayTaxonomy){ - minimalTaxonomyTF.setDisable(false); - } else { - minimalTaxonomyTF.setDisable(true); - minimalTaxonomyTF.setText("1"); - minimalTaxonomy = 1; - } - logger.info("display taxonomy: ", displayTaxonomy); - }); - displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + displayTaxonomyChB.setDisable(false); + displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { + displayTaxonomy = newValue; + if (displayTaxonomy) { + minimalTaxonomyTF.setDisable(false); + } else { + minimalTaxonomyTF.setDisable(true); + + minimalTaxonomyTF.setText("1"); + minimalTaxonomy = 1; + } + logger.info("display taxonomy: ", displayTaxonomy); + }); + displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + } else { + displayTaxonomyChB.setDisable(true); + } // calculateForCB calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> { @@ -306,6 +313,7 @@ public class StringAnalysisTabNew2 { // taxonomy if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + taxonomyCCB.setDisable(false); taxonomyCCB.getItems().removeAll(); taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); taxonomyCCB.getCheckModel().getCheckedItems().addListener(new ListChangeListener() { @@ -667,7 +675,11 @@ public class StringAnalysisTabNew2 { readXML(f.toString(), statisticsOneGrams); i++; this.updateProgress(i, corpusFiles.size() * 2); - this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); + if (statistic.getFilter().getCollocability().size() > 0) { + this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); + } else { + this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName())); + } } return null; @@ -776,7 +788,7 @@ public class StringAnalysisTabNew2 { this.updateProgress(i, corpusFiles.size()); this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName())); } - this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); +// this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); } return null; diff --git a/src/main/java/gui/WordLevelTab.java b/src/main/java/gui/WordLevelTab.java index 0415c66..9b24956 100755 --- a/src/main/java/gui/WordLevelTab.java +++ b/src/main/java/gui/WordLevelTab.java @@ -343,6 +343,7 @@ public class WordLevelTab { // taxonomy if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + taxonomyCCB.setDisable(false); taxonomyCCB.getItems().removeAll(); taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); taxonomyCCB.getCheckModel().getCheckedItems().addListener(new ListChangeListener() { @@ -379,20 +380,26 @@ public class WordLevelTab { } displayTaxonomy = false; + displayTaxonomyChB.setSelected(false); // set - displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { - displayTaxonomy = newValue; - if(displayTaxonomy){ - minimalTaxonomyTF.setDisable(false); - } else { - minimalTaxonomyTF.setDisable(true); - - minimalTaxonomyTF.setText("1"); - minimalTaxonomy = 1; - } - logger.info("display taxonomy: ", displayTaxonomy); - }); - displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { + displayTaxonomyChB.setDisable(false); + displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { + displayTaxonomy = newValue; + if (displayTaxonomy) { + minimalTaxonomyTF.setDisable(false); + } else { + minimalTaxonomyTF.setDisable(true); + + minimalTaxonomyTF.setText("1"); + minimalTaxonomy = 1; + } + logger.info("display taxonomy: ", displayTaxonomy); + }); + displayTaxonomyChB.setTooltip(new Tooltip(TOOLTIP_readDisplayTaxonomyChB)); + } else { + displayTaxonomyChB.setDisable(true); + } // writeMsdAtTheEnd = false; // writeMsdAtTheEndChB.setDisable(true); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 6dbdd2f..d68a2b9 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -64,27 +64,27 @@ public class Export { //Delimiter used in CSV file String NEW_LINE_SEPARATOR = "\n"; - List FILE_HEADER_AL = new ArrayList(); + List FILE_HEADER_AL = new ArrayList<>(); Object[] FILE_HEADER; //Count frequencies - long num_frequencies = 0; - for (Pair> p : set) { - Map map = p.getRight(); - if (map.isEmpty()) - continue; - num_frequencies = Util.mapSumFrequencies(map); - } - -// Map num_taxonomy_frequencies = new ConcurrentHashMap<>(); -// for (String taxonomyKey : taxonomyResults.keySet()) { -// num_taxonomy_frequencies.put(taxonomyKey, (long) 0); -// for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){ -// long val = num_taxonomy_frequencies.get(taxonomyKey); -// val += value.get(); -// num_taxonomy_frequencies.put(taxonomyKey, val); -// } +// long num_frequencies = 0; +// for (Pair> p : set) { +// Map map = p.getRight(); +// if (map.isEmpty()) +// continue; +// num_frequencies = Util.mapSumFrequencies(map); // } + + Map num_selected_taxonomy_frequencies = new ConcurrentHashMap<>(); + for (Taxonomy taxonomyKey : taxonomyResults.keySet()) { + num_selected_taxonomy_frequencies.put(taxonomyKey, (long) 0); + for (AtomicLong value : taxonomyResults.get(taxonomyKey).values()){ + long val = num_selected_taxonomy_frequencies.get(taxonomyKey); + val += value.get(); + num_selected_taxonomy_frequencies.put(taxonomyKey, val); + } + } Map num_taxonomy_frequencies = statistics.getUniGramOccurrences(); @@ -92,32 +92,37 @@ public class Export { if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) { FILE_HEADER_AL.add("Izpuščene besede"); } - FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString()); - if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) - FILE_HEADER_AL.add("Lema male črke"); - + FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString(filter.getNgramValue())); + if (filter.getCalculateFor().equals(CalculateFor.LEMMA)) { + if(filter.getNgramValue() == 0) { + FILE_HEADER_AL.add("Črkovni niz (male črke)"); + } else if(filter.getNgramValue() >= 1) { + FILE_HEADER_AL.add("Lema (male črke)"); + } + } if (filter.getSuffixLength() != null && filter.getSuffixList() != null && filter.getPrefixLength() != null && filter.getPrefixList() != null) { if (filter.getPrefixLength() > 0 || filter.getPrefixList().size() > 0) { - FILE_HEADER_AL.add("Predpona"); + FILE_HEADER_AL.add("Začetni del besede"); } FILE_HEADER_AL.add("Preostali del besede"); if (filter.getSuffixLength() > 0 || filter.getSuffixList().size() > 0) { - FILE_HEADER_AL.add("Pripona"); + FILE_HEADER_AL.add("Končni del besede"); } } - headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(statistics.getUniGramOccurrences().get(Taxonomy.TOTAL).longValue())); + headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); + headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), String.valueOf(num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); // headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); for (CalculateFor otherKey : filter.getMultipleKeys()) { - FILE_HEADER_AL.add(otherKey.toHeaderString()); + FILE_HEADER_AL.add(otherKey.toHeaderString(filter.getNgramValue())); if (otherKey.equals(CalculateFor.LEMMA)) - FILE_HEADER_AL.add("Lema male črke"); + FILE_HEADER_AL.add("Lema (male črke)"); } - FILE_HEADER_AL.add("Skupna absolutna pogostost"); - FILE_HEADER_AL.add(filter.getCalculateFor().toPercentString()); + FILE_HEADER_AL.add(filter.getCalculateFor().totalAbsoluteFrequencyString(filter.getNgramValue())); + FILE_HEADER_AL.add(filter.getCalculateFor().shareOfTotalString(filter.getNgramValue())); FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)"); @@ -216,6 +221,9 @@ public class Export { // real prefix String rpf = ""; for(String pf : filter.getPrefixList()){ + if (key.length() < pf.length()) { + continue; + } if (pf.equals(key.substring(0, pf.length()))){ rpf = pf; break; @@ -225,6 +233,9 @@ public class Export { // real suffix String rsf = ""; for(String sf : filter.getSuffixList()){ + if (key.length() < sf.length()) { + continue; + } if (sf.equals(key.substring(key.length() - sf.length()))){ rsf = sf; break; @@ -268,13 +279,13 @@ public class Export { dataEntry.add(e.getValue().toString()); - dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); - dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies)); + dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL))); + dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); for (Taxonomy key : taxonomyResults.keySet()){ if(!key.equals(Taxonomy.TOTAL) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); dataEntry.add(frequency.toString()); - dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key).longValue())); + dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_selected_taxonomy_frequencies.get(key))); dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / num_taxonomy_frequencies.get(key).longValue())); // dataEntry.add(formatNumberAsPercent((double) frequency.get() / statistics.getUniGramOccurrences())); // dataEntry.add(String.format("%.2f", ((double) frequency.get() * 1000000) / statistics.getUniGramOccurrences())); diff --git a/src/main/resources/gui/OneWordAnalysisTab.fxml b/src/main/resources/gui/OneWordAnalysisTab.fxml index 0874140..80c1a1c 100755 --- a/src/main/resources/gui/OneWordAnalysisTab.fxml +++ b/src/main/resources/gui/OneWordAnalysisTab.fxml @@ -30,7 +30,7 @@ -