From 426a9ccc4623aea16f5913154e7063268b6b0574 Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 31 Aug 2018 07:57:58 +0200 Subject: [PATCH] Added some optimizations and new taxonomy names --- src/main/java/alg/XML_processing.java | 57 +- src/main/java/alg/inflectedJOS/ForkJoin.java | 134 ++-- .../alg/inflectedJOS/InflectedJOSCount.java | 338 ++++---- src/main/java/alg/ngram/Ngrams.java | 68 +- src/main/java/alg/word/WordCount.java | 158 ++-- src/main/java/alg/word/WordLevel.java | 4 +- src/main/java/data/Corpus.java | 7 + src/main/java/data/Filter.java | 55 +- src/main/java/data/Tax.java | 141 ++-- src/main/java/data/Word.java | 212 ++--- src/main/java/data/Word1.java | 17 + src/main/java/data/Word2.java | 22 + src/main/java/data/Word3.java | 27 + src/main/java/data/Word4.java | 32 + src/main/java/gui/StringAnalysisTabNew2.java | 163 ++-- src/main/java/util/Export.java | 12 +- .../resources/gui/StringAnalysisTabNew2.fxml | 12 +- src/test/java/Common.java | 174 ++--- src/test/java/NgramTests.java | 724 +++++++++--------- src/test/java/WordFormationTest.java | 110 +-- src/test/java/WordTest.java | 78 +- 21 files changed, 1354 insertions(+), 1191 deletions(-) create mode 100755 src/main/java/data/Word1.java create mode 100755 src/main/java/data/Word2.java create mode 100755 src/main/java/data/Word3.java create mode 100755 src/main/java/data/Word4.java diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 6f1386f..f57accb 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -262,7 +262,7 @@ public class XML_processing { if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && stavek.size() > 0){ - stavek.add(new Word(c3Content, c3Content, "/")); + stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter())); } @@ -297,7 +297,7 @@ public class XML_processing { // "word" node value if (in_word) { - stavek.add(new Word(characters.getData(), lemma, msd)); + stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter())); in_word = false; } break; @@ -537,12 +537,12 @@ public class XML_processing { // "word" node value if (inWord) { String word = characters.getData(); - sentence.add(new Word(word, lemma, msd)); + sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); inWord = false; } if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { String punctuation = characters.getData(); - sentence.add(new Word(punctuation, punctuation, "/")); + sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); inPunctuation = false; // String punctuation = ","; @@ -761,7 +761,7 @@ public class XML_processing { // GOSCorpusHM.put(GOSCorpusHMKey, sentence); String word = ""; Characters characters = event.asCharacters(); - sentence.add(new Word(characters.getData(), "", "")); + sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter())); // if algorithm is in normalized part find orthodox word and add other info to it } else { Characters characters = event.asCharacters(); @@ -769,15 +769,16 @@ public class XML_processing { // System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex); if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) { Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex); - currentWord.setLemma(lemma); - currentWord.setMsd(msd); - currentWord.setNormalizedWord(characters.getData()); + currentWord.setLemma(lemma, stats.getFilter().getWordParts()); + currentWord.setMsd(msd, stats.getFilter().getWordParts()); + currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts()); wordIndex += 1; // when a word is separated from one to many we have to create these duplicates if (inSeparatedWord){ - GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", "")); + GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()), + "", "", "", stats.getFilter())); } } //else { // System.out.println("Error"); @@ -893,8 +894,8 @@ public class XML_processing { // if we're calculating values for letters, omit words that are shorter than string length if (filter.getNgramValue() == 0) { - sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength()) - || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength())); + sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength()) + || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength())); } } @@ -912,4 +913,38 @@ public class XML_processing { return atts; } + + private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){ + List wString = new ArrayList<>(); + if (f.getWordParts().contains(CalculateFor.WORD)) + wString.add(word); + if (f.getWordParts().contains(CalculateFor.LEMMA)) + wString.add(lemma); + if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) + wString.add(msd); + if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD)) + wString.add(normalizedWord); + + // find appropriate strings and put them in word + Word w; + + switch (f.getWordParts().size()) { + case 1: + w = new Word1(wString.get(0)); + break; + case 2: + w = new Word2(wString.get(0), wString.get(1)); + break; + case 3: + w = new Word3(wString.get(0), wString.get(1), wString.get(2)); + break; + case 4: + w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3)); + break; + default: + w = null; + + } + return w; + } } diff --git a/src/main/java/alg/inflectedJOS/ForkJoin.java b/src/main/java/alg/inflectedJOS/ForkJoin.java index e480b76..3da4eee 100755 --- a/src/main/java/alg/inflectedJOS/ForkJoin.java +++ b/src/main/java/alg/inflectedJOS/ForkJoin.java @@ -1,67 +1,67 @@ -package alg.inflectedJOS; - -import java.util.List; -import java.util.concurrent.RecursiveAction; - -import data.Sentence; -import data.Statistics; - -public class ForkJoin extends RecursiveAction { - private static final long serialVersionUID = -1260951004477299634L; - - private static final int ACCEPTABLE_SIZE = 1000; - private List corpus; - private Statistics stats; - private int start; - private int end; - - - /** - * Constructor for subproblems. - */ - private ForkJoin(List corpus, int start, int end, Statistics stats) { - this.corpus = corpus; - this.start = start; - this.end = end; - this.stats = stats; - } - - /** - * Default constructor for the initial problem - */ - public ForkJoin(List corpus, Statistics stats) { - this.corpus = corpus; - this.start = 0; - this.end = corpus.size(); - this.stats = stats; - } - - private void computeDirectly() { - List subCorpus = corpus.subList(start, end); - - if (stats.isTaxonomySet()) { - InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy()); - } else { - InflectedJOSCount.calculateForAll(subCorpus, stats, null); - } - } - - @Override - protected void compute() { - int subCorpusSize = end - start; - - if (subCorpusSize < ACCEPTABLE_SIZE) { - computeDirectly(); - } else { - int mid = start + subCorpusSize / 2; - ForkJoin left = new ForkJoin(corpus, start, mid, stats); - ForkJoin right = new ForkJoin(corpus, mid, end, stats); - - // fork (push to queue)-> compute -> join - left.fork(); - right.fork(); - left.join(); - right.join(); - } - } -} +//package alg.inflectedJOS; +// +//import java.util.List; +//import java.util.concurrent.RecursiveAction; +// +//import data.Sentence; +//import data.Statistics; +// +//public class ForkJoin extends RecursiveAction { +// private static final long serialVersionUID = -1260951004477299634L; +// +// private static final int ACCEPTABLE_SIZE = 1000; +// private List corpus; +// private Statistics stats; +// private int start; +// private int end; +// +// +// /** +// * Constructor for subproblems. +// */ +// private ForkJoin(List corpus, int start, int end, Statistics stats) { +// this.corpus = corpus; +// this.start = start; +// this.end = end; +// this.stats = stats; +// } +// +// /** +// * Default constructor for the initial problem +// */ +// public ForkJoin(List corpus, Statistics stats) { +// this.corpus = corpus; +// this.start = 0; +// this.end = corpus.size(); +// this.stats = stats; +// } +// +// private void computeDirectly() { +// List subCorpus = corpus.subList(start, end); +// +// if (stats.isTaxonomySet()) { +// InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy()); +// } else { +// InflectedJOSCount.calculateForAll(subCorpus, stats, null); +// } +// } +// +// @Override +// protected void compute() { +// int subCorpusSize = end - start; +// +// if (subCorpusSize < ACCEPTABLE_SIZE) { +// computeDirectly(); +// } else { +// int mid = start + subCorpusSize / 2; +// ForkJoin left = new ForkJoin(corpus, start, mid, stats); +// ForkJoin right = new ForkJoin(corpus, mid, end, stats); +// +// // fork (push to queue)-> compute -> join +// left.fork(); +// right.fork(); +// left.join(); +// right.join(); +// } +// } +//} diff --git a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java index c05b27c..b4f02bf 100755 --- a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java +++ b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java @@ -1,170 +1,170 @@ -package alg.inflectedJOS; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -import alg.Common; -import data.Sentence; -import data.Statistics; -import data.StatisticsNew; -import data.Word; - -public class InflectedJOSCount { - - public static HashMap>> indices; - - // static { - // // calculate all possible combinations of indices we will substitute with a '-' for substring statistics - // indices = new HashMap<>(); - // for (int i = 5; i <= 8; i++) { - // indices.put(i, calculateCombinations(i)); - // } - // } - // - // private static List calculateCombinations(int i) { - // int arr[] = {1, 2, 3, 4, 5}; - // int r = 3; - // int n = arr.length; - // ArrayList> result = new ArrayList<>(); - // - // return printCombination(arr, n, r); - // } - // - // /* arr[] ---> Input Array - // data[] ---> Temporary array to store current combination - // start & end ---> Staring and Ending indexes in arr[] - // index ---> Current index in data[] - // r ---> Size of a combination to be printed */ - // static void combinationUtil(int arr[], int data[], int start, - // int end, int index, int r, ArrayList> result) { - // // Current combination is ready to be printed, print it - // ArrayList tmpResult = new ArrayList<>(); - // - // if (index == r) { - // ArrayList tmpResult = new ArrayList<>(); - // for (int j = 0; j < r; j++) - // System.out.print(data[j] + " "); - // System.out.println(""); - // return; - // } - // - // // replace index with all possible elements. The condition - // // "end-i+1 >= r-index" makes sure that including one element - // // at index will make a combination with remaining elements - // // at remaining positions - // for (int i = start; i <= end && end - i + 1 >= r - index; i++) { - // data[index] = arr[i]; - // combinationUtil(arr, data, i + 1, end, index + 1, r); - // } - // } - // - // // The main function that prints all combinations of size r - // // in arr[] of size n. This function mainly uses combinationUtil() - // static void printCombination(int arr[], int n, int r) { - // // A temporary array to store all combination one by one - // int data[] = new int[r]; - // - // // Print all combination using temprary array 'data[]' - // combinationUtil(arr, data, 0, n - 1, 0, r); - // } - - // public static void calculateForAll(List corpus, Statistics stats, String taxonomy) { - // for (Sentence s : corpus) { - // // disregard if wrong taxonomy - // if (!(s.getTaxonomy().startsWith(taxonomy))) { - // continue; - // } - // - // calculateCommon(s, stats.result); - // - // for (Word word : s.getWords()) { - // // skip if current word is not inflected - // if (!(word.getMsd().length() > 0)) { - // continue; - // } - // - // String msd = word.getMsd(); - // - // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); - // - // for (int i = 1; i < msd.length(); i++) { - // entry.setCharAt(i, msd.charAt(i)); - // Common.updateMap(stats.result, entry.toString()); - // entry.setCharAt(i, '-'); - // } - // } - // } - // } - - // public static void calculateForAll(List corpus, Statistics stats) { - // for (Sentence s : corpus) { - // for (Word word : s.getWords()) { - // if (!(word.getMsd().length() > 0)) { - // continue; - // } - // - // String msd = word.getMsd(); - // - // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); - // - // for (int i = 1; i < msd.length(); i++) { - // entry.setCharAt(i, msd.charAt(i)); - // Common.updateMap(stats.result, entry.toString()); - // entry.setCharAt(i, '-'); - // } - // } - // } - // } - - static void calculateForAll(List corpus, Statistics stats, String taxonomy) { - for (Sentence s : corpus) { - // disregard if wrong taxonomy -// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { -// continue; +//package alg.inflectedJOS; +// +//import java.util.ArrayList; +//import java.util.HashMap; +//import java.util.List; +// +//import org.apache.commons.lang3.StringUtils; +// +//import alg.Common; +//import data.Sentence; +//import data.Statistics; +//import data.StatisticsNew; +//import data.Word; +// +//public class InflectedJOSCount { +// +// public static HashMap>> indices; +// +// // static { +// // // calculate all possible combinations of indices we will substitute with a '-' for substring statistics +// // indices = new HashMap<>(); +// // for (int i = 5; i <= 8; i++) { +// // indices.put(i, calculateCombinations(i)); +// // } +// // } +// // +// // private static List calculateCombinations(int i) { +// // int arr[] = {1, 2, 3, 4, 5}; +// // int r = 3; +// // int n = arr.length; +// // ArrayList> result = new ArrayList<>(); +// // +// // return printCombination(arr, n, r); +// // } +// // +// // /* arr[] ---> Input Array +// // data[] ---> Temporary array to store current combination +// // start & end ---> Staring and Ending indexes in arr[] +// // index ---> Current index in data[] +// // r ---> Size of a combination to be printed */ +// // static void combinationUtil(int arr[], int data[], int start, +// // int end, int index, int r, ArrayList> result) { +// // // Current combination is ready to be printed, print it +// // ArrayList tmpResult = new ArrayList<>(); +// // +// // if (index == r) { +// // ArrayList tmpResult = new ArrayList<>(); +// // for (int j = 0; j < r; j++) +// // System.out.print(data[j] + " "); +// // System.out.println(""); +// // return; +// // } +// // +// // // replace index with all possible elements. The condition +// // // "end-i+1 >= r-index" makes sure that including one element +// // // at index will make a combination with remaining elements +// // // at remaining positions +// // for (int i = start; i <= end && end - i + 1 >= r - index; i++) { +// // data[index] = arr[i]; +// // combinationUtil(arr, data, i + 1, end, index + 1, r); +// // } +// // } +// // +// // // The main function that prints all combinations of size r +// // // in arr[] of size n. This function mainly uses combinationUtil() +// // static void printCombination(int arr[], int n, int r) { +// // // A temporary array to store all combination one by one +// // int data[] = new int[r]; +// // +// // // Print all combination using temprary array 'data[]' +// // combinationUtil(arr, data, 0, n - 1, 0, r); +// // } +// +// // public static void calculateForAll(List corpus, Statistics stats, String taxonomy) { +// // for (Sentence s : corpus) { +// // // disregard if wrong taxonomy +// // if (!(s.getTaxonomy().startsWith(taxonomy))) { +// // continue; +// // } +// // +// // calculateCommon(s, stats.result); +// // +// // for (Word word : s.getWords()) { +// // // skip if current word is not inflected +// // if (!(word.getMsd().length() > 0)) { +// // continue; +// // } +// // +// // String msd = word.getMsd(); +// // +// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); +// // +// // for (int i = 1; i < msd.length(); i++) { +// // entry.setCharAt(i, msd.charAt(i)); +// // Common.updateMap(stats.result, entry.toString()); +// // entry.setCharAt(i, '-'); +// // } +// // } +// // } +// // } +// +// // public static void calculateForAll(List corpus, Statistics stats) { +// // for (Sentence s : corpus) { +// // for (Word word : s.getWords()) { +// // if (!(word.getMsd().length() > 0)) { +// // continue; +// // } +// // +// // String msd = word.getMsd(); +// // +// // StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); +// // +// // for (int i = 1; i < msd.length(); i++) { +// // entry.setCharAt(i, msd.charAt(i)); +// // Common.updateMap(stats.result, entry.toString()); +// // entry.setCharAt(i, '-'); +// // } +// // } +// // } +// // } +// +// static void calculateForAll(List corpus, Statistics stats, String taxonomy) { +// for (Sentence s : corpus) { +// // disregard if wrong taxonomy +//// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { +//// continue; +//// } +// +// for (Word word : s.getWords()) { +// // skip if current word is not inflected +// if (!(word.getMsd().length() > 0)) { +// continue; +// } +// +// String msd = word.getMsd(); +// +// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); +// +// for (int i = 1; i < msd.length(); i++) { +// entry.setCharAt(i, msd.charAt(i)); +// Common.updateMap(stats.result, entry.toString()); +// entry.setCharAt(i, '-'); +// } // } - - for (Word word : s.getWords()) { - // skip if current word is not inflected - if (!(word.getMsd().length() > 0)) { - continue; - } - - String msd = word.getMsd(); - - StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); - - for (int i = 1; i < msd.length(); i++) { - entry.setCharAt(i, msd.charAt(i)); - Common.updateMap(stats.result, entry.toString()); - entry.setCharAt(i, '-'); - } - } - } - } - - public static void calculateForAll(List corpus, StatisticsNew stats, String taxonomy) { - for (Sentence s : corpus) { - - for (Word word : s.getWords()) { - // skip if current word is not inflected - // // TODO: if has defined msd and is of correct type (create a set) - // if (!(word.getMsd().length() > 0)) { - // continue; - // } - - String msd = word.getMsd(); - - StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); - - for (int i = 1; i < msd.length(); i++) { - entry.setCharAt(i, msd.charAt(i)); - stats.updateResults(entry.toString()); - entry.setCharAt(i, '-'); - } - } - } - } -} +// } +// } +// +// public static void calculateForAll(List corpus, StatisticsNew stats, String taxonomy) { +// for (Sentence s : corpus) { +// +// for (Word word : s.getWords()) { +// // skip if current word is not inflected +// // // TODO: if has defined msd and is of correct type (create a set) +// // if (!(word.getMsd().length() > 0)) { +// // continue; +// // } +// +// String msd = word.getMsd(); +// +// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); +// +// for (int i = 1; i < msd.length(); i++) { +// entry.setCharAt(i, msd.charAt(i)); +// stats.updateResults(entry.toString()); +// entry.setCharAt(i, '-'); +// } +// } +// } +// } +//} diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index c0c0155..a973a16 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -43,12 +43,12 @@ public class Ngrams { List ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue()); // if msd regex is set and this candidate doesn't pass it, skip this iteration - if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) { + if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { continue; } // generate proper MultipleHMKeys depending on filter data - String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor()); + String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts()); // if last letter is ',' erase it @@ -67,14 +67,14 @@ public class Ngrams { multipleKeys = new MultipleHMKeys1(key); break; case 1: - String k1_2 = wordToString(ngramCandidate, otherKeys.get(0)); + String k1_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) // k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2; multipleKeys = new MultipleHMKeys2(key, k1_2); break; case 2: - String k2_2 = wordToString(ngramCandidate, otherKeys.get(0)); - String k2_3 = wordToString(ngramCandidate, otherKeys.get(1)); + String k2_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k2_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2; // k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3; @@ -82,9 +82,9 @@ public class Ngrams { multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3); break; case 3: - String k3_2 = wordToString(ngramCandidate, otherKeys.get(0)); - String k3_3 = wordToString(ngramCandidate, otherKeys.get(1)); - String k3_4 = wordToString(ngramCandidate, otherKeys.get(2)); + String k3_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k3_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); + String k3_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2; // k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3; @@ -93,10 +93,10 @@ public class Ngrams { multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4); break; case 4: - String k4_2 = wordToString(ngramCandidate, otherKeys.get(0)); - String k4_3 = wordToString(ngramCandidate, otherKeys.get(1)); - String k4_4 = wordToString(ngramCandidate, otherKeys.get(2)); - String k4_5 = wordToString(ngramCandidate, otherKeys.get(3)); + String k4_2 = wordToString(ngramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k4_3 = wordToString(ngramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); + String k4_4 = wordToString(ngramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); + String k4_5 = wordToString(ngramCandidate, otherKeys.get(3), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2; // k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3; @@ -137,7 +137,7 @@ public class Ngrams { /** * Checks whether an ngram candidate passes specified regex filter. */ - private static boolean passesRegex(List ngramCandidate, ArrayList regex) { + private static boolean passesRegex(List ngramCandidate, ArrayList regex, ArrayList wordParts) { if (ngramCandidate.size() != regex.size()) { logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway return false; @@ -145,7 +145,7 @@ public class Ngrams { for (int i = 0; i < regex.size(); i++) { //if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) { - if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern() + ".*")) { + if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) { return false; } } @@ -153,33 +153,33 @@ public class Ngrams { return true; } - private static String wordToString(List ngramCandidate, CalculateFor calculateFor) { + private static String wordToString(List ngramCandidate, CalculateFor calculateFor, ArrayList wordParts) { ArrayList candidate = new ArrayList<>(ngramCandidate.size()); switch (calculateFor) { case LEMMA: candidate.addAll(ngramCandidate .stream() - .map(Word::getLemma) + .map(w -> w.getLemma(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case WORD: candidate.addAll(ngramCandidate .stream() - .map(Word::getWord) + .map(w -> w.getWord(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case MORPHOSYNTACTIC_SPECS: case MORPHOSYNTACTIC_PROPERTY: candidate.addAll(ngramCandidate .stream() - .map(Word::getMsd) + .map(w -> w.getMsd(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); case WORD_TYPE: candidate.addAll(ngramCandidate .stream() - .map(w -> Character.toString(w.getMsd().charAt(0))) + .map(w -> Character.toString(w.getMsd(wordParts).charAt(0))) .collect(Collectors.toList())); // candidate.addAll(ngramCandidate // .stream() @@ -190,7 +190,7 @@ public class Ngrams { case NORMALIZED_WORD: candidate.addAll(ngramCandidate .stream() - .map(Word::getNormalizedWord) + .map(w -> w.getNormalizedWord(wordParts)) .collect(Collectors.toList())); return StringUtils.join(candidate, " "); } @@ -208,14 +208,14 @@ public class Ngrams { for (Sentence s : corpus) { for (Word w : s.getWords()) { List taxonomy = s.getTaxonomy(); - String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv()); + String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts()); // skip this iteration if: // - word doesn't contain a proper version (missing lemma for example) // - msd regex is given but this word's msd doesn't match it, skip this iteration // - given substring length is larger than the word length if (ValidationUtil.isEmpty(word) - || stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern()) + || stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern()) || word.length() < stats.getFilter().getStringLength()) { continue; } @@ -331,7 +331,7 @@ public class Ngrams { private static void validateAndCountSkipgramCandidate(ArrayList skipgramCandidate, StatisticsNew stats, List taxonomy) { // count if no regex is set or if it is & candidate passes it - if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) { + if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) { // String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); // key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key; // stats.updateTaxonomyResults(new MultipleHMKeys1(key), @@ -340,7 +340,7 @@ public class Ngrams { ArrayList otherKeys = stats.getFilter().getMultipleKeys(); - String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); + String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor(), stats.getFilter().getWordParts()); // if last letter is ',' erase it @@ -359,14 +359,14 @@ public class Ngrams { multipleKeys = new MultipleHMKeys1(key); break; case 1: - String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0)); + String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) // k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2; multipleKeys = new MultipleHMKeys2(key, k1_2); break; case 2: - String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0)); - String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1)); + String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2; // k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3; @@ -374,9 +374,9 @@ public class Ngrams { multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3); break; case 3: - String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0)); - String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1)); - String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2)); + String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); + String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2; // k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3; @@ -385,10 +385,10 @@ public class Ngrams { multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4); break; case 4: - String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0)); - String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1)); - String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2)); - String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3)); + String k4_2 = wordToString(skipgramCandidate, otherKeys.get(0), stats.getFilter().getWordParts()); + String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1), stats.getFilter().getWordParts()); + String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2), stats.getFilter().getWordParts()); + String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3), stats.getFilter().getWordParts()); // if (stats.getFilter().getNotePunctuations()) { // k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2; // k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3; diff --git a/src/main/java/alg/word/WordCount.java b/src/main/java/alg/word/WordCount.java index 31a37d3..5ee2160 100755 --- a/src/main/java/alg/word/WordCount.java +++ b/src/main/java/alg/word/WordCount.java @@ -10,84 +10,84 @@ import data.Sentence; import data.Statistics; import data.Word; -class WordCount { - private static void calculateNoFilter(List corpus, Statistics stats) { - for (Sentence s : corpus) { - List sentence = new ArrayList<>(s.getWords().size()); - - if (stats.getCf() == CalculateFor.LEMMA) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getLemma) - .collect(Collectors.toList())); - } else if (stats.getCf() == CalculateFor.WORD) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getWord) - .collect(Collectors.toList())); - } - - for (String word : sentence) { - Common.updateMap(stats.result, word); - } - } - } - - private static void calculateVCC(List corpus, Statistics stats) { - for (Sentence s : corpus) { - List sentence = new ArrayList<>(s.getWords().size()); - - if (stats.getCf() == CalculateFor.LEMMA) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getCVVLemma) - .collect(Collectors.toList())); - } else if (stats.getCf() == CalculateFor.WORD) { - sentence.addAll(s.getWords() - .stream() - .map(Word::getCVVWord) - .collect(Collectors.toList())); - } - - for (String word : sentence) { - if (word.length() > stats.getSubstringLength()) { - for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) { - String substring = word.substring(i, i + stats.getSubstringLength()); - Common.updateMap(stats.result, substring); - } - } - } - } - } - - private static void calculateForJosType(List corpus, Statistics stats) { - for (Sentence s : corpus) { - List sentence = new ArrayList<>(s.getWords().size()); - List filteredWords = new ArrayList<>(); - - for (Word word : s.getWords()) { - if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) { - filteredWords.add(word); - } - } - - if (stats.getCf() == CalculateFor.LEMMA) { - sentence.addAll(filteredWords - .stream() - .map(Word::getLemma) - .collect(Collectors.toList())); - } else if (stats.getCf() == CalculateFor.WORD) { - sentence.addAll(filteredWords - .stream() - .map(Word::getWord) - .collect(Collectors.toList())); - } - - for (String word : sentence) { - Common.updateMap(stats.result, word); - } - } - } +//class WordCount { +// private static void calculateNoFilter(List corpus, Statistics stats) { +// for (Sentence s : corpus) { +// List sentence = new ArrayList<>(s.getWords().size()); +// +// if (stats.getCf() == CalculateFor.LEMMA) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getLemma) +// .collect(Collectors.toList())); +// } else if (stats.getCf() == CalculateFor.WORD) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getWord) +// .collect(Collectors.toList())); +// } +// +// for (String word : sentence) { +// Common.updateMap(stats.result, word); +// } +// } +// } +// +// private static void calculateVCC(List corpus, Statistics stats) { +// for (Sentence s : corpus) { +// List sentence = new ArrayList<>(s.getWords().size()); +// +// if (stats.getCf() == CalculateFor.LEMMA) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getCVVLemma) +// .collect(Collectors.toList())); +// } else if (stats.getCf() == CalculateFor.WORD) { +// sentence.addAll(s.getWords() +// .stream() +// .map(Word::getCVVWord) +// .collect(Collectors.toList())); +// } +// +// for (String word : sentence) { +// if (word.length() > stats.getSubstringLength()) { +// for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) { +// String substring = word.substring(i, i + stats.getSubstringLength()); +// Common.updateMap(stats.result, substring); +// } +// } +// } +// } +// } +// +// private static void calculateForJosType(List corpus, Statistics stats) { +// for (Sentence s : corpus) { +// List sentence = new ArrayList<>(s.getWords().size()); +// List filteredWords = new ArrayList<>(); +// +// for (Word word : s.getWords()) { +// if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) { +// filteredWords.add(word); +// } +// } +// +// if (stats.getCf() == CalculateFor.LEMMA) { +// sentence.addAll(filteredWords +// .stream() +// .map(Word::getLemma) +// .collect(Collectors.toList())); +// } else if (stats.getCf() == CalculateFor.WORD) { +// sentence.addAll(filteredWords +// .stream() +// .map(Word::getWord) +// .collect(Collectors.toList())); +// } +// +// for (String word : sentence) { +// Common.updateMap(stats.result, word); +// } +// } +// } // private static void calculateForTaxonomyAndJosType(List corpus, Statistics stats) { // for (Sentence s : corpus) { @@ -164,4 +164,4 @@ class WordCount { // } // } // } -} \ No newline at end of file +//} \ No newline at end of file diff --git a/src/main/java/alg/word/WordLevel.java b/src/main/java/alg/word/WordLevel.java index 5809c71..94ea255 100755 --- a/src/main/java/alg/word/WordLevel.java +++ b/src/main/java/alg/word/WordLevel.java @@ -34,8 +34,8 @@ public class WordLevel { public static void calculateForAll(List corpus, StatisticsNew stats) { for (Sentence s : corpus) { for (Word word : s.getWords()) { - calculateForSuffixes(word.getWord(), stats); - calculateForPrefixes(word.getWord(), stats); + calculateForSuffixes(word.getWord(stats.getFilter().getWordParts()), stats); + calculateForPrefixes(word.getWord(stats.getFilter().getWordParts()), stats); } } } diff --git a/src/main/java/data/Corpus.java b/src/main/java/data/Corpus.java index 6bc35bd..017ecb5 100755 --- a/src/main/java/data/Corpus.java +++ b/src/main/java/data/Corpus.java @@ -8,6 +8,7 @@ import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import javafx.collections.FXCollections; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -15,6 +16,7 @@ import org.apache.logging.log4j.Logger; import data.Enums.solar.SolarFilters; import gui.ValidationUtil; import javafx.collections.ObservableList; +import org.controlsfx.control.CheckComboBox; public class Corpus { public final static Logger logger = LogManager.getLogger(Corpus.class); @@ -82,6 +84,11 @@ public class Corpus { public ObservableList getTaxonomy() { return taxonomy; } +// +// public ObservableList getFormattedTaxonomy() { +// ArrayList al = Tax.getTaxonomyFormatted(new ArrayList<>(taxonomy), corpusType); +// return FXCollections.observableArrayList(al); +// } public void setTaxonomy(ObservableList taxonomy) { this.taxonomy = taxonomy; diff --git a/src/main/java/data/Filter.java b/src/main/java/data/Filter.java index 0f5ee59..6cc3e43 100755 --- a/src/main/java/data/Filter.java +++ b/src/main/java/data/Filter.java @@ -2,10 +2,7 @@ package data; import static data.Filter.filterName.*; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; +import java.util.*; import java.util.regex.Pattern; import gui.ValidationUtil; @@ -17,6 +14,7 @@ public class Filter { public enum filterName { ANALYSIS_LEVEL, CALCULATE_FOR, + WORD_PARTS, NGRAM_VALUE, SKIP_VALUE, IS_CVV, @@ -36,6 +34,7 @@ public class Filter { public Filter() { filter = new HashMap<>(); filter.put(WRITE_MSD_AT_THE_END, false); + filter.put(WORD_PARTS, new ArrayList()); } public Filter(AnalysisLevel al, CalculateFor cf) { @@ -43,6 +42,10 @@ public class Filter { filter.put(ANALYSIS_LEVEL, al); filter.put(CALCULATE_FOR, cf); + + filter.put(WORD_PARTS, new ArrayList()); + addWordPart(cf); + filter.put(WRITE_MSD_AT_THE_END, false); } @@ -56,6 +59,8 @@ public class Filter { public void setCalculateFor(CalculateFor cf) { filter.put(CALCULATE_FOR, cf); + filter.put(WORD_PARTS, new ArrayList()); + addWordPart(cf); } public CalculateFor getCalculateFor() { @@ -137,6 +142,8 @@ public class Filter { public void setHasMsd(boolean hasMsd) { filter.put(HAS_MSD, hasMsd); + if (hasMsd) + addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS); } public boolean hasMsd() { @@ -170,7 +177,9 @@ public class Filter { ArrayList newKeys = new ArrayList<>(); if (keys != null) { for (String key : keys) { - newKeys.add(CalculateFor.factory(key)); + CalculateFor cf = CalculateFor.factory(key); + newKeys.add(cf); + addWordPart(cf); } } @@ -185,6 +194,14 @@ public class Filter { } } + public ArrayList getWordParts() { + if (filter.containsKey(WORD_PARTS) && filter.get(WORD_PARTS) != null) { + return (ArrayList) filter.get(WORD_PARTS); + } else { + return new ArrayList<>(); + } + } + public void setNotePunctuations(boolean notePunctuations) { filter.put(NOTE_PUNCTUATIONS, notePunctuations); } @@ -209,4 +226,32 @@ public class Filter { public Integer getMinimalTaxonomy() { return (Integer) filter.get(MINIMAL_TAXONOMY); } + + private void addWordPart(CalculateFor wp){ + ArrayList oldWp = ((ArrayList) filter.get(WORD_PARTS)); + + switch (wp) { + case WORD: + case DIST_WORDS: + if (!oldWp.contains(CalculateFor.WORD)) + oldWp.add(CalculateFor.WORD); + break; + case LEMMA: + case DIST_LEMMAS: + if (!oldWp.contains(CalculateFor.LEMMA)) + oldWp.add(CalculateFor.LEMMA); + break; + case MORPHOSYNTACTIC_PROPERTY: + case MORPHOSYNTACTIC_SPECS: + case WORD_TYPE: + if (!oldWp.contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) + oldWp.add(CalculateFor.MORPHOSYNTACTIC_SPECS); + break; + case NORMALIZED_WORD: + if (!oldWp.contains(CalculateFor.NORMALIZED_WORD)) + oldWp.add(CalculateFor.NORMALIZED_WORD); + break; + } + + } } diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index 6324fd0..bd71203 100755 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -16,67 +16,67 @@ public class Tax { // GIGAFIDA ---------------------------- GIGAFIDA_TAXONOMY = new LinkedHashMap<>(); - GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk"); - GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno"); - GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno"); - GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno"); - GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično"); - GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis"); - GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija"); - GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo"); - GIGAFIDA_TAXONOMY.put("SSJ.I", "internet"); - - GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik"); - GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni"); - GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski"); - GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno"); - GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno"); - - GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst"); - GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška"); - GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna"); - GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna"); - GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano"); - GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da"); - GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne"); + GIGAFIDA_TAXONOMY.put("SSJ.T", "SSJ.T - tisk"); + GIGAFIDA_TAXONOMY.put("SSJ.T.K", "SSJ.T.K - tisk-knjižno"); + GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "SSJ.T.K.L - tisk-knjižno-leposlovno"); + GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "SSJ.T.K.S - tisk-knjižno-strokovno"); + GIGAFIDA_TAXONOMY.put("SSJ.T.P", "SSJ.T.P - tisk-periodično"); + GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "SSJ.T.P.C - tisk-periodično-časopis"); + GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "SSJ.T.P.R - tisk-periodično-revija"); + GIGAFIDA_TAXONOMY.put("SSJ.T.D", "SSJ.T.D - tisk-drugo"); + GIGAFIDA_TAXONOMY.put("SSJ.I", "SSJ.I - internet"); + + GIGAFIDA_TAXONOMY.put("Ft.P", "Ft.P - prenosnik"); + GIGAFIDA_TAXONOMY.put("Ft.P.G", "Ft.P.G - prenosnik-govorni"); + GIGAFIDA_TAXONOMY.put("Ft.P.E", "Ft.P.E - prenosnik-elektronski"); + GIGAFIDA_TAXONOMY.put("Ft.P.P", "Ft.P.P - prenosnik-pisni"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "Ft.P.P.O - prenosnik-pisni-objavljeno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "Ft.P.P.N - prenosnik-pisni-neobjavljeno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno"); + GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno"); + + GIGAFIDA_TAXONOMY.put("Ft.Z", "Ft.Z - zvrst"); + GIGAFIDA_TAXONOMY.put("Ft.Z.U", "Ft.Z.U - zvrst-umetnostna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "Ft.Z.U.P - zvrst-umetnostna-pesniška"); + GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "Ft.Z.U.R - zvrst-umetnostna-prozna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "Ft.Z.U.D - zvrst-umetnostna-dramska"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N", "Ft.Z.N - zvrst-neumetnostna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "Ft.Z.N.S - zvrst-neumetnostna-strokovna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "Ft.Z.N.N - zvrst-neumetnostna-nestrokovna"); + GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "Ft.Z.N.P - zvrst-neumetnostna-pravna"); + GIGAFIDA_TAXONOMY.put("Ft.L", "Ft.L - zvrst-lektorirano"); + GIGAFIDA_TAXONOMY.put("Ft.L.D", "Ft.L.D - zvrst-lektorirano-da"); + GIGAFIDA_TAXONOMY.put("Ft.L.N", "Ft.L.N - zvrst-lektorirano-ne"); // GOS ---------------------------------- GOS_TAXONOMY = new LinkedHashMap<>(); - GOS_TAXONOMY.put("gos.T", "diskurz"); - GOS_TAXONOMY.put("gos.T.J", "diskurz-javni"); - GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni"); - GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni"); - GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni"); - GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni"); - GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni"); - - GOS_TAXONOMY.put("gos.S", "situacija"); - GOS_TAXONOMY.put("gos.S.R", "situacija-radio"); - GOS_TAXONOMY.put("gos.S.T", "situacija-televizija"); + GOS_TAXONOMY.put("gos.T", "gos.T - diskurz"); + GOS_TAXONOMY.put("gos.T.J", "gos.T.J - diskurz-javni"); + GOS_TAXONOMY.put("gos.T.J.I", "gos.T.J.I - diskurz-javni-informativno-izobraževalni"); + GOS_TAXONOMY.put("gos.T.J.R", "gos.T.J.R - diskurz-javni-razvedrilni"); + GOS_TAXONOMY.put("gos.T.N", "gos.T.N - diskurz-nejavni"); + GOS_TAXONOMY.put("gos.T.N.N", "gos.T.N.N - diskurz-nejavni-nezasebni"); + GOS_TAXONOMY.put("gos.T.N.Z", "gos.T.N.Z - diskurz-nejavni-zasebni"); + + GOS_TAXONOMY.put("gos.S", "gos.S - situacija"); + GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio"); + GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija"); } /** @@ -147,6 +147,33 @@ public class Tax { return result; } +// public static ArrayList getTaxonomyFormatted(ArrayList taxonomyNames, CorpusType corpusType) { +// ArrayList result = new ArrayList<>(); +// +// if (ValidationUtil.isEmpty(taxonomyNames)) { +// return result; +// } +// +// LinkedHashMap tax = new LinkedHashMap<>(); +// +// if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) { +// tax = GIGAFIDA_TAXONOMY; +// } else if (corpusType == CorpusType.GOS) { +// tax = GOS_TAXONOMY; +// } +// +// // for easier lookup +// Map taxInversed = tax.entrySet() +// .stream() +// .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); +// +// for (String taxonomyName : taxonomyNames) { +// result.add(taxInversed.get(taxonomyName) + " - " + taxonomyName); +// } +// +// return result; +// } + /** * Returns a list of proper names for codes * diff --git a/src/main/java/data/Word.java b/src/main/java/data/Word.java index 0fc115f..1dc6ba7 100755 --- a/src/main/java/data/Word.java +++ b/src/main/java/data/Word.java @@ -1,110 +1,94 @@ package data; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -import data.Enums.Msd; -import gui.ValidationUtil; - -public class Word implements Serializable { - public static final char PAD_CHARACTER = '-'; - - private String word; - private String lemma; - private String msd; - private String normalizedWord; - private final HashSet VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u')); - - /** - * Possible values: - *

- *

    - *
  • S = samostalnik
  • - *
  • G = glagol
  • - *
  • P = pridevnik
  • - *
  • R = prislov
  • - *
  • Z = zaimek
  • - *
  • K = števnik
  • - *
  • D = predlog
  • - *
  • V = veznik
  • - *
  • L = členek
  • - *
  • M = medmet
  • - *
  • O = okrajšava
  • - *
  • N = neuvrščeno
  • - *
- */ - //private char besedna_vrsta; - public Word(String word, String lemma, String msd) { - this.lemma = lemma; - this.msd = msd; //normalizeMsd(msd); - this.normalizedWord = ""; - - // veliko zacetnico ohranimo samo za lastna imena - if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S' - && this.msd.length() >= 2 - && this.msd.charAt(1) == 'l')) { - this.word = word.toLowerCase(); - } else { - this.word = word; +import java.util.Objects; + +/* +Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. +*/ +public interface Word { + String getW1(); + default String getW2(){ return null; } + default String getW3(){ return null; } + default String getW4(){ return null; } + + default String get(ArrayList wordParts, CalculateFor cf){ + if (wordParts.size() > 0 && wordParts.get(0).equals(cf)) + return getW1(); + if (wordParts.size() > 1 && wordParts.get(1).equals(cf)) + return getW2(); + if (wordParts.size() > 2 && wordParts.get(2).equals(cf)) + return getW3(); + if (wordParts.size() > 3 && wordParts.get(3).equals(cf)) + return getW4(); + return null; + } + + default String getWord(ArrayList wordParts){ + return get(wordParts, CalculateFor.WORD); + } + + default String getLemma(ArrayList wordParts){ + return get(wordParts, CalculateFor.LEMMA); + } + + default String getMsd(ArrayList wordParts){ + return get(wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS); + } + + default String getNormalizedWord(ArrayList wordParts){ + return get(wordParts, CalculateFor.NORMALIZED_WORD); + } + + void setW1(String w); + default void setW2(String w){} + default void setW3(String w){} + default void setW4(String w){} + + default void set(String w, ArrayList wordParts, CalculateFor cf){ + switch(wordParts.indexOf(cf)){ + case 0: + setW1(w); + break; + case 1: + setW2(w); + break; + case 2: + setW3(w); + break; + case 3: + setW4(w); + break; } } - public Word(String word, String lemma, String msd, String normalizedWord) { - this.lemma = lemma; -// this.msd = normalizeMsd(msd); - this.msd = msd; - this.normalizedWord = normalizedWord; - - // veliko zacetnico ohranimo samo za lastna imena - if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S' - && this.msd.length() >= 2 - && this.msd.charAt(1) == 'l')) { - this.word = word.toLowerCase(); - } else { - this.word = word; - } + default void setLemma(String w, ArrayList wordParts){ + set(w, wordParts, CalculateFor.LEMMA); } - public Word() { + default void setMsd(String w, ArrayList wordParts){ + set(w, wordParts, CalculateFor.MORPHOSYNTACTIC_SPECS); } -// /** -// * Appends a number of '-' to msds which are not properly sized. -// * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd) -// * -// * @param msdInput -// * -// * @return -// */ -// private String normalizeMsd(String msdInput) { -// if (ValidationUtil.isEmpty(msdInput)) { -// return ""; -// } else { -// return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER); -// } -// } - - public Word(String word) { - this.word = word; + default void setNormalizedWord(String w, ArrayList wordParts){ + set(w, wordParts, CalculateFor.NORMALIZED_WORD); } - public String getWord() { - return word; - } - public String getCVVWord() { - return covertToCvv(word); + default String getCVVWord(ArrayList cf) { + return covertToCvv(getWord(cf)); } - public String getCVVLemma() { - return covertToCvv(lemma); + default String getCVVLemma(ArrayList cf) { + return covertToCvv(getLemma(cf)); } - private String covertToCvv(String s) { + default String covertToCvv(String s) { + final HashSet VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u')); + char[] StringCA = s.toCharArray(); for (int i = 0; i < StringCA.length; i++) { @@ -114,59 +98,13 @@ public class Word implements Serializable { return new String(StringCA); } - public void setWord(String word) { - this.word = word; - } - - public String getLemma() { - return lemma; - } - - public void setLemma(String lemma) { - this.lemma = lemma; - } - - public String getMsd() { - return msd; - } - - public void setMsd(String msd) { - this.msd = msd; - } - - public String getNormalizedWord() { - return normalizedWord; - } - - public void setNormalizedWord(String normalizedWord) { - this.normalizedWord = normalizedWord; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append("beseda:\t") - .append(getWord()) - .append("\n") - .append("lema:\t") - .append(getLemma()) - .append("\n") - .append("msd:\t") - .append(getMsd()) - .append("normalized word:\t") - .append(getNormalizedWord()) - .append("\n"); - - return sb.toString(); - } - - public String getForCf(CalculateFor calculateFor, boolean cvv) { + default String getForCf(CalculateFor calculateFor, boolean cvv, ArrayList cf) { String returnValue = ""; if (cvv) { - returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma(); + returnValue = calculateFor == CalculateFor.WORD ? getCVVWord(cf) : getCVVLemma(cf); } else { - returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma(); + returnValue = calculateFor == CalculateFor.WORD ? getWord(cf) : getLemma(cf); } return returnValue; diff --git a/src/main/java/data/Word1.java b/src/main/java/data/Word1.java new file mode 100755 index 0000000..ecabc87 --- /dev/null +++ b/src/main/java/data/Word1.java @@ -0,0 +1,17 @@ +package data; + +import java.io.Serializable; + +public class Word1 implements Serializable, Word { + private String w1; + + public Word1(String w1) { + this.w1 = w1; + } + + public String getW1() { + return w1; + } + + public void setW1(String w){w1 = w;} +} diff --git a/src/main/java/data/Word2.java b/src/main/java/data/Word2.java new file mode 100755 index 0000000..a335f02 --- /dev/null +++ b/src/main/java/data/Word2.java @@ -0,0 +1,22 @@ +package data; + +import java.io.Serializable; + +public class Word2 implements Serializable, Word { + private String w1, w2; + + public Word2(String w1, String w2) { + this.w1 = w1; + this.w2 = w2; + } + + public String getW1() { + return w1; + } + public String getW2() { + return w2; + } + + public void setW1(String w){w1 = w;} + public void setW2(String w){w2 = w;} +} diff --git a/src/main/java/data/Word3.java b/src/main/java/data/Word3.java new file mode 100755 index 0000000..c387f3d --- /dev/null +++ b/src/main/java/data/Word3.java @@ -0,0 +1,27 @@ +package data; + +import java.io.Serializable; + +public class Word3 implements Serializable, Word { + private String w1, w2, w3; + + public Word3(String w1, String w2, String w3) { + this.w1 = w1; + this.w2 = w2; + this.w3 = w3; + } + + public String getW1() { + return w1; + } + public String getW2() { + return w2; + } + public String getW3() { + return w3; + } + + public void setW1(String w){w1 = w;} + public void setW2(String w){w2 = w;} + public void setW3(String w){w3 = w;} +} diff --git a/src/main/java/data/Word4.java b/src/main/java/data/Word4.java new file mode 100755 index 0000000..b9767f2 --- /dev/null +++ b/src/main/java/data/Word4.java @@ -0,0 +1,32 @@ +package data; + +import java.io.Serializable; + +public class Word4 implements Serializable, Word { + private String w1, w2, w3, w4; + + public Word4(String w1, String w2, String w3, String w4) { + this.w1 = w1; + this.w2 = w2; + this.w3 = w3; + this.w4 = w4; + } + + public String getW1() { + return w1; + } + public String getW2() { + return w2; + } + public String getW3() { + return w3; + } + public String getW4() { + return w4; + } + + public void setW1(String w){w1 = w;} + public void setW2(String w){w2 = w;} + public void setW3(String w){w3 = w;} + public void setW4(String w){w4 = w;} +} diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index 6bcf801..f141727 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -10,6 +10,7 @@ import java.util.*; import java.util.regex.Pattern; import javafx.application.HostServices; +import javafx.collections.transformation.SortedList; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -380,87 +381,87 @@ public class StringAnalysisTabNew2 { * iscvv: false * string length: 1 */ - public void populateFields() { - // corpus changed if: current one is null (this is first run of the app) - // or if currentCorpus != gui's corpus - boolean corpusChanged = currentCorpusType == null - || currentCorpusType != corpus.getCorpusType(); - - // keep ngram value if set - if (ngramValue == null) { - ngramValueCB.getSelectionModel().select("1"); - ngramValue = 1; - } - - // TODO: check for GOS, GIGAFIDA, SOLAR... - // refresh and: - // TODO if current value != null && is in new calculateFor ? keep : otherwise reset - if (calculateFor == null) { - calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0)); - calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0)); - } - - if (!filter.hasMsd()) { - // if current corpus doesn't have msd data, disable this field - msd = new ArrayList<>(); - msdTF.setText(""); - msdTF.setDisable(true); - logger.info("no msd data"); - } else { - if (ValidationUtil.isEmpty(msd) - || (!ValidationUtil.isEmpty(msd) && corpusChanged)) { - // msd has not been set previously - // or msd has been set but the corpus changed -> reset - msd = new ArrayList<>(); - msdTF.setText(""); - msdTF.setDisable(false); - logger.info("msd reset"); - } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) { - // if msd has been set, but corpus type remained the same, we can keep any set msd value - msdTF.setText(StringUtils.join(msdStrings, " ")); - msdTF.setDisable(false); - logger.info("msd kept"); - } - } - - // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection) - - // keep skip value - if (skipValue == null) { - skipValueCB.getSelectionModel().select("0"); - skipValue = 0; - } - - // keep calculateCvv - calculatecvvCB.setSelected(calculateCvv); - - // keep string length if set - if (stringLength != null) { - stringLengthTF.setText(String.valueOf(stringLength)); - } else { - stringLengthTF.setText("1"); - stringLength = 1; - } - - // TODO: trigger on rescan - if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { - // user changed corpus (by type) or by selection & triggered a rescan of headers - // see if we read taxonomy from headers, otherwise use default values for given corpus - ObservableList tax = corpus.getTaxonomy(); - taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); - - currentCorpusType = corpus.getCorpusType(); - // setTaxonomyIsDirty(false); - } else { - - } - - // see if we read taxonomy from headers, otherwise use default values for given corpus - ObservableList tax = corpus.getTaxonomy(); - taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); - taxonomyCCB.getItems().addAll(taxonomyCCBValues); - - } +// public void populateFields() { +// // corpus changed if: current one is null (this is first run of the app) +// // or if currentCorpus != gui's corpus +// boolean corpusChanged = currentCorpusType == null +// || currentCorpusType != corpus.getCorpusType(); +// +// // keep ngram value if set +// if (ngramValue == null) { +// ngramValueCB.getSelectionModel().select("1"); +// ngramValue = 1; +// } +// +// // TODO: check for GOS, GIGAFIDA, SOLAR... +// // refresh and: +// // TODO if current value != null && is in new calculateFor ? keep : otherwise reset +// if (calculateFor == null) { +// calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0)); +// calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0)); +// } +// +// if (!filter.hasMsd()) { +// // if current corpus doesn't have msd data, disable this field +// msd = new ArrayList<>(); +// msdTF.setText(""); +// msdTF.setDisable(true); +// logger.info("no msd data"); +// } else { +// if (ValidationUtil.isEmpty(msd) +// || (!ValidationUtil.isEmpty(msd) && corpusChanged)) { +// // msd has not been set previously +// // or msd has been set but the corpus changed -> reset +// msd = new ArrayList<>(); +// msdTF.setText(""); +// msdTF.setDisable(false); +// logger.info("msd reset"); +// } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) { +// // if msd has been set, but corpus type remained the same, we can keep any set msd value +// msdTF.setText(StringUtils.join(msdStrings, " ")); +// msdTF.setDisable(false); +// logger.info("msd kept"); +// } +// } +// +// // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection) +// +// // keep skip value +// if (skipValue == null) { +// skipValueCB.getSelectionModel().select("0"); +// skipValue = 0; +// } +// +// // keep calculateCvv +// calculatecvvCB.setSelected(calculateCvv); +// +// // keep string length if set +// if (stringLength != null) { +// stringLengthTF.setText(String.valueOf(stringLength)); +// } else { +// stringLengthTF.setText("1"); +// stringLength = 1; +// } +// +// // TODO: trigger on rescan +// if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { +// // user changed corpus (by type) or by selection & triggered a rescan of headers +// // see if we read taxonomy from headers, otherwise use default values for given corpus +// ObservableList tax = corpus.getTaxonomy(); +// taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); +// +// currentCorpusType = corpus.getCorpusType(); +// // setTaxonomyIsDirty(false); +// } else { +// +// } +// +// // see if we read taxonomy from headers, otherwise use default values for given corpus +// ObservableList tax = corpus.getTaxonomy(); +// taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); +// taxonomyCCB.getItems().addAll(taxonomyCCBValues); +// +// } /** * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc., diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index 0b9f812..574db17 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -125,9 +125,11 @@ public class Export { // for (Map value : taxonomyResults.values()) { for (CalculateFor otherKey : filter.getMultipleKeys()) { - FILE_HEADER_AL.add(otherKey.toHeaderString()); - if (otherKey.equals(CalculateFor.LEMMA)) - FILE_HEADER_AL.add("Lema male črke"); + if (num_taxonomy_frequencies.get(otherKey) > 0) { + FILE_HEADER_AL.add(otherKey.toHeaderString()); + if (otherKey.equals(CalculateFor.LEMMA)) + FILE_HEADER_AL.add("Lema male črke"); + } } // if(otherKey.equals(CalculateFor.LEMMA)){ @@ -164,7 +166,7 @@ public class Export { // } FILE_HEADER_AL.add("Skupna relativna pogostost (na milijon pojavitev)"); for (String key : taxonomyResults.keySet()) { - if(!key.equals("Total")) { + if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) { FILE_HEADER_AL.add("Absolutna pogostost [" + key + "]"); FILE_HEADER_AL.add("Delež [" + key + "]"); FILE_HEADER_AL.add("Relativna pogostost [" + key + "]"); @@ -257,7 +259,7 @@ public class Export { dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_frequencies)); dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_frequencies)); for (String key : taxonomyResults.keySet()){ - if(!key.equals("Total")) { + if(!key.equals("Total") && num_taxonomy_frequencies.get(key) > 0) { AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); dataEntry.add(frequency.toString()); dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_taxonomy_frequencies.get(key))); diff --git a/src/main/resources/gui/StringAnalysisTabNew2.fxml b/src/main/resources/gui/StringAnalysisTabNew2.fxml index f032763..0e6ed6e 100755 --- a/src/main/resources/gui/StringAnalysisTabNew2.fxml +++ b/src/main/resources/gui/StringAnalysisTabNew2.fxml @@ -13,6 +13,7 @@ + @@ -80,7 +81,16 @@