From edcd8062bc9cbc813234b9da5c68fe224f1ebcf8 Mon Sep 17 00:00:00 2001 From: Luka Date: Wed, 31 Jul 2019 17:53:32 +0200 Subject: [PATCH] Added GOS normalized words analysis in letter extraction + Fixing normalized words bugs with anonymous names in GOS (when extracting data with in collocability) --- src/main/java/data/MultipleHMKeys2.java | 4 ++ src/main/java/data/MultipleHMKeys3.java | 4 ++ src/main/java/data/MultipleHMKeys4.java | 4 ++ src/main/java/data/MultipleHMKeys5.java | 4 ++ src/main/java/data/Word.java | 9 ++- src/main/java/gui/CharacterAnalysisTab.java | 69 +++++++++++++++++---- 6 files changed, 80 insertions(+), 14 deletions(-) diff --git a/src/main/java/data/MultipleHMKeys2.java b/src/main/java/data/MultipleHMKeys2.java index fded2b8..562c726 100755 --- a/src/main/java/data/MultipleHMKeys2.java +++ b/src/main/java/data/MultipleHMKeys2.java @@ -27,6 +27,10 @@ public final class MultipleHMKeys2 implements MultipleHMKeys { String[] splitedK1 = k1.split("\\s+"); String[] splitedK2 = k2.split("\\s+"); for(int i = 0; i < splitedK1.length; i ++){ + // in GOS words and normalized words may not both have specific word due to anon + if(!(i < splitedK2.length)){ + continue; + } MultipleHMKeys search = new MultipleHMKeys2(splitedK1[i], splitedK2[i]); r.add(search); } diff --git a/src/main/java/data/MultipleHMKeys3.java b/src/main/java/data/MultipleHMKeys3.java index 5c0a1b6..281c6bb 100755 --- a/src/main/java/data/MultipleHMKeys3.java +++ b/src/main/java/data/MultipleHMKeys3.java @@ -33,6 +33,10 @@ public final class MultipleHMKeys3 implements MultipleHMKeys { String[] splitedK2 = k2.split("\\s+"); String[] splitedK3 = k3.split("\\s+"); for(int i = 0; i < splitedK1.length; i ++){ + // in GOS words and normalized words may not both have specific word due to anon + if(!(i < splitedK2.length && i < splitedK3.length)){ + continue; + } MultipleHMKeys search = new MultipleHMKeys3(splitedK1[i], splitedK2[i], splitedK3[i]); r.add(search); } diff --git a/src/main/java/data/MultipleHMKeys4.java b/src/main/java/data/MultipleHMKeys4.java index c7862fc..3c39811 100755 --- a/src/main/java/data/MultipleHMKeys4.java +++ b/src/main/java/data/MultipleHMKeys4.java @@ -39,6 +39,10 @@ public final class MultipleHMKeys4 implements MultipleHMKeys { String[] splitedK3 = k3.split("\\s+"); String[] splitedK4 = k4.split("\\s+"); for(int i = 0; i < splitedK1.length; i ++){ + // in GOS words and normalized words may not both have specific word due to anon + if(!(i < splitedK2.length && i < splitedK3.length && i < splitedK4.length)){ + continue; + } MultipleHMKeys search = new MultipleHMKeys4(splitedK1[i], splitedK2[i], splitedK3[i], splitedK4[i]); r.add(search); } diff --git a/src/main/java/data/MultipleHMKeys5.java b/src/main/java/data/MultipleHMKeys5.java index b999f9f..ad17026 100755 --- a/src/main/java/data/MultipleHMKeys5.java +++ b/src/main/java/data/MultipleHMKeys5.java @@ -45,6 +45,10 @@ public final class MultipleHMKeys5 implements MultipleHMKeys { String[] splitedK4 = k4.split("\\s+"); String[] splitedK5 = k5.split("\\s+"); for(int i = 0; i < splitedK1.length; i ++){ + // in GOS words and normalized words may not both have specific word due to anon + if(!(i < splitedK2.length && i < splitedK3.length && i < splitedK4.length && i < splitedK5.length)){ + continue; + } MultipleHMKeys search = new MultipleHMKeys5(splitedK1[i], splitedK2[i], splitedK3[i], splitedK4[i], splitedK5[i]); r.add(search); } diff --git a/src/main/java/data/Word.java b/src/main/java/data/Word.java index 94e1ce0..22b1a54 100755 --- a/src/main/java/data/Word.java +++ b/src/main/java/data/Word.java @@ -108,7 +108,14 @@ public interface Word { if (cvv) { returnValue = (calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LOWERCASE_WORD) ? getCVVWord(cf) : getCVVLemma(cf); } else { - returnValue = (calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LOWERCASE_WORD) ? getWord(cf) : getLemma(cf); + if (calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LOWERCASE_WORD){ + returnValue = getWord(cf); + } else if (calculateFor == CalculateFor.LEMMA) { + returnValue = getLemma(cf); + } else if (calculateFor == CalculateFor.NORMALIZED_WORD){ + returnValue = getNormalizedWord(cf); + } +// returnValue = (calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LOWERCASE_WORD) ? getWord(cf) : getLemma(cf); } return returnValue; diff --git a/src/main/java/gui/CharacterAnalysisTab.java b/src/main/java/gui/CharacterAnalysisTab.java index 378f8cf..65129a8 100755 --- a/src/main/java/gui/CharacterAnalysisTab.java +++ b/src/main/java/gui/CharacterAnalysisTab.java @@ -161,6 +161,7 @@ public class CharacterAnalysisTab { private HashMap> solarFiltersMap; private HostServices hostService; private ListChangeListener taxonomyListener; + private ChangeListener calculateForListener; private ChangeListener msdListener; private ChangeListener minimalOccurrencesListener; private ChangeListener minimalTaxonomyListener; @@ -169,6 +170,9 @@ public class CharacterAnalysisTab { private static final String [] N_GRAM_COMPUTE_FOR_LETTERS_ARRAY = {"calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA"}; private static final ArrayList N_GRAM_COMPUTE_FOR_LETTERS = new ArrayList<>(Arrays.asList(N_GRAM_COMPUTE_FOR_LETTERS_ARRAY)); + private static final String [] N_GRAM_COMPUTE_FOR_WORDS_GOS_ARRAY = {"calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD"}; + private static final ArrayList N_GRAM_COMPUTE_FOR_WORDS_GOS = new ArrayList<>(Arrays.asList(N_GRAM_COMPUTE_FOR_WORDS_GOS_ARRAY)); + private static final String [] TAXONOMY_SET_OPERATION_ARRAY = {"taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"}; private static final ArrayList TAXONOMY_SET_OPERATION = new ArrayList<>(Arrays.asList(TAXONOMY_SET_OPERATION_ARRAY)); @@ -184,14 +188,46 @@ public class CharacterAnalysisTab { currentMode = MODE.LETTER; toggleMode(currentMode); - calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> { - if(newValue == null){ - newValue = I18N.getTranslatedValue(oldValue, N_GRAM_COMPUTE_FOR_LETTERS); - calculateForCB.getSelectionModel().select(newValue); + if (calculateForListener != null){ + calculateForCB.valueProperty().removeListener(calculateForListener); + } + + // calculateForCB + calculateForListener = new ChangeListener() { + boolean ignoreCode = false; + @Override + public void changed(ObservableValue observable, String oldValue, String newValue) { + if (ignoreCode) { + return; + } + boolean languageChanged = newValue == null; + if (languageChanged) { + ignoreCode = true; + if (corpus.getCorpusType() == CorpusType.GOS) { + newValue = I18N.getTranslatedValue(oldValue, N_GRAM_COMPUTE_FOR_WORDS_GOS); + calculateForCB.getSelectionModel().select(newValue); + } else { + newValue = I18N.getTranslatedValue(oldValue, N_GRAM_COMPUTE_FOR_LETTERS); + calculateForCB.getSelectionModel().select(newValue); + } + ignoreCode = false; + } + + calculateFor = CalculateFor.factory(newValue); + logger.info("calculateForCB:", calculateFor.toString()); } - calculateFor = CalculateFor.factory(newValue); - logger.info("calculateForCB:", calculateFor.toString()); - }); + }; + + calculateForCB.valueProperty().addListener(calculateForListener); + +// calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> { +// if(newValue == null){ +// newValue = I18N.getTranslatedValue(oldValue, N_GRAM_COMPUTE_FOR_LETTERS); +// calculateForCB.getSelectionModel().select(newValue); +// } +// calculateFor = CalculateFor.factory(newValue); +// logger.info("calculateForCB:", calculateFor.toString()); +// }); calculateForCB.getSelectionModel().select(0); @@ -497,12 +533,19 @@ public class CharacterAnalysisTab { stringLengthTF.setText(String.valueOf(stringLength)); } - // if calculateFor was selected for something other than a word or a lemma -> reset - if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA || calculateFor == CalculateFor.LOWERCASE_WORD)) { - // if the user selected something else before selecting ngram for letters, reset that choice - calculateFor = CalculateFor.WORD; - - calculateForCB.getSelectionModel().select(0); +// // if calculateFor was selected for something other than a word or a lemma -> reset +// if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA || calculateFor == CalculateFor.LOWERCASE_WORD)) { +// // if the user selected something else before selecting ngram for letters, reset that choice +// calculateFor = CalculateFor.WORD; +// +// calculateForCB.getSelectionModel().select(0); +// } + if (corpus.getCorpusType() == CorpusType.GOS) { + calculateForCB.itemsProperty().unbind(); + calculateForCB.itemsProperty().bind(I18N.createObjectBinding(N_GRAM_COMPUTE_FOR_WORDS_GOS)); + } else { + calculateForCB.itemsProperty().unbind(); + calculateForCB.itemsProperty().bind(I18N.createObjectBinding(N_GRAM_COMPUTE_FOR_LETTERS)); } }