diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index d86feba..2080d71 100644 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -44,11 +44,14 @@ public class Ngrams { // generate proper MultipleHMKeys depending on filter data String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor()); +// String key = "aaaaaaaaaaaaaaaaaaaaaaa"; + String lemma = ""; String wordType = ""; String msd = ""; for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){ if(otherKey.toString().equals("lema")){ +// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; lemma = wordToString(ngramCandidate, otherKey); } else if(otherKey.toString().equals("besedna vrsta")){ wordType = wordToString(ngramCandidate, otherKey).substring(0, 1); @@ -222,7 +225,8 @@ public class Ngrams { private static void validateAndCountSkipgramCandidate(ArrayList skipgramCandidate, StatisticsNew stats) { // count if no regex is set or if it is & candidate passes it if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) { - stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor())); + stats.updateTaxonomyResults(new MultipleHMKeys(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), "", "", ""), + stats.getCorpus().getTaxonomy()); } } } diff --git a/src/main/java/data/MultipleHMKeys.java b/src/main/java/data/MultipleHMKeys.java index e816382..910611c 100644 --- a/src/main/java/data/MultipleHMKeys.java +++ b/src/main/java/data/MultipleHMKeys.java @@ -1,10 +1,13 @@ package data; + +import java.util.Objects; + /* Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously. */ public final class MultipleHMKeys { private final String key, lemma, wordType, msd; - + private MultipleHMKeys actual_obj; public MultipleHMKeys(String key) { this.key = key; this.lemma = ""; @@ -37,12 +40,7 @@ public final class MultipleHMKeys { @Override public int hashCode() { -// if(key2 == null){ -// return key1.hashCode(); -// } else if (key3 == null){ -// return key1.hashCode() ^ key2.hashCode(); -// } - return key.hashCode() ^ lemma.hashCode() ^ wordType.hashCode() ^ msd.hashCode(); + return Objects.hash(key, lemma, wordType, msd); } @Override diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index 115a57b..4eed481 100644 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -304,6 +304,10 @@ public class StatisticsNew { } + public Map> getTaxonomyResult() { + return taxonomyResult; + } + public void updateResults(String o) { // if not in map AtomicLong r = result.putIfAbsent(o, new AtomicLong(1)); diff --git a/src/test/java/Common.java b/src/test/java/Common.java index 7ff525d..b699247 100644 --- a/src/test/java/Common.java +++ b/src/test/java/Common.java @@ -16,27 +16,29 @@ public class Common { Sentence testSentence; // full sentence + ArrayList taxonomy = new ArrayList<>(); + taxonomy.add("#Ft.Z.N.N"); List words = new ArrayList<>(); - words.add(new Word("ker", "ker", "Vd")); - words.add(new Word("ima", "imeti", "Ggnste-n")); - words.add(new Word("junak", "junak", "Somei")); - words.add(new Word("v", "v", "Dm")); - words.add(new Word("posesti", "posest", "Sozem")); - words.add(new Word("nekaj", "nekaj", "Rsn")); - words.add(new Word("o", "o", "Dm")); - words.add(new Word("čemer", "kar", "Zz-sem")); - words.add(new Word("se", "se", "Zp------k")); - words.add(new Word("mu", "on", "Zotmed--k")); - words.add(new Word("ne", "ne", "L")); - words.add(new Word("sanja", "sanjati", "Ggnste")); - words.add(new Word("a", "a", "Vp")); - words.add(new Word("se", "se", "Zp------k")); - words.add(new Word("onemu", "oni", "Zk-sed")); - words.add(new Word("zdi", "zdeti", "Ggnste")); - words.add(new Word("ključno", "ključen", "Ppnsei")); - words.add(new Word("pri", "pri", "Dm")); - words.add(new Word("operaciji", "operacija", "Sozem")); - words.add(new Word("666", "666", "Kag")); + words.add(new Word("ker", "ker", "Vd", taxonomy)); + words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy)); + words.add(new Word("junak", "junak", "Somei", taxonomy)); + words.add(new Word("v", "v", "Dm", taxonomy)); + words.add(new Word("posesti", "posest", "Sozem", taxonomy)); + words.add(new Word("nekaj", "nekaj", "Rsn", taxonomy)); + words.add(new Word("o", "o", "Dm", taxonomy)); + words.add(new Word("čemer", "kar", "Zz-sem", taxonomy)); + words.add(new Word("se", "se", "Zp------k", taxonomy)); + words.add(new Word("mu", "on", "Zotmed--k", taxonomy)); + words.add(new Word("ne", "ne", "L", taxonomy)); + words.add(new Word("sanja", "sanjati", "Ggnste", taxonomy)); + words.add(new Word("a", "a", "Vp", taxonomy)); + words.add(new Word("se", "se", "Zp------k", taxonomy)); + words.add(new Word("onemu", "oni", "Zk-sed", taxonomy)); + words.add(new Word("zdi", "zdeti", "Ggnste", taxonomy)); + words.add(new Word("ključno", "ključen", "Ppnsei", taxonomy)); + words.add(new Word("pri", "pri", "Dm", taxonomy)); + words.add(new Word("operaciji", "operacija", "Sozem", taxonomy)); + words.add(new Word("666", "666", "Kag", taxonomy)); testSentence = new Sentence(words, "#Ft.Z.N.N"); corpus = new ArrayList<>(); @@ -49,11 +51,11 @@ public class Common { // five word sentence words = new ArrayList<>(); - words.add(new Word("ker", "ker", "Vd")); - words.add(new Word("ima", "imeti", "Ggnste-n")); - words.add(new Word("junak", "junak", "Somei")); - words.add(new Word("ima", "imeti", "Ggnste-n")); - words.add(new Word("posesti", "posest", "Sozem")); + words.add(new Word("ker", "ker", "Vd", taxonomy)); + words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy)); + words.add(new Word("junak", "junak", "Somei", taxonomy)); + words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy)); + words.add(new Word("posesti", "posest", "Sozem", taxonomy)); testSentence = new Sentence(words, "#Ft.Z.N.N"); midCorpus = new ArrayList<>(); @@ -61,11 +63,11 @@ public class Common { // five word sentence - for skipgrams words = new ArrayList<>(); - words.add(new Word("ker", "ker", "Vd")); - words.add(new Word("ima", "imeti", "Ggnste-n")); - words.add(new Word("junak", "junak", "Somei")); - words.add(new Word("v", "v", "Dm")); - words.add(new Word("posesti", "posest", "Sozem")); + words.add(new Word("ker", "ker", "Vd", taxonomy)); + words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy)); + words.add(new Word("junak", "junak", "Somei", taxonomy)); + words.add(new Word("v", "v", "Dm", taxonomy)); + words.add(new Word("posesti", "posest", "Sozem", taxonomy)); testSentence = new Sentence(words, "#Ft.Z.N.N"); midCorpusSkip = new ArrayList<>(); @@ -73,9 +75,9 @@ public class Common { // JOS test words = new ArrayList<>(); - words.add(new Word("junak", "junak", "Somei")); - words.add(new Word("ima", "imeti", "Ggnste-n")); - words.add(new Word("posesti", "posest", "Sozem")); + words.add(new Word("junak", "junak", "Somei", taxonomy)); + words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy)); + words.add(new Word("posesti", "posest", "Sozem", taxonomy)); testSentence = new Sentence(words, "#Ft.Z.N.N"); josTest = new ArrayList<>(); diff --git a/src/test/java/CorpusTests.java b/src/test/java/CorpusTests.java index ab7f59f..14f5274 100644 --- a/src/test/java/CorpusTests.java +++ b/src/test/java/CorpusTests.java @@ -15,7 +15,7 @@ public class CorpusTests { public void solarTest() { //File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Solar"); // File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS"); - File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Gigafida_subset"); + File selectedDirectory = new File("/home/luka/Developement/corpus-analyzer2/src/main/resources/Gigafida_subset"); Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator)); diff --git a/src/test/java/NgramTests.java b/src/test/java/NgramTests.java index bc3c447..ce794cc 100644 --- a/src/test/java/NgramTests.java +++ b/src/test/java/NgramTests.java @@ -3,7 +3,9 @@ import static org.junit.Assert.*; import java.util.*; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import javafx.collections.FXCollections; import org.junit.Test; import alg.ngram.Ngrams; @@ -21,10 +23,17 @@ public class NgramTests { filter.setStringLength(4); filter.setNgramValue(0); // letters filter.setCalculateFor(CalculateFor.WORD); + ArrayList tax= new ArrayList<>(); + tax.add("SSJ.T.P.C"); + filter.setTaxonomy(tax); + Corpus testCorpus = new Corpus(); testCorpus.setCorpusType(CorpusType.GIGAFIDA); testCorpus.setDetectedCorpusFiles(new ArrayList<>()); + ArrayList taxForCombo = new ArrayList<>(); + taxForCombo.add("SSJ.T.P.C"); + testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // tests: // - no regex @@ -103,15 +112,24 @@ public class NgramTests { @Test public void wordsNgramsTest() { - Map result = null; + Map> taxonomyResult; Filter filter = new Filter(); filter.setAl(AnalysisLevel.STRING_LEVEL); filter.setNgramValue(3); + ArrayList tax= new ArrayList<>(); + tax.add("SSJ.T.P.C"); + filter.setTaxonomy(tax); + ArrayList mKeys = new ArrayList<>(); + //mKeys.add("lema"); + filter.setMultipleKeys(mKeys); Corpus testCorpus = new Corpus(); testCorpus.setCorpusType(CorpusType.GIGAFIDA); testCorpus.setDetectedCorpusFiles(new ArrayList<>()); + ArrayList taxForCombo = new ArrayList<>(); + taxForCombo.add("SSJ.T.P.C"); + testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // tests: // - normal ngrams - word @@ -119,36 +137,36 @@ public class NgramTests { filter.setCalculateFor(CalculateFor.WORD); StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpus, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); - assertEquals(3, result.size()); - assertTrue(result.containsKey("ker ima junak")); - assertTrue(result.containsKey("ima junak ima")); - assertTrue(result.containsKey("junak ima posesti")); + assertEquals(3, taxonomyResult.get("Total").size()); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker ima junak", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak ima", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", ""))); // tests: // - normal ngrams - lemmas filter.setCalculateFor(CalculateFor.LEMMA); stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpus, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); - assertEquals(3, result.size()); - assertTrue(result.containsKey("ker imeti junak")); - assertTrue(result.containsKey("imeti junak imeti")); - assertTrue(result.containsKey("junak imeti posest")); + assertEquals(3, taxonomyResult.get("Total").size()); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker imeti junak", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("imeti junak imeti", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak imeti posest", "", "", ""))); // tests: // - normal ngrams - msd filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY); stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpus, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); - assertEquals(3, result.size()); - assertTrue(result.containsKey("Vd Ggnste-n Somei")); - assertTrue(result.containsKey("Ggnste-n Somei Ggnste-n")); - assertTrue(result.containsKey("Somei Ggnste-n Sozem")); + assertEquals(3, taxonomyResult.get("Total").size()); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Vd Ggnste-n Somei", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Ggnste-n Somei Ggnste-n", "", "", ""))); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Somei Ggnste-n Sozem", "", "", ""))); // tests: // - ngrams - word - regex filter @@ -161,10 +179,10 @@ public class NgramTests { stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpus, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); - assertEquals(1, result.size()); - assertTrue(result.containsKey("junak ima posesti")); + assertEquals(1, taxonomyResult.get("Total").size()); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", ""))); // tests: // - ngrams - word - regex filter @@ -177,10 +195,10 @@ public class NgramTests { stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpus, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); - assertEquals(1, result.size()); - assertTrue(result.containsKey("ima junak")); + assertEquals(1, taxonomyResult.get("Total").size()); + assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak", "", "", ""))); } @@ -273,25 +291,32 @@ public class NgramTests { @Test public void skipgramsTest() { - Map result = null; + Map> taxonomyResult; Filter filter = new Filter(); filter.setAl(AnalysisLevel.STRING_LEVEL); filter.setCalculateFor(CalculateFor.WORD); + ArrayList tax= new ArrayList<>(); + tax.add("SSJ.T.P.C"); + filter.setTaxonomy(tax); Corpus testCorpus = new Corpus(); testCorpus.setCorpusType(CorpusType.GIGAFIDA); testCorpus.setDetectedCorpusFiles(new ArrayList<>()); + ArrayList taxForCombo = new ArrayList<>(); + taxForCombo.add("tisk-periodično-časopis"); + testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // tests: // - bigrams filter.setNgramValue(2); StatisticsNew stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpusSkip, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); Set bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti")); - Set bigramsActual = result.keySet(); + Set bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); + Set bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList())); assertEquals(bigrams, bigramsActual); // test: @@ -300,10 +325,11 @@ public class NgramTests { filter.setSkipValue(2); stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpusSkip, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); Set twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti")); - Set twoSkipBigramsActual = result.keySet(); + Set twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); + Set twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList())); assertEquals(twoSkipBigrams, twoSkipBigramsActual); @@ -313,9 +339,10 @@ public class NgramTests { filter.setSkipValue(null); stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpusSkip, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); Set trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti")); - Set trigramsActual = result.keySet(); + Set trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); + Set trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList())); assertEquals(trigrams, trigramsActual); @@ -325,9 +352,10 @@ public class NgramTests { filter.setSkipValue(2); stats = new StatisticsNew(testCorpus, filter, false); Ngrams.calculateForAll(Common.midCorpusSkip, stats); - result = stats.getResult(); + taxonomyResult = stats.getTaxonomyResult(); HashSet twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti")); - Set twoSkipTrigramsActual = result.keySet(); + Set twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet(); + Set twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList())); assertEquals(twoSkipTrigrams, twoSkipTrigramsActual); } diff --git a/src/test/java/WordFormationTest.java b/src/test/java/WordFormationTest.java index c725977..0b79f98 100644 --- a/src/test/java/WordFormationTest.java +++ b/src/test/java/WordFormationTest.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; +import javafx.collections.FXCollections; import org.junit.Test; import alg.inflectedJOS.WordFormation; @@ -22,6 +23,9 @@ public class WordFormationTest { Corpus testCorpus = new Corpus(); testCorpus.setCorpusType(CorpusType.GIGAFIDA); testCorpus.setDetectedCorpusFiles(new ArrayList<>()); + ArrayList taxForCombo = new ArrayList<>(); + taxForCombo.add("tisk-periodično-časopis"); + testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo)); // tests: // - normal ngrams - word