Fixed slow combination of words and lemmas presentation
This commit is contained in:
parent
c073e12f55
commit
84d0086a66
|
@ -44,11 +44,14 @@ public class Ngrams {
|
|||
|
||||
// generate proper MultipleHMKeys depending on filter data
|
||||
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
|
||||
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
|
||||
|
||||
String lemma = "";
|
||||
String wordType = "";
|
||||
String msd = "";
|
||||
for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
|
||||
if(otherKey.toString().equals("lema")){
|
||||
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
|
||||
lemma = wordToString(ngramCandidate, otherKey);
|
||||
} else if(otherKey.toString().equals("besedna vrsta")){
|
||||
wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
|
||||
|
@ -222,7 +225,8 @@ public class Ngrams {
|
|||
/**
 * Counts a skipgram candidate in {@code stats} if it passes the filter's optional MSD check.
 *
 * <p>The candidate is counted when the filter has no MSD set, or when {@code passesRegex}
 * accepts the candidate against the filter's MSD value. Otherwise nothing is recorded.
 *
 * @param skipgramCandidate the words that make up the candidate skipgram
 * @param stats             supplies the filter and corpus, and receives the counts
 */
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
||||
// count if no regex is set or if it is & candidate passes it
|
||||
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
||||
// NOTE(review): this diff view carries no +/- markers; the updateResults call on the next line and the
// updateTaxonomyResults call after it may be the old and new form of the same step - confirm which is live.
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
|
||||
// The key is built from the candidate string only; the three remaining MultipleHMKeys
// constructor arguments are passed as empty strings.
stats.updateTaxonomyResults(new MultipleHMKeys(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), "", "", ""),
|
||||
stats.getCorpus().getTaxonomy());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
package data;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/*
|
||||
Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously.
|
||||
*/
|
||||
public final class MultipleHMKeys {
|
||||
private final String key, lemma, wordType, msd;
|
||||
|
||||
private MultipleHMKeys actual_obj;
|
||||
public MultipleHMKeys(String key) {
|
||||
this.key = key;
|
||||
this.lemma = "";
|
||||
|
@ -37,12 +40,7 @@ public final class MultipleHMKeys {
|
|||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
// if(key2 == null){
|
||||
// return key1.hashCode();
|
||||
// } else if (key3 == null){
|
||||
// return key1.hashCode() ^ key2.hashCode();
|
||||
// }
|
||||
return key.hashCode() ^ lemma.hashCode() ^ wordType.hashCode() ^ msd.hashCode();
|
||||
return Objects.hash(key, lemma, wordType, msd);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -304,6 +304,10 @@ public class StatisticsNew {
|
|||
|
||||
}
|
||||
|
||||
/**
 * Returns the taxonomy result map held in {@code taxonomyResult}.
 *
 * <p>The map is returned directly, without copying. Each outer entry maps a string to an
 * inner map from {@link MultipleHMKeys} to an {@link AtomicLong}.
 * NOTE(review): the tests in this change look up the outer entry "Total"; presumably the
 * other outer keys are taxonomy names - confirm against the code that fills the map.
 *
 * @return the {@code taxonomyResult} map
 */
public Map<String, Map<MultipleHMKeys, AtomicLong>> getTaxonomyResult() {
|
||||
return taxonomyResult;
|
||||
}
|
||||
|
||||
public void updateResults(String o) {
|
||||
// if not in map
|
||||
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
|
||||
|
|
|
@ -16,27 +16,29 @@ public class Common {
|
|||
Sentence testSentence;
|
||||
|
||||
// full sentence
|
||||
ArrayList<String> taxonomy = new ArrayList<>();
|
||||
taxonomy.add("#Ft.Z.N.N");
|
||||
List<Word> words = new ArrayList<>();
|
||||
words.add(new Word("ker", "ker", "Vd"));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
words.add(new Word("junak", "junak", "Somei"));
|
||||
words.add(new Word("v", "v", "Dm"));
|
||||
words.add(new Word("posesti", "posest", "Sozem"));
|
||||
words.add(new Word("nekaj", "nekaj", "Rsn"));
|
||||
words.add(new Word("o", "o", "Dm"));
|
||||
words.add(new Word("čemer", "kar", "Zz-sem"));
|
||||
words.add(new Word("se", "se", "Zp------k"));
|
||||
words.add(new Word("mu", "on", "Zotmed--k"));
|
||||
words.add(new Word("ne", "ne", "L"));
|
||||
words.add(new Word("sanja", "sanjati", "Ggnste"));
|
||||
words.add(new Word("a", "a", "Vp"));
|
||||
words.add(new Word("se", "se", "Zp------k"));
|
||||
words.add(new Word("onemu", "oni", "Zk-sed"));
|
||||
words.add(new Word("zdi", "zdeti", "Ggnste"));
|
||||
words.add(new Word("ključno", "ključen", "Ppnsei"));
|
||||
words.add(new Word("pri", "pri", "Dm"));
|
||||
words.add(new Word("operaciji", "operacija", "Sozem"));
|
||||
words.add(new Word("666", "666", "Kag"));
|
||||
words.add(new Word("ker", "ker", "Vd", taxonomy));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
|
||||
words.add(new Word("junak", "junak", "Somei", taxonomy));
|
||||
words.add(new Word("v", "v", "Dm", taxonomy));
|
||||
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
|
||||
words.add(new Word("nekaj", "nekaj", "Rsn", taxonomy));
|
||||
words.add(new Word("o", "o", "Dm", taxonomy));
|
||||
words.add(new Word("čemer", "kar", "Zz-sem", taxonomy));
|
||||
words.add(new Word("se", "se", "Zp------k", taxonomy));
|
||||
words.add(new Word("mu", "on", "Zotmed--k", taxonomy));
|
||||
words.add(new Word("ne", "ne", "L", taxonomy));
|
||||
words.add(new Word("sanja", "sanjati", "Ggnste", taxonomy));
|
||||
words.add(new Word("a", "a", "Vp", taxonomy));
|
||||
words.add(new Word("se", "se", "Zp------k", taxonomy));
|
||||
words.add(new Word("onemu", "oni", "Zk-sed", taxonomy));
|
||||
words.add(new Word("zdi", "zdeti", "Ggnste", taxonomy));
|
||||
words.add(new Word("ključno", "ključen", "Ppnsei", taxonomy));
|
||||
words.add(new Word("pri", "pri", "Dm", taxonomy));
|
||||
words.add(new Word("operaciji", "operacija", "Sozem", taxonomy));
|
||||
words.add(new Word("666", "666", "Kag", taxonomy));
|
||||
|
||||
testSentence = new Sentence(words, "#Ft.Z.N.N");
|
||||
corpus = new ArrayList<>();
|
||||
|
@ -49,11 +51,11 @@ public class Common {
|
|||
|
||||
// five word sentence
|
||||
words = new ArrayList<>();
|
||||
words.add(new Word("ker", "ker", "Vd"));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
words.add(new Word("junak", "junak", "Somei"));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
words.add(new Word("posesti", "posest", "Sozem"));
|
||||
words.add(new Word("ker", "ker", "Vd", taxonomy));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
|
||||
words.add(new Word("junak", "junak", "Somei", taxonomy));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
|
||||
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
|
||||
testSentence = new Sentence(words, "#Ft.Z.N.N");
|
||||
|
||||
midCorpus = new ArrayList<>();
|
||||
|
@ -61,11 +63,11 @@ public class Common {
|
|||
|
||||
// five word sentence - for skipgrams
|
||||
words = new ArrayList<>();
|
||||
words.add(new Word("ker", "ker", "Vd"));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
words.add(new Word("junak", "junak", "Somei"));
|
||||
words.add(new Word("v", "v", "Dm"));
|
||||
words.add(new Word("posesti", "posest", "Sozem"));
|
||||
words.add(new Word("ker", "ker", "Vd", taxonomy));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
|
||||
words.add(new Word("junak", "junak", "Somei", taxonomy));
|
||||
words.add(new Word("v", "v", "Dm", taxonomy));
|
||||
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
|
||||
testSentence = new Sentence(words, "#Ft.Z.N.N");
|
||||
|
||||
midCorpusSkip = new ArrayList<>();
|
||||
|
@ -73,9 +75,9 @@ public class Common {
|
|||
|
||||
// JOS test
|
||||
words = new ArrayList<>();
|
||||
words.add(new Word("junak", "junak", "Somei"));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n"));
|
||||
words.add(new Word("posesti", "posest", "Sozem"));
|
||||
words.add(new Word("junak", "junak", "Somei", taxonomy));
|
||||
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
|
||||
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
|
||||
testSentence = new Sentence(words, "#Ft.Z.N.N");
|
||||
|
||||
josTest = new ArrayList<>();
|
||||
|
|
|
@ -15,7 +15,7 @@ public class CorpusTests {
|
|||
public void solarTest() {
|
||||
//File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Solar");
|
||||
// File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
|
||||
File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Gigafida_subset");
|
||||
File selectedDirectory = new File("/home/luka/Developement/corpus-analyzer2/src/main/resources/Gigafida_subset");
|
||||
|
||||
Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));
|
||||
|
||||
|
|
|
@ -3,7 +3,9 @@ import static org.junit.Assert.*;
|
|||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import org.junit.Test;
|
||||
|
||||
import alg.ngram.Ngrams;
|
||||
|
@ -21,10 +23,17 @@ public class NgramTests {
|
|||
filter.setStringLength(4);
|
||||
filter.setNgramValue(0); // letters
|
||||
filter.setCalculateFor(CalculateFor.WORD);
|
||||
ArrayList<String> tax= new ArrayList<>();
|
||||
tax.add("SSJ.T.P.C");
|
||||
filter.setTaxonomy(tax);
|
||||
|
||||
|
||||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("SSJ.T.P.C");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - no regex
|
||||
|
@ -103,15 +112,24 @@ public class NgramTests {
|
|||
|
||||
@Test
|
||||
public void wordsNgramsTest() {
|
||||
Map<String, AtomicLong> result = null;
|
||||
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
|
||||
Filter filter = new Filter();
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setNgramValue(3);
|
||||
ArrayList<String> tax= new ArrayList<>();
|
||||
tax.add("SSJ.T.P.C");
|
||||
filter.setTaxonomy(tax);
|
||||
ArrayList<String> mKeys = new ArrayList<>();
|
||||
//mKeys.add("lema");
|
||||
filter.setMultipleKeys(mKeys);
|
||||
|
||||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("SSJ.T.P.C");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - normal ngrams - word
|
||||
|
@ -119,36 +137,36 @@ public class NgramTests {
|
|||
filter.setCalculateFor(CalculateFor.WORD);
|
||||
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
assertEquals(3, result.size());
|
||||
assertTrue(result.containsKey("ker ima junak"));
|
||||
assertTrue(result.containsKey("ima junak ima"));
|
||||
assertTrue(result.containsKey("junak ima posesti"));
|
||||
assertEquals(3, taxonomyResult.get("Total").size());
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker ima junak", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak ima", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));
|
||||
|
||||
// tests:
|
||||
// - normal ngrams - lemmas
|
||||
filter.setCalculateFor(CalculateFor.LEMMA);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
assertEquals(3, result.size());
|
||||
assertTrue(result.containsKey("ker imeti junak"));
|
||||
assertTrue(result.containsKey("imeti junak imeti"));
|
||||
assertTrue(result.containsKey("junak imeti posest"));
|
||||
assertEquals(3, taxonomyResult.get("Total").size());
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker imeti junak", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("imeti junak imeti", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak imeti posest", "", "", "")));
|
||||
|
||||
// tests:
|
||||
// - normal ngrams - msd
|
||||
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
assertEquals(3, result.size());
|
||||
assertTrue(result.containsKey("Vd Ggnste-n Somei"));
|
||||
assertTrue(result.containsKey("Ggnste-n Somei Ggnste-n"));
|
||||
assertTrue(result.containsKey("Somei Ggnste-n Sozem"));
|
||||
assertEquals(3, taxonomyResult.get("Total").size());
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Vd Ggnste-n Somei", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Ggnste-n Somei Ggnste-n", "", "", "")));
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Somei Ggnste-n Sozem", "", "", "")));
|
||||
|
||||
// tests:
|
||||
// - ngrams - word - regex filter
|
||||
|
@ -161,10 +179,10 @@ public class NgramTests {
|
|||
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
assertEquals(1, result.size());
|
||||
assertTrue(result.containsKey("junak ima posesti"));
|
||||
assertEquals(1, taxonomyResult.get("Total").size());
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));
|
||||
|
||||
// tests:
|
||||
// - ngrams - word - regex filter
|
||||
|
@ -177,10 +195,10 @@ public class NgramTests {
|
|||
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpus, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
assertEquals(1, result.size());
|
||||
assertTrue(result.containsKey("ima junak"));
|
||||
assertEquals(1, taxonomyResult.get("Total").size());
|
||||
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak", "", "", "")));
|
||||
}
|
||||
|
||||
|
||||
|
@ -273,25 +291,32 @@ public class NgramTests {
|
|||
|
||||
@Test
|
||||
public void skipgramsTest() {
|
||||
Map<String, AtomicLong> result = null;
|
||||
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
|
||||
|
||||
Filter filter = new Filter();
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setCalculateFor(CalculateFor.WORD);
|
||||
ArrayList<String> tax= new ArrayList<>();
|
||||
tax.add("SSJ.T.P.C");
|
||||
filter.setTaxonomy(tax);
|
||||
|
||||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("tisk-periodično-časopis");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - bigrams
|
||||
filter.setNgramValue(2);
|
||||
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
|
||||
Set<String> bigramsActual = result.keySet();
|
||||
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
|
||||
assertEquals(bigrams, bigramsActual);
|
||||
|
||||
// test:
|
||||
|
@ -300,10 +325,11 @@ public class NgramTests {
|
|||
filter.setSkipValue(2);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
|
||||
Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
|
||||
Set<String> twoSkipBigramsActual = result.keySet();
|
||||
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(twoSkipBigrams, twoSkipBigramsActual);
|
||||
|
||||
|
@ -313,9 +339,10 @@ public class NgramTests {
|
|||
filter.setSkipValue(null);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
|
||||
Set<String> trigramsActual = result.keySet();
|
||||
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(trigrams, trigramsActual);
|
||||
|
||||
|
@ -325,9 +352,10 @@ public class NgramTests {
|
|||
filter.setSkipValue(2);
|
||||
stats = new StatisticsNew(testCorpus, filter, false);
|
||||
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
|
||||
result = stats.getResult();
|
||||
taxonomyResult = stats.getTaxonomyResult();
|
||||
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
|
||||
Set<String> twoSkipTrigramsActual = result.keySet();
|
||||
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
|
||||
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
|
||||
|
||||
assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ import java.util.ArrayList;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import org.junit.Test;
|
||||
|
||||
import alg.inflectedJOS.WordFormation;
|
||||
|
@ -22,6 +23,9 @@ public class WordFormationTest {
|
|||
Corpus testCorpus = new Corpus();
|
||||
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
|
||||
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
taxForCombo.add("tisk-periodično-časopis");
|
||||
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
|
||||
|
||||
// tests:
|
||||
// - normal ngrams - word
|
||||
|
|
Loading…
Reference in New Issue
Block a user