Fixed slow combination of words and lemmas presentation

This commit is contained in:
Luka 2018-07-17 16:04:26 +02:00
parent c073e12f55
commit 84d0086a66
7 changed files with 113 additions and 73 deletions

View File

@ -44,11 +44,14 @@ public class Ngrams {
// generate proper MultipleHMKeys depending on filter data
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
String lemma = "";
String wordType = "";
String msd = "";
for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
if(otherKey.toString().equals("lema")){
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
lemma = wordToString(ngramCandidate, otherKey);
} else if(otherKey.toString().equals("besedna vrsta")){
wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
@ -222,7 +225,8 @@ public class Ngrams {
/**
 * Counts a skipgram candidate in the statistics if it is accepted by the filter.
 *
 * A candidate is accepted when the filter has no MSD restriction, or when the
 * candidate passes the filter's MSD regex check.
 *
 * @param skipgramCandidate words forming the candidate skipgram
 * @param stats statistics object that supplies the filter and corpus and receives the counts
 */
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
// NOTE(review): both the plain-string update below and the taxonomy update after it
// appear in this listing; in a diff rendering they may be the removed and the added
// line respectively - confirm which one is live before relying on getResult().
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
// Key is built from the candidate string only; the other three MultipleHMKeys
// components are passed as empty strings.
stats.updateTaxonomyResults(new MultipleHMKeys(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()), "", "", ""),
stats.getCorpus().getTaxonomy());
}
}
}

View File

@ -1,10 +1,13 @@
package data;
import java.util.Objects;
/*
Created for when words are sorted by multiple keys, i.e. not just lemmas but lemmas and msd simultaneously.
*/
public final class MultipleHMKeys {
private final String key, lemma, wordType, msd;
private MultipleHMKeys actual_obj;
public MultipleHMKeys(String key) {
this.key = key;
this.lemma = "";
@ -37,12 +40,7 @@ public final class MultipleHMKeys {
/**
 * Computes the hash code from the four key components: key, lemma, wordType and msd.
 *
 * @return hash code derived from key, lemma, wordType and msd
 */
@Override
public int hashCode() {
// Leftover commented-out variant that referred to key1/key2/key3 with null checks.
// if(key2 == null){
// return key1.hashCode();
// } else if (key3 == null){
// return key1.hashCode() ^ key2.hashCode();
// }
// NOTE(review): two consecutive return statements appear in this listing; in a diff
// rendering the XOR form is presumably the removed line and Objects.hash the added one - confirm.
// XOR is order-insensitive (swapping values between fields gives the same hash) and
// identical component hashes cancel each other out.
return key.hashCode() ^ lemma.hashCode() ^ wordType.hashCode() ^ msd.hashCode();
// Objects.hash combines the components in an order-sensitive way.
return Objects.hash(key, lemma, wordType, msd);
}
@Override

View File

@ -304,6 +304,10 @@ public class StatisticsNew {
}
/**
 * Returns the taxonomy result map held by this object.
 *
 * The map itself is returned, not a copy.
 * NOTE(review): the outer String key is presumably a taxonomy label (the tests in this
 * commit look up "Total") - confirm against updateTaxonomyResults.
 *
 * @return nested map of outer String key to per-MultipleHMKeys counters
 */
public Map<String, Map<MultipleHMKeys, AtomicLong>> getTaxonomyResult() {
return taxonomyResult;
}
public void updateResults(String o) {
// if not in map
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));

View File

@ -16,27 +16,29 @@ public class Common {
Sentence testSentence;
// full sentence
ArrayList<String> taxonomy = new ArrayList<>();
taxonomy.add("#Ft.Z.N.N");
List<Word> words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("nekaj", "nekaj", "Rsn"));
words.add(new Word("o", "o", "Dm"));
words.add(new Word("čemer", "kar", "Zz-sem"));
words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("mu", "on", "Zotmed--k"));
words.add(new Word("ne", "ne", "L"));
words.add(new Word("sanja", "sanjati", "Ggnste"));
words.add(new Word("a", "a", "Vp"));
words.add(new Word("se", "se", "Zp------k"));
words.add(new Word("onemu", "oni", "Zk-sed"));
words.add(new Word("zdi", "zdeti", "Ggnste"));
words.add(new Word("ključno", "ključen", "Ppnsei"));
words.add(new Word("pri", "pri", "Dm"));
words.add(new Word("operaciji", "operacija", "Sozem"));
words.add(new Word("666", "666", "Kag"));
words.add(new Word("ker", "ker", "Vd", taxonomy));
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
words.add(new Word("junak", "junak", "Somei", taxonomy));
words.add(new Word("v", "v", "Dm", taxonomy));
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
words.add(new Word("nekaj", "nekaj", "Rsn", taxonomy));
words.add(new Word("o", "o", "Dm", taxonomy));
words.add(new Word("čemer", "kar", "Zz-sem", taxonomy));
words.add(new Word("se", "se", "Zp------k", taxonomy));
words.add(new Word("mu", "on", "Zotmed--k", taxonomy));
words.add(new Word("ne", "ne", "L", taxonomy));
words.add(new Word("sanja", "sanjati", "Ggnste", taxonomy));
words.add(new Word("a", "a", "Vp", taxonomy));
words.add(new Word("se", "se", "Zp------k", taxonomy));
words.add(new Word("onemu", "oni", "Zk-sed", taxonomy));
words.add(new Word("zdi", "zdeti", "Ggnste", taxonomy));
words.add(new Word("ključno", "ključen", "Ppnsei", taxonomy));
words.add(new Word("pri", "pri", "Dm", taxonomy));
words.add(new Word("operaciji", "operacija", "Sozem", taxonomy));
words.add(new Word("666", "666", "Kag", taxonomy));
testSentence = new Sentence(words, "#Ft.Z.N.N");
corpus = new ArrayList<>();
@ -49,11 +51,11 @@ public class Common {
// five word sentence
words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("ker", "ker", "Vd", taxonomy));
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
words.add(new Word("junak", "junak", "Somei", taxonomy));
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
testSentence = new Sentence(words, "#Ft.Z.N.N");
midCorpus = new ArrayList<>();
@ -61,11 +63,11 @@ public class Common {
// five word sentence - for skipgrams
words = new ArrayList<>();
words.add(new Word("ker", "ker", "Vd"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("v", "v", "Dm"));
words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("ker", "ker", "Vd", taxonomy));
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
words.add(new Word("junak", "junak", "Somei", taxonomy));
words.add(new Word("v", "v", "Dm", taxonomy));
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
testSentence = new Sentence(words, "#Ft.Z.N.N");
midCorpusSkip = new ArrayList<>();
@ -73,9 +75,9 @@ public class Common {
// JOS test
words = new ArrayList<>();
words.add(new Word("junak", "junak", "Somei"));
words.add(new Word("ima", "imeti", "Ggnste-n"));
words.add(new Word("posesti", "posest", "Sozem"));
words.add(new Word("junak", "junak", "Somei", taxonomy));
words.add(new Word("ima", "imeti", "Ggnste-n", taxonomy));
words.add(new Word("posesti", "posest", "Sozem", taxonomy));
testSentence = new Sentence(words, "#Ft.Z.N.N");
josTest = new ArrayList<>();

View File

@ -15,7 +15,7 @@ public class CorpusTests {
public void solarTest() {
//File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Solar");
// File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/Gigafida_subset");
File selectedDirectory = new File("/home/luka/Developement/corpus-analyzer2/src/main/resources/Gigafida_subset");
Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));

View File

@ -3,7 +3,9 @@ import static org.junit.Assert.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import org.junit.Test;
import alg.ngram.Ngrams;
@ -21,10 +23,17 @@ public class NgramTests {
filter.setStringLength(4);
filter.setNgramValue(0); // letters
filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);
Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
// tests:
// - no regex
@ -103,15 +112,24 @@ public class NgramTests {
@Test
public void wordsNgramsTest() {
Map<String, AtomicLong> result = null;
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setNgramValue(3);
ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);
ArrayList<String> mKeys = new ArrayList<>();
//mKeys.add("lema");
filter.setMultipleKeys(mKeys);
Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("SSJ.T.P.C");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
// tests:
// - normal ngrams - word
@ -119,36 +137,36 @@ public class NgramTests {
filter.setCalculateFor(CalculateFor.WORD);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
assertEquals(3, result.size());
assertTrue(result.containsKey("ker ima junak"));
assertTrue(result.containsKey("ima junak ima"));
assertTrue(result.containsKey("junak ima posesti"));
assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker ima junak", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak ima", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));
// tests:
// - normal ngrams - lemmas
filter.setCalculateFor(CalculateFor.LEMMA);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
assertEquals(3, result.size());
assertTrue(result.containsKey("ker imeti junak"));
assertTrue(result.containsKey("imeti junak imeti"));
assertTrue(result.containsKey("junak imeti posest"));
assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ker imeti junak", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("imeti junak imeti", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak imeti posest", "", "", "")));
// tests:
// - normal ngrams - msd
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
assertEquals(3, result.size());
assertTrue(result.containsKey("Vd Ggnste-n Somei"));
assertTrue(result.containsKey("Ggnste-n Somei Ggnste-n"));
assertTrue(result.containsKey("Somei Ggnste-n Sozem"));
assertEquals(3, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Vd Ggnste-n Somei", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Ggnste-n Somei Ggnste-n", "", "", "")));
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("Somei Ggnste-n Sozem", "", "", "")));
// tests:
// - ngrams - word - regex filter
@ -161,10 +179,10 @@ public class NgramTests {
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
assertEquals(1, result.size());
assertTrue(result.containsKey("junak ima posesti"));
assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("junak ima posesti", "", "", "")));
// tests:
// - ngrams - word - regex filter
@ -177,10 +195,10 @@ public class NgramTests {
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpus, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
assertEquals(1, result.size());
assertTrue(result.containsKey("ima junak"));
assertEquals(1, taxonomyResult.get("Total").size());
assertTrue(taxonomyResult.get("Total").containsKey(new MultipleHMKeys("ima junak", "", "", "")));
}
@ -273,25 +291,32 @@ public class NgramTests {
@Test
public void skipgramsTest() {
Map<String, AtomicLong> result = null;
Map<String, Map<MultipleHMKeys, AtomicLong>> taxonomyResult;
Filter filter = new Filter();
filter.setAl(AnalysisLevel.STRING_LEVEL);
filter.setCalculateFor(CalculateFor.WORD);
ArrayList<String> tax= new ArrayList<>();
tax.add("SSJ.T.P.C");
filter.setTaxonomy(tax);
Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
// tests:
// - bigrams
filter.setNgramValue(2);
StatisticsNew stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
Set<String> bigrams = new HashSet<>(Arrays.asList("ker ima", "ima junak", "junak v", "v posesti"));
Set<String> bigramsActual = result.keySet();
Set<MultipleHMKeys> bigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> bigramsActual = new HashSet<>(bigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
assertEquals(bigrams, bigramsActual);
// test:
@ -300,10 +325,11 @@ public class NgramTests {
filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
Set<String> twoSkipBigrams = new HashSet<>(Arrays.asList("ker ima", "ker junak", "ker v", "ima junak", "ima v", "ima posesti", "junak v", "junak posesti", "v posesti"));
Set<String> twoSkipBigramsActual = result.keySet();
Set<MultipleHMKeys> twoSkipBigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipBigramsActual = new HashSet<>(twoSkipBigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
assertEquals(twoSkipBigrams, twoSkipBigramsActual);
@ -313,9 +339,10 @@ public class NgramTests {
filter.setSkipValue(null);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
Set<String> trigrams = new HashSet<>(Arrays.asList("ker ima junak", "ima junak v", "junak v posesti"));
Set<String> trigramsActual = result.keySet();
Set<MultipleHMKeys> trigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> trigramsActual = new HashSet<>(trigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
assertEquals(trigrams, trigramsActual);
@ -325,9 +352,10 @@ public class NgramTests {
filter.setSkipValue(2);
stats = new StatisticsNew(testCorpus, filter, false);
Ngrams.calculateForAll(Common.midCorpusSkip, stats);
result = stats.getResult();
taxonomyResult = stats.getTaxonomyResult();
HashSet<String> twoSkipTrigrams = new HashSet<>(Arrays.asList("ker ima junak", "ker ima v", "ker ima posesti", "ker junak v", "ker junak posesti", "ker v posesti", "ima junak v", "ima junak posesti", "ima v posesti", "junak v posesti"));
Set<String> twoSkipTrigramsActual = result.keySet();
Set<MultipleHMKeys> twoSkipTrigramsMultipleHMKeys = taxonomyResult.get("Total").keySet();
Set<String> twoSkipTrigramsActual = new HashSet<>(twoSkipTrigramsMultipleHMKeys.stream().map(MultipleHMKeys::getKey).collect(Collectors.toList()));
assertEquals(twoSkipTrigrams, twoSkipTrigramsActual);
}

View File

@ -3,6 +3,7 @@ import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import javafx.collections.FXCollections;
import org.junit.Test;
import alg.inflectedJOS.WordFormation;
@ -22,6 +23,9 @@ public class WordFormationTest {
Corpus testCorpus = new Corpus();
testCorpus.setCorpusType(CorpusType.GIGAFIDA);
testCorpus.setDetectedCorpusFiles(new ArrayList<>());
ArrayList<String> taxForCombo = new ArrayList<>();
taxForCombo.add("tisk-periodično-časopis");
testCorpus.setTaxonomy(FXCollections.observableArrayList(taxForCombo));
// tests:
// - normal ngrams - word