From 47340fe80cc138528c720fba96cab4c38d1245bf Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Fri, 28 Jun 2019 22:00:38 +0200 Subject: [PATCH] common msd now based on (lemma,msd0) not only lemma #757-127 --- src/word_stats.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/word_stats.py b/src/word_stats.py index bb34b6c..33a8ca2 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -18,12 +18,12 @@ class WordStats: text varchar(64), frequency int )""") - self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") + self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))") self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") - self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") + self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)") self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") def add_words(self, words): @@ -46,12 +46,20 @@ class WordStats: lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] for lemma in progress(lemmas, 'common-msd'): - common_msd = "*" * 10 - for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): - common_msd = self.merge_msd(common_msd, msd[0]) - common_msd = self.common_lemma_msd(lemma, common_msd) + common_msds = defaultdict(lambda: "*" * 10) + + for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): + msd = msd[0] + current_msd = common_msds[msd[0]] + new_msd = self.merge_msd(current_msd, msd) + common_msds[msd[0]] = new_msd + + for msd0, common_msd in common_msds.items(): + common_msd = self.common_lemma_msd(lemma, common_msd) + self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)", + (lemma, msd0, common_msd)) + - self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd)) self.db.commit() for lemma in progress(lemmas, 'word-count'): @@ -118,7 +126,9 @@ class WordStats: def get_lemma_msd(self, lemma, word_msd): # should be here, since we collect every lemmas - lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0] + msd0 = word_msd[0] + lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?", + (lemma, msd0)).fetchone()[0] if lemma_msd[0] == '-': if word_msd[0] in self.lemma_features: