Common MSD is now based on (lemma, msd0), not only on lemma #757-127

This commit is contained in:
Ozbolt Menegatti 2019-06-28 22:00:38 +02:00
parent 8c20295adf
commit 47340fe80c

View File

@ -18,12 +18,12 @@ class WordStats:
text varchar(64), text varchar(64),
frequency int frequency int
)""") )""")
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
def add_words(self, words): def add_words(self, words):
@ -46,12 +46,20 @@ class WordStats:
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
for lemma in progress(lemmas, 'common-msd'): for lemma in progress(lemmas, 'common-msd'):
common_msd = "*" * 10 common_msds = defaultdict(lambda: "*" * 10)
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
common_msd = self.merge_msd(common_msd, msd[0]) for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
common_msd = self.common_lemma_msd(lemma, common_msd) msd = msd[0]
current_msd = common_msds[msd[0]]
new_msd = self.merge_msd(current_msd, msd)
common_msds[msd[0]] = new_msd
for msd0, common_msd in common_msds.items():
common_msd = self.common_lemma_msd(lemma, common_msd)
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
(lemma, msd0, common_msd))
self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
self.db.commit() self.db.commit()
for lemma in progress(lemmas, 'word-count'): for lemma in progress(lemmas, 'word-count'):
@ -118,7 +126,9 @@ class WordStats:
def get_lemma_msd(self, lemma, word_msd): def get_lemma_msd(self, lemma, word_msd):
# should be here, since we collect every lemmas # should be here, since we collect every lemmas
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0] msd0 = word_msd[0]
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",
(lemma, msd0)).fetchone()[0]
if lemma_msd[0] == '-': if lemma_msd[0] == '-':
if word_msd[0] in self.lemma_features: if word_msd[0] in self.lemma_features: