Common MSD is now keyed on (lemma, msd0) instead of lemma alone #757-127
This commit is contained in:
parent
8c20295adf
commit
47340fe80c
|
@ -18,12 +18,12 @@ class WordStats:
|
||||||
text varchar(64),
|
text varchar(64),
|
||||||
frequency int
|
frequency int
|
||||||
)""")
|
)""")
|
||||||
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
|
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
|
||||||
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
||||||
|
|
||||||
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
||||||
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
||||||
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
|
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
|
||||||
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
||||||
|
|
||||||
def add_words(self, words):
|
def add_words(self, words):
|
||||||
|
@ -46,12 +46,20 @@ class WordStats:
|
||||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||||
|
|
||||||
for lemma in progress(lemmas, 'common-msd'):
|
for lemma in progress(lemmas, 'common-msd'):
|
||||||
common_msd = "*" * 10
|
common_msds = defaultdict(lambda: "*" * 10)
|
||||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
|
||||||
common_msd = self.merge_msd(common_msd, msd[0])
|
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
msd = msd[0]
|
||||||
|
current_msd = common_msds[msd[0]]
|
||||||
|
new_msd = self.merge_msd(current_msd, msd)
|
||||||
|
common_msds[msd[0]] = new_msd
|
||||||
|
|
||||||
|
for msd0, common_msd in common_msds.items():
|
||||||
|
common_msd = self.common_lemma_msd(lemma, common_msd)
|
||||||
|
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
|
||||||
|
(lemma, msd0, common_msd))
|
||||||
|
|
||||||
|
|
||||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
|
|
||||||
self.db.commit()
|
self.db.commit()
|
||||||
|
|
||||||
for lemma in progress(lemmas, 'word-count'):
|
for lemma in progress(lemmas, 'word-count'):
|
||||||
|
@ -118,7 +126,9 @@ class WordStats:
|
||||||
|
|
||||||
def get_lemma_msd(self, lemma, word_msd):
|
def get_lemma_msd(self, lemma, word_msd):
|
||||||
# a CommonMsd entry should exist here, since we collect every lemma
|
# a CommonMsd entry should exist here, since we collect every lemma
|
||||||
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]
|
msd0 = word_msd[0]
|
||||||
|
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",
|
||||||
|
(lemma, msd0)).fetchone()[0]
|
||||||
|
|
||||||
if lemma_msd[0] == '-':
|
if lemma_msd[0] == '-':
|
||||||
if word_msd[0] in self.lemma_features:
|
if word_msd[0] in self.lemma_features:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user