Common MSD is now based on (lemma, msd0), not only on lemma (#757-127)
This commit is contained in:
parent
8c20295adf
commit
47340fe80c
|
@ -18,12 +18,12 @@ class WordStats:
|
|||
text varchar(64),
|
||||
frequency int
|
||||
)""")
|
||||
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
|
||||
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
|
||||
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
||||
|
||||
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
||||
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
||||
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
|
||||
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
|
||||
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
||||
|
||||
def add_words(self, words):
|
||||
|
@ -46,12 +46,20 @@ class WordStats:
|
|||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||
|
||||
for lemma in progress(lemmas, 'common-msd'):
|
||||
common_msd = "*" * 10
|
||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
common_msd = self.merge_msd(common_msd, msd[0])
|
||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
||||
common_msds = defaultdict(lambda: "*" * 10)
|
||||
|
||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
msd = msd[0]
|
||||
current_msd = common_msds[msd[0]]
|
||||
new_msd = self.merge_msd(current_msd, msd)
|
||||
common_msds[msd[0]] = new_msd
|
||||
|
||||
for msd0, common_msd in common_msds.items():
|
||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
|
||||
(lemma, msd0, common_msd))
|
||||
|
||||
|
||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
|
||||
self.db.commit()
|
||||
|
||||
for lemma in progress(lemmas, 'word-count'):
|
||||
|
@ -118,7 +126,9 @@ class WordStats:
|
|||
|
||||
def get_lemma_msd(self, lemma, word_msd):
|
||||
# should be here, since we collect every lemma
|
||||
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]
|
||||
msd0 = word_msd[0]
|
||||
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",
|
||||
(lemma, msd0)).fetchone()[0]
|
||||
|
||||
if lemma_msd[0] == '-':
|
||||
if word_msd[0] in self.lemma_features:
|
||||
|
|
Loading…
Reference in New Issue
Block a user