130 lines
4.8 KiB
Python
130 lines
4.8 KiB
Python
from collections import defaultdict, Counter
|
|
|
|
from progress_bar import progress
|
|
|
|
|
|
class WordStats:
    """Collect word-form frequencies in a database and answer lemma/MSD queries.

    Three tables are created on construction:

    * ``UniqWords (uw_id, lemma, msd, text, frequency)`` - one row per distinct
      (lemma, msd, text) triple with the number of times it was added.
    * ``CommonMsd (lemma, msd)`` - per-lemma merged MSD, filled by
      ``generate_renders``.
    * ``WordCount (lemma, msd0, frequency)`` - per-lemma frequency totals keyed
      by the first MSD character, filled by ``generate_renders``.
    """

    def __init__(self, lemma_features, db):
        """Store the collaborators and create the tables and their indexes.

        Args:
            lemma_features: mapping indexed by the first character of an MSD;
                each value is an MSD-like string (it is zipped character by
                character against an MSD in ``common_lemma_msd``).
            db: database wrapper exposing ``init(sql)``, ``execute(sql, params)``
                (returning a cursor-like object) and ``commit()``.
                NOTE(review): the ``:name`` / ``?`` placeholders suggest a
                sqlite3-backed wrapper - confirm.
        """
        self.lemma_features = lemma_features
        self.db = db

        # Running total of words passed to add_words().
        self.all_words = 0
        # Cache for merge_msd(): (common_msd, new_msd) -> merged string.
        self.memoized_msd_merges = {}

        self.db.init("""CREATE TABLE UniqWords (
            uw_id INTEGER PRIMARY KEY,
            lemma varchar(64),
            msd varchar(16),
            text varchar(64),
            frequency int
            )""")
        self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
        self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")

        self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
        self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
        self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
        self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")

    def add_words(self, words):
        """Record every word in ``words`` in ``UniqWords`` and commit.

        Each word must expose ``lemma``, ``msd`` and ``text`` attributes, and
        ``words`` must support ``len()``. An existing (lemma, msd, text) row has
        its frequency incremented; otherwise a new row with frequency 1 is
        inserted.
        """
        for w in progress(words, "adding-words"):
            params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
            res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
                WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)

            # No row was updated, so this triple has not been seen before.
            if res.rowcount == 0:
                self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)
                    VALUES (:lemma, :msd, :text, 1)""", params)

        self.db.commit()
        self.all_words += len(words)

    def num_all_words(self):
        """Return the total number of words passed to ``add_words`` so far."""
        return self.all_words

    def generate_renders(self):
        """Fill ``CommonMsd`` and ``WordCount`` from the rows in ``UniqWords``.

        Rows are only inserted, never cleared, so calling this more than once
        adds duplicate rows to both tables.
        """
        lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]

        for lemma in progress(lemmas, 'common-msd'):
            # Start from all wildcards; merge_msd() zips the strings, so at
            # most the first 10 positions of any MSD take part in the merge.
            common_msd = "*" * 10
            for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
                common_msd = self.merge_msd(common_msd, msd[0])
            common_msd = self.common_lemma_msd(lemma, common_msd)

            self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
        self.db.commit()

        for lemma in progress(lemmas, 'word-count'):
            # Sum frequencies per first MSD character for this lemma.
            num_words = defaultdict(int)
            for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
                num_words[msd[0]] += freq

            for msd0, freq in num_words.items():
                self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                                (lemma, msd0, freq))
        self.db.commit()

    def common_lemma_msd(self, lemma, msd):
        """Overlay the ``lemma_features`` entry for ``msd[0]`` onto ``msd``.

        For each position the feature character is used unless it is ``"-"``,
        in which case the character from ``msd`` is kept. The result is as long
        as the shorter of the two strings. If ``msd[0]`` has no entry in
        ``lemma_features``, ``msd`` is returned unchanged. ``lemma`` is unused.
        """
        lf = self.lemma_features
        if msd[0] in lf:
            return "".join(
                l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
            )
        else:
            return msd

    def merge_msd(self, common_msd, new_msd):
        """Merge ``new_msd`` into ``common_msd`` position by position.

        ``"*"`` in ``common_msd`` is replaced by the character from
        ``new_msd``; positions where the two strings differ become ``"-"``;
        equal positions are kept. The result is as long as the shorter input.
        Results are memoized per (common_msd, new_msd) pair.
        """
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        def merge_letter(l1, l2):
            if l1 == "*":
                return l2
            elif l1 != l2:
                return "-"
            else:
                return l1

        value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
        self.memoized_msd_merges[key] = value
        return value

    def render(self, lemma, msd):
        """Return the first column of the most frequent ``UniqWords`` match.

        Returns ``None`` when no row matches ``lemma`` and ``msd``.

        NOTE(review): the selected first column is ``msd``, i.e. the value the
        caller passed in; ``text`` may have been intended - confirm with callers.
        """
        statement = """SELECT msd, frequency FROM UniqWords WHERE
            lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""

        cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})
        # Do not gate on cur.rowcount: DB-API drivers may report -1 for SELECT
        # (sqlite3 always does), which made this method always return None.
        row = cur.fetchone()
        if row is not None:
            return row[0]
        return None

    def available_words(self, lemma, existing_texts):
        """Yield ``(msd, text)`` pairs for ``lemma``, preferred ones first.

        First come the pairs from ``existing_texts`` (an iterable of
        ``(msd, text)`` tuples), most common first; then the remaining pairs
        stored in ``UniqWords`` for ``lemma``, by descending frequency.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        statement = """SELECT msd, text, frequency FROM UniqWords WHERE
            lemma=:lemma ORDER BY frequency DESC"""
        for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
            # Skip pairs that were already yielded from existing_texts.
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def num_words(self, lemma, msd0):
        """Return the ``WordCount`` frequency for ``lemma`` and ``msd0``.

        Returns 0 when no row exists for the pair, instead of failing with a
        ``TypeError`` on the missing row.
        """
        statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
        cur = self.db.execute(statement, (lemma, msd0))
        row = cur.fetchone()
        if row is None:
            return 0
        return row[0]

    def get_lemma_msd(self, lemma, word_msd):
        """Return the MSD to use for ``lemma``.

        Normally this is the ``CommonMsd`` entry for ``lemma``. When that entry
        starts with ``"-"``, the ``lemma_features`` entry for ``word_msd[0]`` is
        returned instead, or ``"-"`` if there is none.

        Raises:
            TypeError: if ``lemma`` has no row in ``CommonMsd``.
        """
        # A row is expected to exist, since generate_renders() stores one for
        # every lemma in UniqWords.
        lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]

        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd
|