luscenje_struktur/src/word_stats.py

130 lines
4.8 KiB
Python

from collections import defaultdict, Counter
from progress_bar import progress
class WordStats:
    """Collect per-word-form frequencies in a database and query them.

    Three tables are maintained through ``db``:

    * ``UniqWords``  - one row per distinct (lemma, msd, text) with its
      occurrence count,
    * ``CommonMsd``  - one merged msd string per lemma (filled by
      :meth:`generate_renders`),
    * ``WordCount``  - summed frequency per (lemma, first msd character)
      (filled by :meth:`generate_renders`).

    ``db`` must expose ``init(sql)``, ``execute(sql, params)`` returning a
    DB-API style cursor, and ``commit()``.
    NOTE(review): presumably a thin wrapper around ``sqlite3`` - confirm.
    """

    def __init__(self, lemma_features, db):
        """Store collaborators and create the tables and indexes.

        Args:
            lemma_features: mapping from the first msd character to a
                template msd string; ``"-"`` positions in the template are
                filled from the merged msd (see :meth:`common_lemma_msd`).
            db: database handle as described on the class.
        """
        self.lemma_features = lemma_features
        self.db = db

        # Running total of words passed to add_words().
        self.all_words = 0
        # Cache for merge_msd(): (common_msd, new_msd) -> merged string.
        self.memoized_msd_merges = {}

        self.db.init("""CREATE TABLE UniqWords (
            uw_id INTEGER PRIMARY KEY,
            lemma varchar(64),
            msd varchar(16),
            text varchar(64),
            frequency int
            )""")
        self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
        self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")

        self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
        self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
        self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
        self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")

    def add_words(self, words):
        """Increment the UniqWords frequency of every word in ``words``.

        Each word must provide ``lemma``, ``msd`` and ``text`` attributes.
        ``words`` must support ``len()``; its length is added to the total
        reported by :meth:`num_all_words`. Commits once after the batch.
        """
        for w in progress(words, "adding-words"):
            params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
            # Try to bump an existing row first; rowcount is reliable for
            # UPDATE, so zero means the form has not been seen yet.
            res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
                WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)

            if res.rowcount == 0:
                self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)
                    VALUES (:lemma, :msd, :text, 1)""", params)

        self.db.commit()
        self.all_words += len(words)

    def num_all_words(self):
        """Return the total number of words passed to :meth:`add_words`."""
        return self.all_words

    def generate_renders(self):
        """Populate CommonMsd and WordCount from the current UniqWords rows.

        Pass 1 merges all msds of a lemma into one string and stores it in
        CommonMsd. Pass 2 sums frequencies per first msd character and
        stores them in WordCount. Each pass is committed separately.
        """
        lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]

        for lemma in progress(lemmas, 'common-msd'):
            # "*" is the "nothing seen yet" marker understood by merge_msd.
            common_msd = "*" * 10
            for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
                common_msd = self.merge_msd(common_msd, msd[0])
            common_msd = self.common_lemma_msd(lemma, common_msd)

            self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))

        self.db.commit()

        for lemma in progress(lemmas, 'word-count'):
            num_words = defaultdict(int)
            for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
                num_words[msd[0]] += freq

            for msd0, freq in num_words.items():
                self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                                (lemma, msd0, freq))

        self.db.commit()

    def common_lemma_msd(self, lemma, msd):
        """Overlay the lemma-feature template for ``msd[0]`` onto ``msd``.

        If ``msd[0]`` is a key of ``lemma_features``, every template
        character other than ``"-"`` wins; ``"-"`` positions take the
        character from ``msd``. The result is as long as the shorter of the
        two strings. Otherwise ``msd`` is returned unchanged. ``lemma`` is
        accepted but not used.
        """
        lf = self.lemma_features

        if msd[0] in lf:
            return "".join(
                l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
            )
        else:
            return msd

    @staticmethod
    def _merge_letter(l1, l2):
        """Merge one msd position: "*" adopts l2, a mismatch becomes "-"."""
        if l1 == "*":
            return l2
        elif l1 != l2:
            return "-"
        else:
            return l1

    def merge_msd(self, common_msd, new_msd):
        """Merge ``new_msd`` into ``common_msd`` position by position.

        ``"*"`` in ``common_msd`` adopts the new character, differing
        characters collapse to ``"-"``, equal characters are kept. The
        result is truncated to the shorter input. Results are memoized per
        (common_msd, new_msd) pair on the instance.
        """
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        value = "".join(
            self._merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd)
        )
        self.memoized_msd_merges[key] = value
        return value

    def render(self, lemma, msd):
        """Return the msd of the most frequent UniqWords row for (lemma, msd).

        Returns ``None`` when no such row exists.

        NOTE(review): the query selects the ``msd`` column, so a hit simply
        echoes the ``msd`` argument; presumably ``text`` was intended -
        confirm with callers before changing.
        """
        statement = """SELECT msd, frequency FROM UniqWords WHERE
            lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""

        cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})
        # Do not test cur.rowcount here: DB-API drivers (sqlite3 included)
        # report -1 for SELECT, which made this method always return None.
        row = cur.fetchone()
        if row is None:
            return None
        return row[0]

    def available_words(self, lemma, existing_texts):
        """Yield candidate ``(msd, text)`` pairs for ``lemma``.

        First the pairs from ``existing_texts`` (an iterable of
        ``(msd, text)`` tuples), most common first; then every UniqWords
        pair for the lemma not already among them, by descending frequency.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        statement = """SELECT msd, text, frequency FROM UniqWords WHERE
            lemma=:lemma ORDER BY frequency DESC"""
        for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def num_words(self, lemma, msd0):
        """Return the WordCount frequency for ``(lemma, msd0)``.

        Returns 0 when the pair has no WordCount row (for instance before
        :meth:`generate_renders` has run) instead of failing on a missing
        row.
        """
        statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
        cur = self.db.execute(statement, (lemma, msd0))
        row = cur.fetchone()
        if row is None:
            return 0
        return row[0]

    def get_lemma_msd(self, lemma, word_msd):
        """Return the stored common msd for ``lemma``, with a fallback.

        When the stored msd starts with ``"-"`` (the first position could
        not be agreed on), fall back to the ``lemma_features`` template for
        ``word_msd[0]``, or ``"-"`` if there is none.
        """
        # A row is expected to exist, since every lemma is collected by
        # generate_renders(); a missing row fails on the subscript below.
        lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]

        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd