luscenje_struktur/src/word_stats.py

110 lines
4.2 KiB
Python

from collections import defaultdict, Counter
from progress_bar import progress
class WordStats:
    """Word-frequency statistics kept in a database behind ``db``.

    The class owns three tables that it registers in ``__init__``:

    * ``UniqWords`` - one row per distinct (lemma, msd, text) with a counter,
    * ``WordCount`` - per (lemma, first MSD character) frequency totals,
    * ``NumWords``  - one row per ``add_words`` call holding the batch size.
    """

    def __init__(self, lemma_features, db):
        """Remember the collaborators and register the schema statements.

        Args:
            lemma_features: Mapping keyed by the first character of an MSD
                tag. Each value is a character sequence in which ``"-"``
                marks a position to be taken from the word's own MSD
                (see ``common_lemma_msd``).
            db: Database wrapper providing ``init``, ``execute``,
                ``is_step_done`` and ``step_is_done``.
        """
        self.lemma_features = lemma_features
        self.db = db
        # Lazily filled cache for num_all_words().
        self.all_words = None

        self.db.init(
            "CREATE TABLE UniqWords (\n"
            "uw_id INTEGER PRIMARY KEY,\n"
            "lemma varchar(64),\n"
            "msd varchar(16),\n"
            "text varchar(64),\n"
            "frequency int\n"
            ")"
        )
        self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
        self.db.init("CREATE TABLE NumWords (id INTEGER PRIMARY KEY, n INTEGER)")

        self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
        self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
        self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")

    def add_words(self, words):
        """Count every word of ``words`` in ``UniqWords`` and log the batch size.

        Args:
            words: Sized collection (``len(words)`` is taken) of objects with
                ``lemma``, ``msd`` and ``text`` attributes.
        """
        # Statements are loop-invariant, so build them once.
        bump = (
            "UPDATE UniqWords SET frequency=frequency + 1\n"
            "WHERE lemma=:lemma AND msd=:msd AND text=:text"
        )
        create = (
            "INSERT INTO UniqWords (lemma, msd, text, frequency)\n"
            "VALUES (:lemma, :msd, :text, 1)"
        )

        for w in progress(words, "adding-words"):
            params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
            res = self.db.execute(bump, params)

            # Nothing was updated, so this (lemma, msd, text) is new.
            if res.rowcount == 0:
                self.db.execute(create, params)

        self.db.execute("INSERT INTO NumWords (n) VALUES (?)", (len(words),))

    def num_all_words(self):
        """Return the total number of words added so far.

        The total is the sum of all ``NumWords.n`` values and is cached in
        ``self.all_words`` after the first successful computation.

        Returns:
            int: The word total, or 0 when ``NumWords`` has no rows yet.
        """
        if self.all_words is None:
            total = self.db.execute("SELECT sum(n) FROM NumWords").fetchone()[0]
            if total is None:
                # sum() over zero rows is NULL. Report 0 but do not cache it,
                # so batches added later are still picked up.
                return 0
            self.all_words = int(total)

        return self.all_words

    def generate_renders(self):
        """Fill ``WordCount`` with per-(lemma, first MSD character) totals.

        The work is skipped when the database reports the
        ``generate_renders`` step as done, and marked as done afterwards.
        """
        step_name = 'generate_renders'
        if self.db.is_step_done(step_name):
            print("Skipping GenerateRenders, already complete")
            return

        # Materialise the lemma list before issuing further statements.
        lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
        for lemma in progress(lemmas, 'word-count'):
            totals = defaultdict(int)
            rows = self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,))
            for (msd, freq) in rows:
                totals[msd[0]] += freq

            for msd0, freq in totals.items():
                self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                                (lemma, msd0, freq))

        self.db.step_is_done(step_name)

    def common_lemma_msd(self, lemma, msd):
        """Overlay the ``lemma_features`` template for ``msd[0]`` onto ``msd``.

        Positions where the template holds ``"-"`` are taken from ``msd``;
        all other positions come from the template. The result is as long as
        the shorter of the two. ``lemma`` is not used.

        Returns:
            str: The merged tag, or ``msd`` itself when ``msd[0]`` has no
            template in ``lemma_features``.
        """
        lf = self.lemma_features
        if msd[0] not in lf:
            return msd

        return "".join(
            l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
        )

    def render(self, lemma, msd):
        """Return the ``msd`` of the most frequent matching ``UniqWords`` row.

        Returns:
            The stored ``msd`` value of the highest-frequency row for
            (``lemma``, ``msd``), or ``None`` when no such row exists.
        """
        statement = (
            "SELECT msd, frequency FROM UniqWords WHERE\n"
            "lemma=:lemma AND msd=:msd ORDER BY frequency DESC"
        )
        cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})

        # Cursor.rowcount is not meaningful for SELECT (sqlite3 reports -1),
        # so decide on the fetched row itself.
        row = cur.fetchone()
        if row is None:
            return None
        return row[0]

    def available_words(self, lemma, existing_texts):
        """Yield candidate ``(msd, text)`` pairs for ``lemma``.

        First come the pairs from ``existing_texts``, most common first, each
        once. Then come the ``UniqWords`` pairs for ``lemma`` by descending
        frequency, skipping any pair already present in ``existing_texts``.

        Args:
            lemma: Lemma to look up in ``UniqWords``.
            existing_texts: Iterable of hashable ``(msd, text)`` pairs.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        statement = (
            "SELECT msd, text, frequency FROM UniqWords WHERE\n"
            "lemma=:lemma ORDER BY frequency DESC"
        )
        for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def num_words(self, lemma, msd0):
        """Return the ``WordCount`` frequency for (``lemma``, ``msd0``).

        Raises:
            TypeError: If no matching row exists (``fetchone()`` is ``None``).
        """
        statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
        cur = self.db.execute(statement, (lemma, msd0))
        result = cur.fetchone()[0]
        return result

    def get_lemma_msd(self, lemma, word_msd):
        """Return the stored common MSD for ``lemma`` and ``word_msd[0]``.

        When the stored value starts with ``"-"``, fall back to the
        ``lemma_features`` entry for ``word_msd[0]``, or to ``"-"`` if there
        is none.

        NOTE(review): reads table ``CommonMsd``, which this class does not
        create - presumably it is populated elsewhere; confirm.

        Raises:
            TypeError: If no matching row exists (``fetchone()`` is ``None``).
        """
        # A row is expected to exist here, since every lemma is collected.
        msd0 = word_msd[0]
        lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",
                                    (lemma, msd0)).fetchone()[0]

        if lemma_msd[0] != '-':
            return lemma_msd

        if msd0 in self.lemma_features:
            return self.lemma_features[msd0]
        return '-'