from collections import defaultdict, Counter

from progress_bar import progress


class WordStats:
    """Keep word-form frequency statistics in database tables.

    Two tables are created on construction:

    * ``UniqWords`` -- one row per distinct (lemma, msd, text) triple with
      the number of times it was added through :meth:`add_words`.
    * ``WordCount`` -- per (lemma, first msd character) totals, filled by
      :meth:`generate_renders`.
    """

    def __init__(self, lemma_features, db):
        """Store the collaborators and create the tables and indexes.

        Args:
            lemma_features: Container indexed by the first character of an
                msd; each value is zipped character-by-character against an
                msd in :meth:`common_lemma_msd`.
            db: Database wrapper exposing ``init``, ``execute`` and
                ``commit``.  NOTE(review): presumably a thin sqlite3
                wrapper -- confirm against its definition.
        """
        self.lemma_features = lemma_features
        self.db = db

        # Running total of words passed to add_words().
        self.all_words = 0
        # Not read or written anywhere else in this class.
        self.memoized_msd_merges = {}

        self.db.init(
            "CREATE TABLE UniqWords ( "
            "uw_id INTEGER PRIMARY KEY, "
            "lemma varchar(64), "
            "msd varchar(16), "
            "text varchar(64), "
            "frequency int )"
        )
        self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")

        self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
        self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
        self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")

    def add_words(self, words):
        """Record every word in ``words`` in ``UniqWords``.

        For each word the matching (lemma, msd, text) row has its frequency
        incremented; if no row was updated a new one is inserted with
        frequency 1.

        Args:
            words: Sized iterable of objects with ``lemma``, ``msd`` and
                ``text`` attributes (``len(words)`` is taken afterwards).
        """
        for w in progress(words, "adding-words"):
            params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
            res = self.db.execute(
                "UPDATE UniqWords SET frequency=frequency + 1 "
                "WHERE lemma=:lemma AND msd=:msd AND text=:text",
                params,
            )

            # Nothing updated means this triple has not been seen yet.
            if res.rowcount == 0:
                self.db.execute(
                    "INSERT INTO UniqWords (lemma, msd, text, frequency) "
                    "VALUES (:lemma, :msd, :text, 1)",
                    params,
                )

        self.db.commit()
        self.all_words += len(words)

    def num_all_words(self):
        """Return the total number of words passed to :meth:`add_words`."""
        return self.all_words

    def generate_renders(self):
        """Fill ``WordCount`` with per-lemma totals grouped by ``msd[0]``.

        For every distinct lemma in ``UniqWords`` the frequencies of its
        rows are summed per first msd character and one ``WordCount`` row
        is inserted for each (lemma, first character) pair.
        """
        lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
        for lemma in progress(lemmas, 'word-count'):
            num_words = defaultdict(int)
            for (msd, freq) in self.db.execute(
                    "SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
                num_words[msd[0]] += freq

            for msd0, freq in num_words.items():
                self.db.execute(
                    "INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                    (lemma, msd0, freq),
                )
        self.db.commit()

    def common_lemma_msd(self, lemma, msd):
        """Overlay ``msd`` onto the lemma-feature template for its category.

        If ``msd[0]`` is a key of ``lemma_features``, the template and
        ``msd`` are walked in lockstep: each template character is kept
        unless it is ``"-"``, in which case the character from ``msd`` is
        used.  The result is as long as the shorter of the two.  Otherwise
        ``msd`` is returned unchanged.

        Args:
            lemma: Unused; kept for interface compatibility.
            msd: Morphosyntactic tag string (must be non-empty).
        """
        lf = self.lemma_features

        if msd[0] in lf:
            return "".join(
                l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
            )
        else:
            return msd

    def render(self, lemma, msd):
        """Return the first column of the top ``UniqWords`` row, or ``None``.

        Rows matching ``lemma`` and ``msd`` are ordered by descending
        frequency and the first selected column (``msd``) of the first row
        is returned.  ``None`` is returned when no row matches.
        """
        statement = (
            "SELECT msd, frequency FROM UniqWords WHERE "
            "lemma=:lemma AND msd=:msd ORDER BY frequency DESC"
        )

        cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})
        # Do not rely on cur.rowcount here: DB-API drivers may report -1 for
        # SELECT statements (sqlite3 always does), which would make this
        # method return None even when a row exists.
        row = cur.fetchone()
        if row is None:
            return None
        return row[0]

    def available_words(self, lemma, existing_texts):
        """Yield (msd, text) pairs usable for ``lemma``.

        First every distinct pair from ``existing_texts`` is yielded, most
        frequent first.  Then the ``UniqWords`` rows for ``lemma`` follow
        in descending frequency order, skipping pairs already present in
        ``existing_texts``.

        Args:
            lemma: Lemma to look up in ``UniqWords``.
            existing_texts: Iterable of hashable ``(msd, text)`` pairs.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        statement = (
            "SELECT msd, text, frequency FROM UniqWords WHERE "
            "lemma=:lemma ORDER BY frequency DESC"
        )
        for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def num_words(self, lemma, msd0):
        """Return the ``WordCount`` frequency for ``(lemma, msd0)``.

        NOTE(review): if no such row exists, ``fetchone()`` presumably
        returns ``None`` and the subscript fails -- callers are expected to
        ask only for pairs written by :meth:`generate_renders`.
        """
        statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
        cur = self.db.execute(statement, (lemma, msd0))
        result = cur.fetchone()[0]
        return result

    def get_lemma_msd(self, lemma, word_msd):
        """Return the stored common msd for ``lemma`` or a fallback.

        The value is read from table ``CommonMsd`` for ``lemma`` and
        ``word_msd[0]``.  When the stored msd starts with ``"-"`` the
        ``lemma_features`` entry for ``word_msd[0]`` is returned if there is
        one, otherwise ``"-"``.

        NOTE(review): ``CommonMsd`` is not created by this class --
        presumably it is created and populated elsewhere; confirm.
        """
        # A row is expected to exist here, since an entry is collected for
        # every lemma (original author's note); a missing row is not handled.
        msd0 = word_msd[0]
        lemma_msd = self.db.execute(
            "SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?", (lemma, msd0)
        ).fetchone()[0]

        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd