From 11706b6f818d20be362a55ce584444e21698b8d5 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Thu, 27 Jun 2019 00:37:47 +0200 Subject: [PATCH 1/3] word stats on sqlite now, not yet really working. --- src/formatter.py | 2 +- src/word_stats.py | 124 ++++++++++++++++++++++++++++------------------ 2 files changed, 76 insertions(+), 50 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index f1c384d..a8b69fa 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -119,7 +119,7 @@ class StatsFormatter(Formatter): freq = 0 else: word = match.matches[0][cid] - freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] + freq = self.word_renderer.num_words(word.lemma, word.msd[0]) self.stats["freq"][cid] = freq diff --git a/src/word_stats.py b/src/word_stats.py index d0191e3..ae21618 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -1,65 +1,81 @@ from collections import defaultdict, Counter from progress_bar import progress +import sqlite3 class WordStats: def __init__(self, lemma_features): - self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) - self.all_words = 0 - - self.rendered_words = {} - self.frequent_words = {} - self.num_words = {} - self.lemma_msd = {} self.lemma_features = lemma_features + + self.all_words = 0 self.memoized_msd_merges = {} + with open("sqlite.db", 'w') as fp: + fp.write("") + + self.db = sqlite3.connect('sqlite.db') + self.db.execute("""CREATE TABLE UniqWords ( + uw_id INTEGER PRIMARY KEY, + lemma varchar(64), + msd varchar(16), + text varchar(64), + frequency int + )""") + self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") + self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") + + self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") + self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") + self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") + self.db.execute("CREATE INDEX 
lemma_msd0_on_wc ON WordCount (lemma, msd0)") + def add_words(self, words): - for w in words: - self.raw_data[w.lemma][w.msd][w.text] += 1 + for w in progress(words, "adding-words"): + params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text} + res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1 + WHERE lemma=:lemma AND msd=:msd AND text=:text""", params) + + if res.rowcount == 0: + self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency) + VALUES (:lemma, :msd, :text, 1)""", params) + + self.db.commit() self.all_words += len(words) def num_all_words(self): return self.all_words def generate_renders(self): - num_words = defaultdict(int) - for lemma, ld in progress(self.raw_data.items(), "lemma-render"): - self.rendered_words[lemma] = {} - freq_words = defaultdict(int) + lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] + + for lemma in progress(lemmas, 'common-msd'): common_msd = "*" * 10 + for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): + common_msd = self.merge_msd(common_msd, msd[0]) + common_msd = self.common_lemma_msd(lemma, common_msd) - for msd, text_counters in ld.items(): - num = sum(text_counters.values()) + self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd)) + self.db.commit() - # TODO: this should be out of generate_renders... 
- num_words[(lemma, msd[0])] += num + for lemma in progress(lemmas, 'word-count'): + num_words = defaultdict(int) + for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)): + num_words[msd[0]] += freq + + for msd0, freq in num_words.items(): + self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)", + (lemma, msd0, freq)) + self.db.commit() - rep = max(text_counters, key=text_counters.get) - self.rendered_words[lemma][msd] = (rep, num) - - for txt, n in text_counters.items(): - freq_words[(msd, txt)] += n - - common_msd = self.merge_msd(common_msd, msd) - - self.lemma_msd[lemma] = common_msd - - self.frequent_words[lemma] = [] - for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]): - self.frequent_words[lemma].append((msd, txt, n)) - - self.num_words = dict(num_words) - - #next, determine lemma's default msds + def common_lemma_msd(self, lemma, msd): lf = self.lemma_features - for lemma in self.lemma_msd: - cmsd = self.lemma_msd[lemma] - if cmsd[0] in lf: - self.lemma_msd[lemma] = "".join( - l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd) - ) + if msd[0] in lf: + return "".join( + l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd) + ) + else: + return msd def merge_msd(self, common_msd, new_msd): key = (common_msd, new_msd) @@ -79,23 +95,33 @@ class WordStats: return value def render(self, lemma, msd): - if lemma in self.rendered_words: - if msd in self.rendered_words[lemma]: - return self.rendered_words[lemma][msd][0] + statement = """SELECT msd, frequency FROM UniqWords WHERE + lemma=:lemma AND msd=:msd ORDER BY frequency DESC""" + + cur = self.db.execute(statement, {"lemma": lemma, "msd": msd}) + if cur.rowcount > 0: + return cur.fetchone()[0] def available_words(self, lemma, existing_texts): counted_texts = Counter(existing_texts) for (msd, text), _n in counted_texts.most_common(): yield (msd, text) - if lemma in self.frequent_words: - for msd, text, _ in 
self.frequent_words[lemma]: - if (msd, text) not in counted_texts: - yield (msd, text) + statement = """SELECT msd, text, frequency FROM UniqWords WHERE + lemma=:lemma ORDER BY frequency DESC""" + for msd, text, _f in self.db.execute(statement, {'lemma': lemma}): + yield (msd, text) + + def num_words(self, lemma, msd0): + statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1" + cur = self.db.execute(statement, (lemma, msd0)) + result = cur.fetchone()[0] + return result + def get_lemma_msd(self, lemma, word_msd): # should be here, since we collect every lemmas - lemma_msd = self.lemma_msd[lemma] + lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0] if lemma_msd[0] == '-': if word_msd[0] in self.lemma_features: @@ -103,4 +129,4 @@ class WordStats: else: return '-' else: - return lemma_msd \ No newline at end of file + return lemma_msd From 8b06c4ec3862357478b8d2bfbf941b047533acd7 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Thu, 27 Jun 2019 00:57:46 +0200 Subject: [PATCH 2/3] Skipping already used available words, stupid refactoring bug --- src/word_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/word_stats.py b/src/word_stats.py index ae21618..46a4ca1 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -110,7 +110,8 @@ class WordStats: statement = """SELECT msd, text, frequency FROM UniqWords WHERE lemma=:lemma ORDER BY frequency DESC""" for msd, text, _f in self.db.execute(statement, {'lemma': lemma}): - yield (msd, text) + if (msd, text) not in counted_texts: + yield (msd, text) def num_words(self, lemma, msd0): statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1" From c2c2ce7ff80968bfaae08dcadc0c910b9eb91dd9 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Thu, 27 Jun 2019 11:44:02 +0200 Subject: [PATCH 3/3] making sorted words sorted a bit more non-randomly. 
--- src/representation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/representation.py b/src/representation.py index 78a6388..2b5ebf2 100644 --- a/src/representation.py +++ b/src/representation.py @@ -52,7 +52,8 @@ class WordFormAnyCR(ComponentRepresentation): words_counter = [] for word in self.words: words_counter.append((word.msd, word.lemma)) - sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x)) + sorted_words = sorted( + set(words_counter), key=lambda x: -words_counter.count(x) + (sum(ord(l) for l in x[1]) / 1e5 if x[1] is not None else .5)) for word_msd, word_lemma in sorted_words: for agr in self.agreement: