Word stats are now stored in SQLite; not yet fully working.

This commit is contained in:
Ozbolt Menegatti 2019-06-27 00:37:47 +02:00
parent cfdb36b894
commit 11706b6f81
2 changed files with 76 additions and 50 deletions

View File

@ -119,7 +119,7 @@ class StatsFormatter(Formatter):
freq = 0 freq = 0
else: else:
word = match.matches[0][cid] word = match.matches[0][cid]
freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] freq = self.word_renderer.num_words(word.lemma, word.msd[0])
self.stats["freq"][cid] = freq self.stats["freq"][cid] = freq

View File

@ -1,65 +1,81 @@
from collections import defaultdict, Counter from collections import defaultdict, Counter
from progress_bar import progress from progress_bar import progress
import sqlite3
class WordStats: class WordStats:
def __init__(self, lemma_features):
    """Initialise counters and create a fresh SQLite store for word statistics.

    Args:
        lemma_features: container indexed by the first character of an msd
            (it is used as ``lemma_features[msd[0]]`` elsewhere in the class).

    Side effects:
        Truncates (or creates) ``sqlite.db`` in the current working directory
        and creates the UniqWords, CommonMsd and WordCount tables plus their
        lookup indexes.
    """
    self.lemma_features = lemma_features
    self.all_words = 0
    self.memoized_msd_merges = {}

    # Start from an empty database file on every run; CREATE TABLE below
    # would fail if tables from a previous run were still present.
    with open("sqlite.db", 'w') as fp:
        fp.write("")

    self.db = sqlite3.connect('sqlite.db')

    # One row per distinct (lemma, msd, text) triple with its occurrence count.
    self.db.execute("""CREATE TABLE UniqWords (
        uw_id INTEGER PRIMARY KEY,
        lemma varchar(64),
        msd varchar(16),
        text varchar(64),
        frequency int
        )""")
    # Filled by generate_renders: one merged msd per lemma.
    self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
    # Filled by generate_renders: summed frequency per (lemma, first msd char).
    self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")

    # Indexes matching the WHERE clauses used by the query methods.
    self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
    self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
    self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
    self.db.execute("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
def add_words(self, words):
    """Count every word of ``words`` in the UniqWords table.

    Args:
        words: sized iterable of objects exposing ``lemma``, ``msd`` and
            ``text`` attributes.

    Each (lemma, msd, text) triple gets its ``frequency`` incremented, or a
    new row with frequency 1 when it has not been seen before. The batch is
    committed once at the end and ``self.all_words`` grows by ``len(words)``.
    """
    for w in progress(words, "adding-words"):
        params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
        res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
            WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)

        # rowcount == 0 means the UPDATE matched nothing: first occurrence.
        if res.rowcount == 0:
            self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)
                VALUES (:lemma, :msd, :text, 1)""", params)

    self.db.commit()
    self.all_words += len(words)
def num_all_words(self):
    """Return the running total of words passed to ``add_words``."""
    return self.all_words
def generate_renders(self):
    """Populate the CommonMsd and WordCount tables from UniqWords.

    Pass 1 ('common-msd'): for every distinct lemma, fold all of its distinct
    msds together with ``self.merge_msd`` (starting from ``"*" * 10``),
    post-process the result with ``common_lemma_msd`` and store it in
    CommonMsd.

    Pass 2 ('word-count'): for every distinct lemma, sum the frequencies of
    its rows grouped by the first character of the msd and store the sums in
    WordCount.

    Each pass is committed once when it finishes.
    """
    lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]

    for lemma in progress(lemmas, 'common-msd'):
        common_msd = "*" * 10
        for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
            # Rows come back as 1-tuples; msd[0] is the msd string itself.
            common_msd = self.merge_msd(common_msd, msd[0])
        common_msd = self.common_lemma_msd(lemma, common_msd)

        self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
    self.db.commit()

    for lemma in progress(lemmas, 'word-count'):
        num_words = defaultdict(int)
        for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
            # Here msd is the string; msd[0] is its first character.
            num_words[msd[0]] += freq

        for msd0, freq in num_words.items():
            self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                            (lemma, msd0, freq))
    self.db.commit()


def common_lemma_msd(self, lemma, msd):
    """Overlay the lemma-feature template for ``msd[0]`` onto ``msd``.

    Args:
        lemma: not used by the computation; kept in the signature.
        msd: non-empty msd string.

    Returns:
        ``msd`` unchanged when ``msd[0]`` is not in ``self.lemma_features``.
        Otherwise a string built position by position from the template
        ``self.lemma_features[msd[0]]``: the template character is taken
        unless it is ``"-"``, in which case the character of ``msd`` at that
        position is taken. ``zip`` truncates the result to the shorter of
        the two.
    """
    lf = self.lemma_features
    if msd[0] not in lf:
        return msd

    return "".join(
        l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
    )
def merge_msd(self, common_msd, new_msd): def merge_msd(self, common_msd, new_msd):
key = (common_msd, new_msd) key = (common_msd, new_msd)
@ -79,23 +95,33 @@ class WordStats:
return value return value
def render(self, lemma, msd):
    """Return the most frequent text recorded for ``lemma`` with ``msd``.

    Args:
        lemma: lemma to look up in UniqWords.
        msd: exact msd string to match.

    Returns:
        The ``text`` of the matching UniqWords row with the highest
        ``frequency`` (earliest inserted row wins a tie), or ``None`` when
        no row matches.
    """
    statement = """SELECT text FROM UniqWords WHERE
        lemma=:lemma AND msd=:msd ORDER BY frequency DESC, uw_id ASC LIMIT 1"""

    row = self.db.execute(statement, {"lemma": lemma, "msd": msd}).fetchone()
    # sqlite3 reports rowcount == -1 for SELECT statements, so the presence
    # of a result has to be decided from the fetched row itself.
    if row is None:
        return None
    return row[0]
def available_words(self, lemma, existing_texts):
    """Yield candidate ``(msd, text)`` pairs for ``lemma``, best first.

    Args:
        lemma: lemma whose stored forms should be offered.
        existing_texts: iterable of hashable ``(msd, text)`` pairs.

    Yields:
        First every distinct pair of ``existing_texts``, most common first;
        then every ``(msd, text)`` stored in UniqWords for ``lemma`` in
        descending ``frequency`` order, skipping pairs that were already
        yielded from ``existing_texts``.
    """
    counted_texts = Counter(existing_texts)
    for (msd, text), _n in counted_texts.most_common():
        yield (msd, text)

    statement = """SELECT msd, text FROM UniqWords WHERE
        lemma=:lemma ORDER BY frequency DESC, uw_id ASC"""
    for msd, text in self.db.execute(statement, {'lemma': lemma}):
        # Do not offer the same pair twice.
        if (msd, text) not in counted_texts:
            yield (msd, text)
def num_words(self, lemma, msd0):
    """Return the stored word count for ``lemma`` and msd first character.

    Args:
        lemma: lemma to look up in WordCount.
        msd0: first character of the msd.

    Returns:
        The ``frequency`` of the matching WordCount row, or 0 when the pair
        has no row.
    """
    statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
    row = self.db.execute(statement, (lemma, msd0)).fetchone()
    # fetchone() returns None when nothing matched; indexing it would raise.
    if row is None:
        return 0
    return row[0]
def get_lemma_msd(self, lemma, word_msd): def get_lemma_msd(self, lemma, word_msd):
# should be here, since we collect every lemmas # should be here, since we collect every lemmas
lemma_msd = self.lemma_msd[lemma] lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]
if lemma_msd[0] == '-': if lemma_msd[0] == '-':
if word_msd[0] in self.lemma_features: if word_msd[0] in self.lemma_features:
@ -103,4 +129,4 @@ class WordStats:
else: else:
return '-' return '-'
else: else:
return lemma_msd return lemma_msd