Merge branch 'sqlite'
This commit is contained in:
commit
fa8a5e55f8
|
@ -119,7 +119,7 @@ class StatsFormatter(Formatter):
|
|||
freq = 0
|
||||
else:
|
||||
word = match.matches[0][cid]
|
||||
freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]
|
||||
freq = self.word_renderer.num_words(word.lemma, word.msd[0])
|
||||
|
||||
self.stats["freq"][cid] = freq
|
||||
|
||||
|
|
|
@ -52,7 +52,8 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
words_counter = []
|
||||
for word in self.words:
|
||||
words_counter.append((word.msd, word.lemma))
|
||||
sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x))
|
||||
sorted_words = sorted(
|
||||
set(words_counter), key=lambda x: -words_counter.count(x) + (sum(ord(l) for l in x[1]) / 1e5 if x[1] is not None else .5))
|
||||
|
||||
for word_msd, word_lemma in sorted_words:
|
||||
for agr in self.agreement:
|
||||
|
|
|
@ -1,65 +1,81 @@
|
|||
from collections import defaultdict, Counter
|
||||
|
||||
from progress_bar import progress
|
||||
import sqlite3
|
||||
|
||||
|
||||
class WordStats:
|
||||
def __init__(self, lemma_features):
|
||||
self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
|
||||
self.all_words = 0
|
||||
|
||||
self.rendered_words = {}
|
||||
self.frequent_words = {}
|
||||
self.num_words = {}
|
||||
self.lemma_msd = {}
|
||||
self.lemma_features = lemma_features
|
||||
|
||||
self.all_words = 0
|
||||
self.memoized_msd_merges = {}
|
||||
|
||||
with open("sqlite.db", 'w') as fp:
|
||||
fp.write("")
|
||||
|
||||
self.db = sqlite3.connect('sqlite.db')
|
||||
self.db.execute("""CREATE TABLE UniqWords (
|
||||
uw_id INTEGER PRIMARY KEY,
|
||||
lemma varchar(64),
|
||||
msd varchar(16),
|
||||
text varchar(64),
|
||||
frequency int
|
||||
)""")
|
||||
self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
|
||||
self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
||||
|
||||
self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
||||
self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
||||
self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
|
||||
self.db.execute("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
||||
|
||||
def add_words(self, words):
|
||||
for w in words:
|
||||
self.raw_data[w.lemma][w.msd][w.text] += 1
|
||||
for w in progress(words, "adding-words"):
|
||||
params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
|
||||
res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
|
||||
WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)
|
||||
|
||||
if res.rowcount == 0:
|
||||
self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)
|
||||
VALUES (:lemma, :msd, :text, 1)""", params)
|
||||
|
||||
self.db.commit()
|
||||
self.all_words += len(words)
|
||||
|
||||
def num_all_words(self):
|
||||
return self.all_words
|
||||
|
||||
def generate_renders(self):
|
||||
num_words = defaultdict(int)
|
||||
for lemma, ld in progress(self.raw_data.items(), "lemma-render"):
|
||||
self.rendered_words[lemma] = {}
|
||||
freq_words = defaultdict(int)
|
||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||
|
||||
for lemma in progress(lemmas, 'common-msd'):
|
||||
common_msd = "*" * 10
|
||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
common_msd = self.merge_msd(common_msd, msd[0])
|
||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
||||
|
||||
for msd, text_counters in ld.items():
|
||||
num = sum(text_counters.values())
|
||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd))
|
||||
self.db.commit()
|
||||
|
||||
# TODO: this should be out of generate_renders...
|
||||
num_words[(lemma, msd[0])] += num
|
||||
for lemma in progress(lemmas, 'word-count'):
|
||||
num_words = defaultdict(int)
|
||||
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
num_words[msd[0]] += freq
|
||||
|
||||
for msd0, freq in num_words.items():
|
||||
self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
|
||||
(lemma, msd0, freq))
|
||||
self.db.commit()
|
||||
|
||||
rep = max(text_counters, key=text_counters.get)
|
||||
self.rendered_words[lemma][msd] = (rep, num)
|
||||
|
||||
for txt, n in text_counters.items():
|
||||
freq_words[(msd, txt)] += n
|
||||
|
||||
common_msd = self.merge_msd(common_msd, msd)
|
||||
|
||||
self.lemma_msd[lemma] = common_msd
|
||||
|
||||
self.frequent_words[lemma] = []
|
||||
for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
|
||||
self.frequent_words[lemma].append((msd, txt, n))
|
||||
|
||||
self.num_words = dict(num_words)
|
||||
|
||||
#next, determine lemma's default msds
|
||||
def common_lemma_msd(self, lemma, msd):
|
||||
lf = self.lemma_features
|
||||
for lemma in self.lemma_msd:
|
||||
cmsd = self.lemma_msd[lemma]
|
||||
if cmsd[0] in lf:
|
||||
self.lemma_msd[lemma] = "".join(
|
||||
l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
|
||||
)
|
||||
if msd[0] in lf:
|
||||
return "".join(
|
||||
l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
|
||||
)
|
||||
else:
|
||||
return msd
|
||||
|
||||
def merge_msd(self, common_msd, new_msd):
|
||||
key = (common_msd, new_msd)
|
||||
|
@ -79,23 +95,34 @@ class WordStats:
|
|||
return value
|
||||
|
||||
def render(self, lemma, msd):
    """Return the most frequent word form stored for ``lemma`` with ``msd``.

    The (lemma, msd) pair is looked up in the ``UniqWords`` table and the
    row with the highest ``frequency`` wins.

    Args:
        lemma: Lemma whose word form is requested.
        msd: Full MSD tag the word form must have.

    Returns:
        The ``text`` column of the most frequent matching row, or ``None``
        when ``UniqWords`` has no row for this (lemma, msd) pair.
    """
    # Select ``text``: the caller already knows the msd and wants the
    # rendered word form for it.
    statement = """SELECT text FROM UniqWords WHERE
        lemma=:lemma AND msd=:msd ORDER BY frequency DESC LIMIT 1"""

    # sqlite3 reports rowcount == -1 for SELECT statements, so rowcount
    # cannot signal an empty result; inspect the fetched row instead.
    row = self.db.execute(statement, {"lemma": lemma, "msd": msd}).fetchone()
    return row[0] if row is not None else None
|
||||
|
||||
def available_words(self, lemma, existing_texts):
|
||||
counted_texts = Counter(existing_texts)
|
||||
for (msd, text), _n in counted_texts.most_common():
|
||||
yield (msd, text)
|
||||
|
||||
if lemma in self.frequent_words:
|
||||
for msd, text, _ in self.frequent_words[lemma]:
|
||||
if (msd, text) not in counted_texts:
|
||||
yield (msd, text)
|
||||
statement = """SELECT msd, text, frequency FROM UniqWords WHERE
|
||||
lemma=:lemma ORDER BY frequency DESC"""
|
||||
for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
|
||||
if (msd, text) not in counted_texts:
|
||||
yield (msd, text)
|
||||
|
||||
def num_words(self, lemma, msd0):
    """Return the stored frequency of ``lemma`` for the MSD prefix ``msd0``.

    Args:
        lemma: Lemma to look up in the ``WordCount`` table.
        msd0: Value matched against the ``msd0`` column (the first MSD
            character, as written by ``generate_renders``).

    Returns:
        The ``frequency`` of the matching ``WordCount`` row, or ``0`` when
        no row exists for this (lemma, msd0) pair.
    """
    statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
    row = self.db.execute(statement, (lemma, msd0)).fetchone()
    # fetchone() returns None when nothing matched; an unseen pair simply
    # has a count of zero rather than being an error.
    return row[0] if row is not None else 0
|
||||
|
||||
|
||||
def get_lemma_msd(self, lemma, word_msd):
|
||||
# should be here, since we collect every lemmas
|
||||
lemma_msd = self.lemma_msd[lemma]
|
||||
lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0]
|
||||
|
||||
if lemma_msd[0] == '-':
|
||||
if word_msd[0] in self.lemma_features:
|
||||
|
@ -103,4 +130,4 @@ class WordStats:
|
|||
else:
|
||||
return '-'
|
||||
else:
|
||||
return lemma_msd
|
||||
return lemma_msd
|
||||
|
|
Loading…
Reference in New Issue
Block a user