from collections import defaultdict, Counter


class WordStats:
    """Per-lemma statistics over (lemma, MSD, surface text) occurrences."""

    def __init__(self, lemma_features):
        # raw_data[lemma][msd][text] -> number of occurrences
        self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        # total number of words ever passed to add_words
        self.all_words = 0
        # rendered_words[lemma][msd] -> (most frequent text, count)
        self.rendered_words = {}
        # frequent_words[lemma] -> [(msd, text, count), ...], most frequent first
        self.frequent_words = {}
        # num_words[(lemma, msd category)] -> count
        self.num_words = {}
        # lemma_msd[lemma] -> MSD features common to every occurrence of the lemma
        self.lemma_msd = {}
        self.lemma_features = lemma_features
        self.memoized_msd_merges = {}

    def add_words(self, words):
        # each word is expected to expose .lemma, .msd, and .text
        for w in words:
            self.raw_data[w.lemma][w.msd][w.text] += 1
        self.all_words += len(words)

    def num_all_words(self):
        return self.all_words

    def generate_renders(self):
        num_words = defaultdict(int)
        for lemma, ld in self.raw_data.items():
            self.rendered_words[lemma] = {}
            freq_words = defaultdict(int)
            common_msd = "*" * 10

            for msd, text_counters in ld.items():
                num = sum(text_counters.values())

                # TODO: this should be out of generate_renders...
                num_words[(lemma, msd[0])] += num

                # the most frequent surface form represents this (lemma, msd)
                rep = max(text_counters, key=text_counters.get)
                self.rendered_words[lemma][msd] = (rep, num)

                for txt, n in text_counters.items():
                    freq_words[(msd, txt)] += n

                common_msd = self.merge_msd(common_msd, msd)

            self.lemma_msd[lemma] = common_msd

            self.frequent_words[lemma] = []
            for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
                self.frequent_words[lemma].append((msd, txt, n))

        self.num_words = dict(num_words)

        # next, determine each lemma's default MSD: take the letter from the
        # per-category defaults where one is given, otherwise keep the merged one
        lf = self.lemma_features
        for lemma in self.lemma_msd:
            cmsd = self.lemma_msd[lemma]
            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
                )

    def merge_msd(self, common_msd, new_msd):
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        def merge_letter(l1, l2):
            # "*" is the neutral initial value; disagreeing letters become "-"
            if l1 == "*":
                return l2
            elif l1 != l2:
                return "-"
            else:
                return l1

        value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
        self.memoized_msd_merges[key] = value
        return value
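
    # A small illustration of merge_msd (the MSD strings are made-up
    # MULTEXT-style noun tags, chosen only for this example):
    #
    #   merge_msd("*" * 10, "Ncfsn")  ->  "Ncfsn"   (zip drops the extra "*")
    #   merge_msd("Ncfsn", "Ncfpg")   ->  "Ncf--"   (number and case disagree)
    #
    # generate_renders folds every MSD of a lemma through this, so lemma_msd
    # keeps only the features shared by all of them.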

    def render(self, lemma, msd):
        # returns None if the (lemma, msd) pair was never observed
        if lemma in self.rendered_words:
            if msd in self.rendered_words[lemma]:
                return self.rendered_words[lemma][msd][0]

    def available_words(self, lemma, existing_texts):
        # first yield the caller-supplied (msd, text) pairs, most common
        # first, then the lemma's remaining corpus-frequent pairs
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        if lemma in self.frequent_words:
            for msd, text, _ in self.frequent_words[lemma]:
                if (msd, text) not in counted_texts:
                    yield (msd, text)
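
    # Toy illustration of the ordering, under assumed inputs: with
    # existing_texts = [("Ncfsn", "miza"), ("Ncfsn", "miza"), ("Ncfpg", "miz")],
    # the generator yields ("Ncfsn", "miza") first (count 2), then
    # ("Ncfpg", "miz"), and finally any frequent pairs of the lemma that
    # were not in existing_texts.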

    def get_lemma_msd(self, lemma, word_msd):
        # safe lookup: every lemma seen by add_words has an entry here
        lemma_msd = self.lemma_msd[lemma]

        if lemma_msd[0] == "-":
            # merged MSD is inconclusive; fall back to per-category defaults
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return "-"
        else:
            return lemma_msd
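

# A minimal usage sketch, not part of the original module: Word is a stand-in
# for whatever token type the caller provides (anything with .lemma, .msd,
# and .text), and the lemma_features table and MSD tags are toy values.
if __name__ == "__main__":
    from collections import namedtuple

    Word = namedtuple("Word", ["lemma", "msd", "text"])  # hypothetical token type

    stats = WordStats(lemma_features={"N": "N---------"})
    stats.add_words([
        Word("miza", "Ncfsn", "miza"),
        Word("miza", "Ncfsn", "miza"),
        Word("miza", "Ncfpg", "miz"),
    ])
    stats.generate_renders()

    print(stats.num_all_words())                 # 3
    print(stats.render("miza", "Ncfsn"))         # miza
    print(stats.get_lemma_msd("miza", "Ncfsn"))  # Ncf--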