# luscenje_struktur/src/word_stats.py
from collections import Counter, defaultdict
from itertools import zip_longest
class WordStats:
    """Accumulate (lemma, msd, text) occurrence counts and derive per-lemma
    statistics: a representative surface form per (lemma, msd), the lemma's
    word forms ordered by frequency, per-category word counts, and a merged
    "common" msd in which positions where the lemma's msds disagree are '-'.
    """

    def __init__(self, lemma_features):
        # lemma -> msd -> surface text -> occurrence count
        self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        # Total number of words ever passed to add_words().
        self.all_words = 0
        # lemma -> msd -> (most frequent text, total count for that msd)
        self.rendered_words = {}
        # lemma -> [(msd, text, count), ...] sorted by descending count
        self.frequent_words = {}
        # (lemma, msd category letter) -> total count; filled by generate_renders()
        self.num_words = {}
        # lemma -> merged msd of all the lemma's msds
        self.lemma_msd = {}
        # msd category letter -> default feature pattern ('-' = no default)
        self.lemma_features = lemma_features
        # memoization cache for merge_msd(): (common, new) -> merged
        self.memoized_msd_merges = {}

    def add_words(self, words):
        """Count every word's (lemma, msd, text) triple.

        Each word must expose .lemma, .msd and .text attributes.
        """
        for w in words:
            self.raw_data[w.lemma][w.msd][w.text] += 1
        self.all_words += len(words)

    def num_all_words(self):
        """Return the total number of words added so far."""
        return self.all_words

    def generate_renders(self):
        """Derive all per-lemma statistics from the raw counts.

        Fills rendered_words, frequent_words, num_words and lemma_msd.
        Call once, after all add_words() calls.
        """
        num_words = defaultdict(int)
        for lemma, msd_map in self.raw_data.items():
            self.rendered_words[lemma] = {}
            freq_words = defaultdict(int)
            # Merged msd of the lemma's msds; None until the first msd is seen.
            # (Seeding with the first msd instead of a fixed-width "*" pattern
            # avoids silently truncating msds longer than the pattern.)
            common_msd = None
            for msd, text_counters in msd_map.items():
                num = sum(text_counters.values())
                # TODO: this should be out of generate_renders...
                num_words[(lemma, msd[0])] += num
                # Most frequent surface form for this (lemma, msd).
                rep = max(text_counters, key=text_counters.get)
                self.rendered_words[lemma][msd] = (rep, num)
                for txt, n in text_counters.items():
                    freq_words[(msd, txt)] += n
                common_msd = (
                    msd if common_msd is None else self.merge_msd(common_msd, msd)
                )
            self.lemma_msd[lemma] = common_msd
            self.frequent_words[lemma] = [
                (msd, txt, n)
                for (msd, txt), n in sorted(freq_words.items(), key=lambda kv: -kv[1])
            ]
        self.num_words = dict(num_words)

        # Next, determine each lemma's default msd: overlay the category's
        # default feature pattern on the merged msd, keeping the pattern's
        # letter wherever it defines one ('-' in the pattern means "no default").
        lf = self.lemma_features
        for lemma in self.lemma_msd:
            cmsd = self.lemma_msd[lemma]
            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
                )

    def merge_msd(self, common_msd, new_msd):
        """Merge two msds position-wise and return the result.

        Equal letters are kept, '*' in common_msd acts as a wildcard
        (takes the letter from new_msd), and any disagreement — including
        a length mismatch — becomes '-'. Results are memoized.
        """
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        def merge_letter(l1, l2):
            if l1 is None or l2 is None:
                # Length mismatch: no common letter at this position.
                return "-"
            if l1 == "*":
                return l2
            if l1 != l2:
                return "-"
            return l1

        # zip_longest (instead of zip) keeps the longer msd's tail as '-'
        # rather than silently dropping it.
        value = "".join(
            merge_letter(l1, l2) for l1, l2 in zip_longest(common_msd, new_msd)
        )
        self.memoized_msd_merges[key] = value
        return value

    def render(self, lemma, msd):
        """Return the most frequent text for (lemma, msd), or None if unseen."""
        if lemma in self.rendered_words:
            if msd in self.rendered_words[lemma]:
                return self.rendered_words[lemma][msd][0]
        return None

    def available_words(self, lemma, existing_texts):
        """Yield (msd, text) pairs: first the pairs already present in
        existing_texts (by decreasing count), then the lemma's remaining
        frequent forms in frequency order.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)
        if lemma in self.frequent_words:
            for msd, text, _ in self.frequent_words[lemma]:
                if (msd, text) not in counted_texts:
                    yield (msd, text)

    def get_lemma_msd(self, lemma, word_msd):
        """Return the lemma's default msd.

        When the lemma's msds disagree on the category (first letter '-'),
        fall back to the default features of the word's own category, or
        '-' when no defaults exist. Assumes the lemma was collected
        (raises KeyError otherwise) — should be here, since we collect
        every lemma.
        """
        lemma_msd = self.lemma_msd[lemma]
        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd