Save memory in word-stats: store per-form occurrence counts instead of lists of every occurrence

This commit is contained in:
Ozbolt Menegatti 2019-06-16 01:31:40 +02:00
parent 37acabc076
commit dc285ce265

View File

@ -2,7 +2,7 @@ from collections import defaultdict, Counter
class WordStats: class WordStats:
def __init__(self, lemma_features): def __init__(self, lemma_features):
self.raw_data = defaultdict(lambda: defaultdict(list)) self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
self.all_words = 0 self.all_words = 0
self.rendered_words = {} self.rendered_words = {}
@ -14,7 +14,7 @@ class WordStats:
def add_words(self, words): def add_words(self, words):
for w in words: for w in words:
self.raw_data[w.lemma][w.msd].append(w.text) self.raw_data[w.lemma][w.msd][w.text] += 1
self.all_words += len(words) self.all_words += len(words)
def num_all_words(self): def num_all_words(self):
@ -27,15 +27,17 @@ class WordStats:
freq_words = defaultdict(int) freq_words = defaultdict(int)
common_msd = "*" * 10 common_msd = "*" * 10
for msd, texts in ld.items(): for msd, text_counters in ld.items():
num = sum(text_counters.values())
# TODO: this should be out of generate_renders... # TODO: this should be out of generate_renders...
num_words[(lemma, msd[0])] += len(texts) num_words[(lemma, msd[0])] += num
rep = max(set(texts), key=texts.count) rep = max(text_counters, key=text_counters.get)
self.rendered_words[lemma][msd] = (rep, len(texts)) self.rendered_words[lemma][msd] = (rep, num)
for txt in texts: for txt, n in text_counters.items():
freq_words[(msd, txt)] += 1 freq_words[(msd, txt)] += n
common_msd = self.merge_msd(common_msd, msd) common_msd = self.merge_msd(common_msd, msd)