Saving memory in word-stats
This commit is contained in:
parent
37acabc076
commit
dc285ce265
|
@ -2,7 +2,7 @@ from collections import defaultdict, Counter
|
|||
|
||||
class WordStats:
|
||||
def __init__(self, lemma_features):
|
||||
self.raw_data = defaultdict(lambda: defaultdict(list))
|
||||
self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
|
||||
self.all_words = 0
|
||||
|
||||
self.rendered_words = {}
|
||||
|
@ -14,7 +14,7 @@ class WordStats:
|
|||
|
||||
def add_words(self, words):
|
||||
for w in words:
|
||||
self.raw_data[w.lemma][w.msd].append(w.text)
|
||||
self.raw_data[w.lemma][w.msd][w.text] += 1
|
||||
self.all_words += len(words)
|
||||
|
||||
def num_all_words(self):
|
||||
|
@ -27,15 +27,17 @@ class WordStats:
|
|||
freq_words = defaultdict(int)
|
||||
common_msd = "*" * 10
|
||||
|
||||
for msd, texts in ld.items():
|
||||
for msd, text_counters in ld.items():
|
||||
num = sum(text_counters.values())
|
||||
|
||||
# TODO: this should be out of generate_renders...
|
||||
num_words[(lemma, msd[0])] += len(texts)
|
||||
num_words[(lemma, msd[0])] += num
|
||||
|
||||
rep = max(set(texts), key=texts.count)
|
||||
self.rendered_words[lemma][msd] = (rep, len(texts))
|
||||
rep = max(text_counters, key=text_counters.get)
|
||||
self.rendered_words[lemma][msd] = (rep, num)
|
||||
|
||||
for txt in texts:
|
||||
freq_words[(msd, txt)] += 1
|
||||
for txt, n in text_counters.items():
|
||||
freq_words[(msd, txt)] += n
|
||||
|
||||
common_msd = self.merge_msd(common_msd, msd)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user