From dc285ce265280f00d6a20179cf07d3f294b59eae Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Sun, 16 Jun 2019 01:31:40 +0200 Subject: [PATCH] Saving memory in word-stats --- src/word_stats.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/word_stats.py b/src/word_stats.py index 472c4a0..fea4072 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -2,7 +2,7 @@ from collections import defaultdict, Counter class WordStats: def __init__(self, lemma_features): - self.raw_data = defaultdict(lambda: defaultdict(list)) + self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) self.all_words = 0 self.rendered_words = {} @@ -14,7 +14,7 @@ class WordStats: def add_words(self, words): for w in words: - self.raw_data[w.lemma][w.msd].append(w.text) + self.raw_data[w.lemma][w.msd][w.text] += 1 self.all_words += len(words) def num_all_words(self): @@ -27,15 +27,17 @@ class WordStats: freq_words = defaultdict(int) common_msd = "*" * 10 - for msd, texts in ld.items(): + for msd, text_counters in ld.items(): + num = sum(text_counters.values()) + # TODO: this should be out of generate_renders... - num_words[(lemma, msd[0])] += len(texts) + num_words[(lemma, msd[0])] += num - rep = max(set(texts), key=texts.count) - self.rendered_words[lemma][msd] = (rep, len(texts)) + rep = max(text_counters, key=text_counters.get) + self.rendered_words[lemma][msd] = (rep, num) - for txt in texts: - freq_words[(msd, txt)] += 1 + for txt, n in text_counters.items(): + freq_words[(msd, txt)] += n common_msd = self.merge_msd(common_msd, msd)