From a8183cf50735c58713b81e841a40df26261cf851 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Sat, 15 Jun 2019 22:20:20 +0200 Subject: [PATCH] word stats now collected more memory-efficiently --- src/word_stats.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/word_stats.py b/src/word_stats.py index ac02b3c..472c4a0 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -2,7 +2,9 @@ from collections import defaultdict, Counter class WordStats: def __init__(self, lemma_features): - self.all_words = [] + self.raw_data = defaultdict(lambda: defaultdict(list)) + self.all_words = 0 + self.rendered_words = {} self.frequent_words = {} self.num_words = {} @@ -11,18 +13,16 @@ class WordStats: self.memoized_msd_merges = {} def add_words(self, words): - self.all_words.extend(words) + for w in words: + self.raw_data[w.lemma][w.msd].append(w.text) + self.all_words += len(words) def num_all_words(self): - return len(self.all_words) + return self.all_words def generate_renders(self): num_words = defaultdict(int) - data = defaultdict(lambda: defaultdict(list)) - for w in self.all_words: - data[w.lemma][w.msd].append(w.text) - - for lemma, ld in data.items(): + for lemma, ld in self.raw_data.items(): self.rendered_words[lemma] = {} freq_words = defaultdict(int) common_msd = "*" * 10 @@ -45,6 +45,9 @@ class WordStats: for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]): self.frequent_words[lemma].append((msd, txt, n)) + self.num_words = dict(num_words) + + #next, determine lemma's default msds lf = self.lemma_features for lemma in self.lemma_msd: cmsd = self.lemma_msd[lemma] @@ -52,8 +55,6 @@ class WordStats: self.lemma_msd[lemma] = "".join( l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd) ) - - self.num_words = dict(num_words) def merge_msd(self, common_msd, new_msd): key = (common_msd, new_msd)