From dc285ce265280f00d6a20179cf07d3f294b59eae Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Sun, 16 Jun 2019 01:31:40 +0200
Subject: [PATCH] Save memory in word-stats by storing per-text counts instead of text lists

---
 src/word_stats.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/word_stats.py b/src/word_stats.py
index 472c4a0..fea4072 100644
--- a/src/word_stats.py
+++ b/src/word_stats.py
@@ -2,7 +2,7 @@ from collections import defaultdict, Counter
 
 class WordStats:
     def __init__(self, lemma_features):
-        self.raw_data = defaultdict(lambda: defaultdict(list))
+        self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
         self.all_words = 0
 
         self.rendered_words = {}
@@ -14,7 +14,7 @@ class WordStats:
 
     def add_words(self, words):
         for w in words:
-            self.raw_data[w.lemma][w.msd].append(w.text)
+            self.raw_data[w.lemma][w.msd][w.text] += 1
         self.all_words += len(words)
 
     def num_all_words(self):
@@ -27,15 +27,17 @@ class WordStats:
             freq_words = defaultdict(int)
             common_msd = "*" * 10
 
-            for msd, texts in ld.items():
+            for msd, text_counters in ld.items():
+                num = sum(text_counters.values())
+
                 # TODO: this should be out of generate_renders...
-                num_words[(lemma, msd[0])] += len(texts)
+                num_words[(lemma, msd[0])] += num
 
-                rep = max(set(texts), key=texts.count)
-                self.rendered_words[lemma][msd] = (rep, len(texts))
+                rep = max(text_counters, key=text_counters.get)
+                self.rendered_words[lemma][msd] = (rep, num)
 
-                for txt in texts:
-                    freq_words[(msd, txt)] += 1
+                for txt, n in text_counters.items():
+                    freq_words[(msd, txt)] += n
 
                 common_msd = self.merge_msd(common_msd, msd)