diff --git a/src/word_stats.py b/src/word_stats.py
index 6a4e612..0462bee 100644
--- a/src/word_stats.py
+++ b/src/word_stats.py
@@ -7,9 +7,7 @@ class WordStats:
     def __init__(self, lemma_features, db):
         self.lemma_features = lemma_features
         self.db = db
-
-        self.all_words = 0
-        self.memoized_msd_merges = {}
+        self.all_words = None
 
         self.db.init("""CREATE TABLE UniqWords (
             uw_id INTEGER PRIMARY KEY,
@@ -19,6 +17,7 @@ class WordStats:
             frequency int
             )""")
         self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
+        self.db.init("CREATE TABLE NumWords (id INTEGER PRIMARY KEY, n INTEGER)")
 
         self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
         self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
@@ -34,10 +33,16 @@ class WordStats:
                 self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)
                     VALUES (:lemma, :msd, :text, 1)""", params)
 
-        self.db.commit()
-        self.all_words += len(words)
+        self.db.execute("INSERT INTO NumWords (n) VALUES (?)", (len(words),))
+        # Drop the cached total so num_all_words() re-reads it after this insert.
+        self.all_words = None
 
     def num_all_words(self):
+        """Return the total word count, summed lazily from NumWords and cached."""
+        if self.all_words is None:
+            # coalesce: sum() over an empty table yields NULL, which int() rejects.
+            cur = self.db.execute("SELECT coalesce(sum(n), 0) FROM NumWords")
+            self.all_words = int(cur.fetchone()[0])
         return self.all_words
 
     def generate_renders(self):