Word stats are now collected more memory-efficiently
This commit is contained in:
parent
90dbbca5d5
commit
a8183cf507
|
@ -2,7 +2,9 @@ from collections import defaultdict, Counter
|
||||||
|
|
||||||
class WordStats:
|
class WordStats:
|
||||||
def __init__(self, lemma_features):
|
def __init__(self, lemma_features):
|
||||||
self.all_words = []
|
self.raw_data = defaultdict(lambda: defaultdict(list))
|
||||||
|
self.all_words = 0
|
||||||
|
|
||||||
self.rendered_words = {}
|
self.rendered_words = {}
|
||||||
self.frequent_words = {}
|
self.frequent_words = {}
|
||||||
self.num_words = {}
|
self.num_words = {}
|
||||||
|
@ -11,18 +13,16 @@ class WordStats:
|
||||||
self.memoized_msd_merges = {}
|
self.memoized_msd_merges = {}
|
||||||
|
|
||||||
def add_words(self, words):
|
def add_words(self, words):
|
||||||
self.all_words.extend(words)
|
for w in words:
|
||||||
|
self.raw_data[w.lemma][w.msd].append(w.text)
|
||||||
|
self.all_words += len(words)
|
||||||
|
|
||||||
def num_all_words(self):
|
def num_all_words(self):
|
||||||
return len(self.all_words)
|
return self.all_words
|
||||||
|
|
||||||
def generate_renders(self):
|
def generate_renders(self):
|
||||||
num_words = defaultdict(int)
|
num_words = defaultdict(int)
|
||||||
data = defaultdict(lambda: defaultdict(list))
|
for lemma, ld in self.raw_data.items():
|
||||||
for w in self.all_words:
|
|
||||||
data[w.lemma][w.msd].append(w.text)
|
|
||||||
|
|
||||||
for lemma, ld in data.items():
|
|
||||||
self.rendered_words[lemma] = {}
|
self.rendered_words[lemma] = {}
|
||||||
freq_words = defaultdict(int)
|
freq_words = defaultdict(int)
|
||||||
common_msd = "*" * 10
|
common_msd = "*" * 10
|
||||||
|
@ -45,6 +45,9 @@ class WordStats:
|
||||||
for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
|
for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
|
||||||
self.frequent_words[lemma].append((msd, txt, n))
|
self.frequent_words[lemma].append((msd, txt, n))
|
||||||
|
|
||||||
|
self.num_words = dict(num_words)
|
||||||
|
|
||||||
|
#next, determine lemma's default msds
|
||||||
lf = self.lemma_features
|
lf = self.lemma_features
|
||||||
for lemma in self.lemma_msd:
|
for lemma in self.lemma_msd:
|
||||||
cmsd = self.lemma_msd[lemma]
|
cmsd = self.lemma_msd[lemma]
|
||||||
|
@ -53,8 +56,6 @@ class WordStats:
|
||||||
l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
|
l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.num_words = dict(num_words)
|
|
||||||
|
|
||||||
def merge_msd(self, common_msd, new_msd):
|
def merge_msd(self, common_msd, new_msd):
|
||||||
key = (common_msd, new_msd)
|
key = (common_msd, new_msd)
|
||||||
if key in self.memoized_msd_merges:
|
if key in self.memoized_msd_merges:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user