from collections import defaultdict, Counter


class WordStats:
    """Accumulate word occurrences and derive per-lemma statistics.

    Words are objects exposing ``lemma``, ``msd`` and ``text`` attributes.
    After ``generate_renders()`` has run, the instance exposes:

    * ``rendered_words[lemma][msd]`` -> ``(most frequent text, occurrences)``
    * ``frequent_words[lemma]`` -> list of ``(msd, text, count)`` sorted by
      descending count
    * ``num_words[(lemma, first msd char)]`` -> number of occurrences
    * ``lemma_msd[lemma]`` -> MSD pattern shared by all forms of the lemma,
      with ``"-"`` at positions where the forms disagree
    """

    def __init__(self, lemma_features):
        """Create an empty collector.

        ``lemma_features`` maps the first MSD character to a feature string;
        its non-``"-"`` characters override the computed lemma MSD.
        """
        self.all_words = []
        self.rendered_words = {}
        self.frequent_words = {}
        self.num_words = {}
        self.lemma_msd = {}
        self.lemma_features = lemma_features
        # Per-instance cache for merge_msd, keyed by (common_msd, new_msd).
        self.memoized_msd_merges = {}

    def add_words(self, words):
        """Append ``words`` to the collected word list."""
        self.all_words.extend(words)

    def num_all_words(self):
        """Return the number of words collected so far."""
        return len(self.all_words)

    def generate_renders(self):
        """Recompute all per-lemma statistics from ``self.all_words``."""
        num_words = defaultdict(int)

        # lemma -> msd -> list of surface texts, in order of appearance.
        texts_by_lemma = defaultdict(lambda: defaultdict(list))
        for word in self.all_words:
            texts_by_lemma[word.lemma][word.msd].append(word.text)

        for lemma, texts_by_msd in texts_by_lemma.items():
            renders = {}
            form_counts = Counter()
            # Wildcard seed: "*" adopts whatever letter it is merged with.
            # NOTE: because merge_msd zips its arguments, this seed also caps
            # the merged MSD at 10 characters.
            common_msd = "*" * 10

            for msd, texts in texts_by_msd.items():
                # TODO: this should be out of generate_renders...
                num_words[(lemma, msd[0])] += len(texts)

                # Count once instead of calling texts.count() per candidate;
                # on ties the first-seen text wins, so the result does not
                # depend on set iteration order.
                text_counts = Counter(texts)
                representative = max(text_counts, key=text_counts.get)
                renders[msd] = (representative, len(texts))

                for text, count in text_counts.items():
                    form_counts[(msd, text)] += count

                common_msd = self.merge_msd(common_msd, msd)

            self.rendered_words[lemma] = renders
            self.lemma_msd[lemma] = common_msd
            # most_common() sorts by descending count and keeps first-seen
            # order among equal counts.
            self.frequent_words[lemma] = [
                (msd, text, count)
                for (msd, text), count in form_counts.most_common()
            ]

        # Overlay configured lemma features: a non-"-" feature letter wins,
        # otherwise the letter computed from the data is kept.
        features = self.lemma_features
        for lemma, computed in self.lemma_msd.items():
            if computed[0] in features:
                self.lemma_msd[lemma] = "".join(
                    feature if feature != "-" else letter
                    for feature, letter in zip(features[computed[0]], computed)
                )

        self.num_words = dict(num_words)

    def merge_msd(self, common_msd, new_msd):
        """Merge two MSD strings position by position (memoized).

        ``"*"`` in ``common_msd`` takes the letter from ``new_msd``; equal
        letters are kept; differing letters become ``"-"``. The result is as
        long as the shorter argument.
        """
        key = (common_msd, new_msd)
        cached = self.memoized_msd_merges.get(key)
        if cached is not None:
            return cached

        merged = "".join(
            new if common == "*" else (common if common == new else "-")
            for common, new in zip(common_msd, new_msd)
        )
        self.memoized_msd_merges[key] = merged
        return merged

    def render(self, lemma, msd):
        """Return the most frequent text for ``(lemma, msd)``, or ``None``."""
        entry = self.rendered_words.get(lemma, {}).get(msd)
        if entry is None:
            return None
        return entry[0]

    def available_words(self, lemma, existing_texts):
        """Yield ``(msd, text)`` candidates for ``lemma``.

        Pairs from ``existing_texts`` come first, most frequent first,
        followed by the lemma's known forms (by descending frequency) that
        were not already yielded.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _count in counted_texts.most_common():
            yield (msd, text)

        for msd, text, _count in self.frequent_words.get(lemma, ()):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def get_lemma_msd(self, lemma, word_msd):
        """Return the MSD to use for ``lemma``.

        When the computed lemma MSD has no agreed first letter (``"-"``),
        fall back to the configured features for the word's own category,
        or ``"-"`` when none are configured.

        Raises ``KeyError`` if ``lemma`` has no computed MSD; every lemma
        seen by ``generate_renders()`` is expected to have one.
        """
        lemma_msd = self.lemma_msd[lemma]
        if lemma_msd[0] != '-':
            return lemma_msd

        category = word_msd[0]
        if category in self.lemma_features:
            return self.lemma_features[category]
        return '-'