from collections import defaultdict, Counter

from progress_bar import progress


class WordStats:
    """Accumulate word-form counts per lemma and derive lookup tables.

    Usage is two-phase: feed words with ``add_words`` and then call
    ``generate_renders`` to (re)build the derived tables read by ``render``,
    ``available_words`` and ``get_lemma_msd``.

    Attributes set up by ``__init__``:
        raw_data: ``lemma -> msd -> text -> count`` of every word added.
        all_words: total number of words added so far.
        rendered_words: ``lemma -> msd -> (text, count)`` where ``text`` is a
            most frequent text for that lemma/msd and ``count`` is the total
            number of occurrences of that lemma/msd.
        frequent_words: ``lemma -> [(msd, text, count), ...]`` ordered by
            descending count.
        num_words: ``(lemma, msd[0]) -> count``.
        lemma_msd: ``lemma -> msd string`` merged over all forms of the lemma.
        lemma_features: mapping supplied by the caller, keyed by the first
            character of an MSD.
        memoized_msd_merges: cache used by ``merge_msd``.
    """

    def __init__(self, lemma_features):
        """Create an empty collector.

        Args:
            lemma_features: Mapping from the first character of an MSD to a
                string that is overlaid on the merged MSD of a lemma (see
                ``generate_renders``) and returned by ``get_lemma_msd``.
        """
        self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        self.all_words = 0

        self.rendered_words = {}
        self.frequent_words = {}
        self.num_words = {}
        self.lemma_msd = {}
        self.lemma_features = lemma_features
        self.memoized_msd_merges = {}

    def add_words(self, words):
        """Record every word in ``words``.

        Args:
            words: Iterable of objects exposing ``lemma``, ``msd`` and
                ``text`` attributes. Any iterable is accepted, including
                generators.
        """
        # Count while iterating instead of calling len(words): len() fails on
        # generators, and it would only fail after raw_data had already been
        # updated, leaving all_words out of sync.
        added = 0
        for word in words:
            self.raw_data[word.lemma][word.msd][word.text] += 1
            added += 1
        self.all_words += added

    def num_all_words(self):
        """Return the total number of words added so far."""
        return self.all_words

    def generate_renders(self):
        """Rebuild all derived tables from ``raw_data``.

        Fills ``rendered_words``, ``frequent_words``, ``num_words`` and
        ``lemma_msd``. Entries for lemmas present in ``raw_data`` are
        overwritten, so the method may be called again after more words
        have been added.
        """
        num_words = defaultdict(int)

        for lemma, forms_by_msd in progress(self.raw_data.items(), "lemma-render"):
            renders = self.rendered_words[lemma] = {}
            freq_words = defaultdict(int)
            # "*" marks a position for which no form has been merged yet.
            common_msd = "*" * 10

            for msd, text_counters in forms_by_msd.items():
                num = sum(text_counters.values())

                # TODO: this should be out of generate_renders...
                num_words[(lemma, msd[0])] += num

                # On ties max() keeps the first text encountered.
                representative = max(text_counters, key=text_counters.get)
                renders[msd] = (representative, num)

                for txt, n in text_counters.items():
                    freq_words[(msd, txt)] += n

                common_msd = self.merge_msd(common_msd, msd)

            self.lemma_msd[lemma] = common_msd

            # sorted() is stable, so equal counts keep their insertion order.
            by_count = sorted(freq_words.items(), key=lambda item: -item[1])
            self.frequent_words[lemma] = [
                (msd, txt, n) for (msd, txt), n in by_count
            ]

        self.num_words = dict(num_words)

        # Next, determine each lemma's default msd: where lemma_features has
        # an entry for the merged msd's first character, keep that entry's
        # characters and fall back to the merged character only where the
        # entry has "-". zip() stops at the shorter of the two strings.
        lf = self.lemma_features
        for lemma in self.lemma_msd:
            cmsd = self.lemma_msd[lemma]
            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
                    feature if feature != "-" else merged
                    for feature, merged in zip(lf[cmsd[0]], cmsd)
                )

    def merge_msd(self, common_msd, new_msd):
        """Merge ``new_msd`` into ``common_msd`` position by position.

        For each position: ``"*"`` in ``common_msd`` is replaced by the
        character from ``new_msd``; differing characters become ``"-"``;
        equal characters are kept. The result is as long as the shorter
        input because ``zip`` truncates. Results are cached per input pair.

        Args:
            common_msd: MSD merged so far.
            new_msd: MSD to merge in.

        Returns:
            The merged MSD string.
        """
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        def merge_letter(current, incoming):
            if current == "*":
                return incoming
            if current != incoming:
                return "-"
            return current

        value = "".join(
            merge_letter(current, incoming)
            for current, incoming in zip(common_msd, new_msd)
        )
        self.memoized_msd_merges[key] = value
        return value

    def render(self, lemma, msd):
        """Return the representative text for ``lemma``/``msd``.

        Returns:
            The text stored by ``generate_renders``, or ``None`` when the
            lemma or the msd is unknown.
        """
        entry = self.rendered_words.get(lemma, {}).get(msd)
        if entry is None:
            return None
        return entry[0]

    def available_words(self, lemma, existing_texts):
        """Yield candidate ``(msd, text)`` pairs for ``lemma``.

        Pairs from ``existing_texts`` come first, most common first; they are
        followed by the pairs in ``frequent_words[lemma]`` (if any) that did
        not occur in ``existing_texts``.

        Args:
            lemma: Lemma to look up in ``frequent_words``.
            existing_texts: Iterable of ``(msd, text)`` pairs.

        Yields:
            ``(msd, text)`` tuples.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        for msd, text, _ in self.frequent_words.get(lemma, ()):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def get_lemma_msd(self, lemma, word_msd):
        """Return the msd to use for ``lemma``.

        If the stored lemma msd does not start with ``"-"`` it is returned
        unchanged. Otherwise the ``lemma_features`` entry for the first
        character of ``word_msd`` is returned, or ``"-"`` if there is none.

        Args:
            lemma: Lemma previously processed by ``generate_renders``.
            word_msd: MSD of the word form, used as the fallback key.

        Raises:
            KeyError: If ``lemma`` has no entry in ``lemma_msd``.
        """
        # should be here, since we collect every lemmas
        lemma_msd = self.lemma_msd[lemma]

        if lemma_msd[0] != '-':
            return lemma_msd

        if word_msd[0] in self.lemma_features:
            return self.lemma_features[word_msd[0]]
        return '-'