"""Formatters that supply header names and cell values for output rows.

Every formatter exposes two groups of columns:

* "repeat" columns (``header_repeat`` / ``content_repeat``), whose cells are
  produced for one word index at a time;
* "right" columns (``header_right`` / ``content_right``), whose cells are
  produced from a frequency value.

NOTE(review): how the caller lays these columns out (presumably the repeat
group once per component, the right group once per row) is not visible in this
module -- confirm against the writer that drives the formatters.
"""
from math import log2
import re

from component import ComponentType


class Formatter:
    """Base class defining the formatter interface.

    Subclasses must implement ``header_repeat``, ``header_right``,
    ``content_repeat``, ``content_right`` and ``group``; the remaining methods
    are optional hooks with no-op defaults.
    """

    def __init__(self, colocation_ids, word_renderer):
        """Store the collaborators and run the subclass hook.

        Args:
            colocation_ids: Object kept on ``self.colocation_ids``
                (``StatsFormatter`` reads its ``dispersions`` mapping).
            word_renderer: Object kept on ``self.word_renderer``
                (``StatsFormatter`` reads ``num_words`` and calls
                ``num_all_words()`` on it).
        """
        self.colocation_ids = colocation_ids
        self.word_renderer = word_renderer
        # Subclasses set up their own state here instead of overriding
        # __init__.
        self.additional_init()

    def header_repeat(self):
        """Return the list of header names of the repeat columns."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the list of header names of the right columns."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the repeat-column cells for the word at ``idx``."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq):
        """Return the right-column cells for a row with frequency ``freq``."""
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return a boolean flag describing this formatter's grouping mode.

        NOTE(review): the meaning of the flag is decided by the caller and is
        not visible here.
        """
        raise NotImplementedError("Group for formatter not implemented")

    def additional_init(self):
        """Hook called at the end of ``__init__``; does nothing by default."""
        pass

    def length(self):
        """Return the number of repeat columns."""
        return len(self.header_repeat())

    def set_structure(self, structure):
        """Hook receiving a structure; does nothing by default."""
        pass

    def new_match(self, match):
        """Hook receiving a match; does nothing by default."""
        pass


class OutNoStatFormatter(Formatter):
    """Formatter emitting lemmas and representative forms, without statistics.

    ``content_repeat`` accumulates the chosen form of every word in
    ``self.representation``; ``content_right`` emits the joined string and
    resets the accumulator.
    """

    def additional_init(self):
        """Start with an empty joint representation."""
        self.representation = ""

    def header_repeat(self):
        """Return the header names of the repeat columns."""
        return ["Lemma", "Representative_form", "RF_scenario"]

    def header_right(self):
        """Return the header names of the right columns."""
        return ["Joint_representative_form", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return ``[lemma, representative form, scenario]`` for one word.

        Args:
            words: Container indexed by ``idx``; items expose ``lemma``.
            representations: Container mapping ``idx`` to a representative
                form, or to ``None`` when no form is available.
            idx: Index of the word to render.
            _sidx: Unused.

        Returns:
            A three-item list. When ``idx`` has no entry in
            ``representations`` the last two cells are empty and the joint
            representation is left untouched. When the entry is ``None`` the
            lemma is used instead and the scenario is ``"lemma_fallback"``;
            otherwise the scenario is ``"ok"``.
        """
        word = words[idx]
        if idx not in representations:
            return [word.lemma, "", ""]

        rep = representations[idx]
        if rep is None:
            self.representation += " " + word.lemma
            return [word.lemma, word.lemma, "lemma_fallback"]

        self.representation += " " + rep
        return [word.lemma, rep, "ok"]

    def content_right(self, freq):
        """Return ``[joint representation, str(freq)]`` and reset the state.

        The accumulated representation is stripped and runs of spaces are
        collapsed into a single space before it is emitted.
        """
        rep = re.sub(r" +", " ", self.representation.strip())
        result = [rep, str(freq)]
        # Reset so the next row starts from an empty joint representation.
        self.representation = ""
        return result

    def group(self):
        """Return ``True``."""
        return True

    def __str__(self):
        return "out-no-stat"


class AllFormatter(Formatter):
    """Formatter emitting the raw attributes of every word; no right columns."""

    def header_repeat(self):
        """Return the header names of the repeat columns."""
        return ["Token_ID", "Word_form", "Lemma", "Msd"]

    def header_right(self):
        """Return an empty list: this formatter has no right columns."""
        return []

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return ``[id, text, lemma, msd]`` of the word at ``idx``."""
        word = words[idx]
        return [word.id, word.text, word.lemma, word.msd]

    def content_right(self, _freq):
        """Return an empty list: this formatter has no right columns."""
        return []

    def group(self):
        """Return ``False``."""
        return False

    def __str__(self):
        return "all"


class StatsFormatter(Formatter):
    """Formatter emitting per-word distribution and per-match statistics.

    Usage order required by the code: ``set_structure`` (fills ``jppb`` and
    ``corew``), then ``new_match`` (fills ``stats``), then ``content_repeat``
    / ``content_right``.
    """

    def additional_init(self):
        """Initialise the state that the later calls fill in."""
        # Statistics of the current match; set by new_match().
        self.stats = None
        # Indices of the two Core2w components; set by set_structure().
        self.jppb = None
        # Indices of every component whose type is not Other; set by
        # set_structure().
        self.corew = None

    @staticmethod
    def stat_str(num):
        """Format ``num`` with five decimals if it is a float, else ``str``."""
        return "{:.5f}".format(num) if isinstance(num, float) else str(num)

    def set_structure(self, structure):
        """Record the component indices needed for the statistics.

        Args:
            structure: Object whose ``components`` items expose ``type`` and
                ``idx``.

        Raises:
            AssertionError: If the structure does not contain exactly two
                ``ComponentType.Core2w`` components.
        """
        jppb = []
        corew = []
        for component in structure.components:
            if component.type == ComponentType.Core2w:
                jppb.append(component.idx)
            if component.type != ComponentType.Other:
                corew.append(component.idx)

        # Raised explicitly instead of through an ``assert`` statement so the
        # check is not stripped under ``python -O``; the exception type is
        # kept the same for existing callers.
        if len(jppb) != 2:
            raise AssertionError(
                "expected exactly 2 Core2w components, found {}".format(
                    len(jppb)))

        self.jppb = tuple(jppb)
        self.corew = tuple(corew)

    def new_match(self, match):
        """Compute and store the statistics of ``match`` in ``self.stats``.

        Args:
            match: Object supporting ``len()``, ``distinct_forms()`` and a
                ``matches`` sequence whose first item maps component indices
                to words exposing ``lemma`` and ``msd``.
        """
        self.stats = {"freq": {}}
        word_freqs = self.stats["freq"]

        for cid in self.corew:
            if cid not in match.matches[0]:
                # Component absent from the match: counts as frequency 0.
                word_freqs[cid] = 0
            else:
                word = match.matches[0][cid]
                # Lookup is keyed by the lemma and the first item of the msd.
                word_freqs[cid] = self.word_renderer.num_words[
                    (word.lemma, word.msd[0])]

        fx = word_freqs[self.jppb[0]]
        fy = word_freqs[self.jppb[1]]
        freq = len(match)
        N = self.word_renderer.num_all_words()

        # NOTE(review): these divisions raise ZeroDivisionError when fx or fy
        # is 0 (e.g. a Core2w component missing from the match) or equals N.
        # Confirm the caller guarantees that cannot happen.
        self.stats['d12'] = freq / fx - (fy - freq) / (N - fx)
        self.stats['d21'] = freq / fy - (fx - freq) / (N - fy)
        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = freq

    def header_repeat(self):
        """Return the header names of the repeat columns."""
        return ["Distribution"]

    def header_right(self):
        """Return the header names of the right columns."""
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all",
                "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the distribution cell for the word at ``idx``.

        Indices that are not in ``self.corew`` yield empty cells. Otherwise
        the value is read from ``self.colocation_ids.dispersions`` under the
        key ``(sidx, idx, lemma)``.
        """
        # not a core word
        if idx not in self.corew:
            return [""] * self.length()

        word = words[idx]
        key = (sidx, idx, word.lemma)
        distribution = self.colocation_ids.dispersions[key]
        return [self.stat_str(distribution)]

    def content_right(self, freq):
        """Return the formatted statistics stored by ``new_match``.

        Args:
            freq: Ignored; the frequency saved by ``new_match`` under
                ``stats['freq_all']`` is used instead.

        Returns:
            Five strings in the order given by ``header_right``.
        """
        word_freqs = self.stats["freq"]
        fx = word_freqs[self.jppb[0]]
        fy = word_freqs[self.jppb[1]]
        match_freq = self.stats['freq_all']

        # NOTE(review): log2 raises ValueError for a zero argument and the
        # divisions raise ZeroDivisionError when the summed frequencies are 0.
        logdice_core = 14 + log2(2 * match_freq / (fx + fy))

        # Only components with a non-zero frequency take part in the
        # all-component score.
        nonzero = [f for f in (word_freqs[idx] for idx in self.corew)
                   if f > 0]
        logdice_all = 14 + log2(len(nonzero) * match_freq / sum(nonzero))

        return [self.stat_str(x) for x in (
            self.stats["d12"],
            self.stats["d21"],
            logdice_core,
            logdice_all,
            self.stats['df']
        )]

    def group(self):
        """Return ``True``."""
        return True

    def __str__(self):
        return "stat"


class OutFormatter(Formatter):
    """Formatter concatenating ``OutNoStatFormatter`` and ``StatsFormatter``.

    Every header/content call returns the first formatter's cells followed by
    the second's; structure and match notifications go to the statistics
    formatter only.
    """

    def additional_init(self):
        """Create the two wrapped formatters with the same collaborators."""
        self.f1 = OutNoStatFormatter(self.colocation_ids, self.word_renderer)
        self.f2 = StatsFormatter(self.colocation_ids, self.word_renderer)

    def header_repeat(self):
        """Return the repeat headers of both wrapped formatters."""
        return self.f1.header_repeat() + self.f2.header_repeat()

    def header_right(self):
        """Return the right headers of both wrapped formatters."""
        return self.f1.header_right() + self.f2.header_right()

    def content_repeat(self, words, representations, idx, sidx):
        """Return the repeat cells of both wrapped formatters."""
        cr1 = self.f1.content_repeat(words, representations, idx, sidx)
        cr2 = self.f2.content_repeat(words, representations, idx, sidx)
        return cr1 + cr2

    def content_right(self, freq):
        """Return the right cells of both wrapped formatters."""
        return self.f1.content_right(freq) + self.f2.content_right(freq)

    def group(self):
        """Return ``True`` only if both wrapped formatters return a truthy flag."""
        return self.f1.group() and self.f2.group()

    def set_structure(self, structure):
        """Forward the structure to the statistics formatter."""
        self.f2.set_structure(structure)

    def new_match(self, match):
        """Forward the match to the statistics formatter."""
        self.f2.new_match(match)

    def __str__(self):
        return "out"