202 lines
6.0 KiB
Python
202 lines
6.0 KiB
Python
from math import log2
|
|
import re
|
|
|
|
from component import ComponentType
|
|
|
|
|
|
class Formatter:
    """Common base for the output formatters defined in this module.

    A formatter supplies two groups of columns: the "repeat" group
    (``header_repeat`` / ``content_repeat``) and the "right" group
    (``header_right`` / ``content_right``).  Presumably the repeat group is
    emitted once per structure component and the right group once per output
    row -- confirm against the writer that drives these classes.

    Subclasses must override the five methods that raise
    ``NotImplementedError`` here; the remaining hooks default to no-ops.
    """

    def __init__(self, colocation_ids, word_renderer):
        """Store both collaborators, then run the subclass setup hook."""
        self.colocation_ids, self.word_renderer = colocation_ids, word_renderer
        # Subclasses put their own state here instead of overriding __init__.
        self.additional_init()

    # --- methods every concrete formatter has to provide -----------------

    def header_repeat(self):
        """Return the list of column names of the "repeat" group."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the list of column names of the "right" group."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the "repeat" cells for the word stored under ``idx``."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq):
        """Return the "right" cells for the current row."""
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return the formatter's grouping flag (a bool in the subclasses here)."""
        raise NotImplementedError("Group for formatter not implemented")

    # --- optional hooks ---------------------------------------------------

    def additional_init(self):
        """Hook called at the end of ``__init__``; does nothing by default."""
        return None

    def length(self):
        """Return how many columns the "repeat" group has."""
        repeated_columns = self.header_repeat()
        return len(repeated_columns)

    def set_structure(self, structure):
        """Hook receiving a structure; ignored by default."""
        return None

    def new_match(self, match):
        """Hook receiving a match; ignored by default."""
        return None
|
|
|
|
|
|
class OutNoStatFormatter(Formatter):
    """Formatter emitting lemma / representative-form columns, no statistics.

    While the per-word cells are produced, the chosen forms are also
    accumulated in ``self.representation``; ``content_right`` turns that
    buffer into the joint representative form and clears it for the next row.
    """

    def additional_init(self):
        """Start with an empty joint-representation buffer."""
        self.representation = ""

    def header_repeat(self):
        """Return the four per-word column names."""
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]

    def header_right(self):
        """Return the two per-row column names."""
        return ["Joint_representative_form", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return the four per-word cells for the word stored under ``idx``.

        Args:
            words: mapping from component index to a word object exposing
                ``lemma`` and ``msd``.
            representations: mapping from component index to the chosen
                representative form, or ``None`` when none was found.
            idx: index of the component being rendered.
            _sidx: unused.

        Returns:
            ``[lemma, representative_form, msd, scenario]``.  The scenario is
            ``"ok"`` when a representation exists, ``"lemma_fallback"`` when
            it is ``None`` (the lemma is used instead), and empty when ``idx``
            has no entry in ``representations`` at all.
        """
        word = words[idx]

        if idx not in representations:
            # One cell per header_repeat() column.  Returning fewer cells
            # here would shift every following column of the row to the left.
            return [word.lemma, "", "", ""]

        rep = representations[idx]
        if rep is None:
            # No representative form: fall back to the lemma, and say so.
            self.representation += " " + word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]

        self.representation += " " + rep
        return [word.lemma, rep, word.msd, "ok"]

    def content_right(self, freq):
        """Return ``[joint_representative_form, str(freq)]`` and reset the buffer."""
        # Collapse runs of spaces left behind by empty representations.
        rep = re.sub(' +', ' ', self.representation.strip())
        result = [rep, str(freq)]
        self.representation = ""
        return result

    def group(self):
        """This formatter's grouping flag is always ``True``."""
        return True

    def __str__(self):
        return "out-no-stat"
|
|
|
|
class AllFormatter(Formatter):
    """Formatter that outputs the raw attributes of each word.

    It contributes no "right" columns and reports a grouping flag of
    ``False``.
    """

    def header_repeat(self):
        """Return the four per-word column names."""
        return ["Token_ID", "Word_form", "Lemma", "Msd"]

    def header_right(self):
        """No per-row columns."""
        return []

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return ``[id, text, lemma, msd]`` of the word stored under ``idx``.

        ``_representations`` and ``_sidx`` are accepted for interface
        compatibility and not used.
        """
        token = words[idx]
        cells = [token.id, token.text, token.lemma, token.msd]
        return cells

    def content_right(self, _freq):
        """No per-row cells; the frequency is ignored."""
        return []

    def group(self):
        """This formatter's grouping flag is always ``False``."""
        return False

    def __str__(self):
        return "all"
|
|
|
|
class StatsFormatter(Formatter):
    """Formatter emitting frequency-based statistics for a matched structure.

    Usage order implied by the state it keeps: ``set_structure`` records which
    component indices are "core" (``self.corew``) and which two form the
    ``Core2w`` pair (``self.jppb``); ``new_match`` then fills ``self.stats``;
    ``content_repeat`` / ``content_right`` read those values.
    """

    def additional_init(self):
        """Initialise all cached state to ``None``."""
        self.stats = None   # filled by new_match()
        self.jppb = None    # filled by set_structure(): the two Core2w indices
        self.corew = None   # filled by set_structure(): all non-Other indices

    @staticmethod
    def stat_str(num):
        """Render ``num`` as text; floats are fixed to five decimals."""
        if isinstance(num, float):
            return "{:.5f}".format(num)
        return str(num)

    def set_structure(self, structure):
        """Record the core component indices of ``structure``.

        Every component whose type is not ``ComponentType.Other`` goes into
        ``self.corew``; those of type ``ComponentType.Core2w`` additionally go
        into ``self.jppb``, of which there must be exactly two.
        """
        pair = []
        core = []

        for comp in structure.components:
            kind = comp.type
            if kind == ComponentType.Core2w:
                pair.append(comp.idx)
            # Deliberately not `elif`: Core2w components are core words too.
            if kind != ComponentType.Other:
                core.append(comp.idx)

        assert len(pair) == 2
        self.jppb = tuple(pair)
        self.corew = tuple(core)

    def new_match(self, match):
        """Compute and cache the statistics for ``match`` in ``self.stats``.

        Stored keys: ``"freq"`` (per core component word frequency),
        ``"d12"`` / ``"d21"`` (reported as Delta_p12 / Delta_p21),
        ``"df"`` (``match.distinct_forms()``) and ``"freq_all"``
        (``len(match)``).

        NOTE(review): a core component missing from ``match.matches[0]`` gets
        frequency 0; if that is one of the two ``jppb`` components the
        divisions below raise ``ZeroDivisionError``.
        """
        self.stats = {"freq": {}}
        counts = self.stats["freq"]

        for cid in self.corew:
            if cid in match.matches[0]:
                word = match.matches[0][cid]
                # msd[0]: presumably the leading category character of the
                # tag -- confirm against word_renderer.num_words.
                counts[cid] = self.word_renderer.num_words(word.lemma, word.msd[0])
            else:
                counts[cid] = 0

        fx = counts[self.jppb[0]]
        fy = counts[self.jppb[1]]
        joint = len(match)
        total = self.word_renderer.num_all_words()

        self.stats['d12'] = joint / fx - (fy - joint) / (total - fx)
        self.stats['d21'] = joint / fy - (fx - joint) / (total - fy)

        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = joint

    def header_repeat(self):
        """Return the single per-word column name."""
        return ["Distribution"]

    def header_right(self):
        """Return the five per-row column names."""
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the dispersion cell for a core word, or blanks otherwise.

        The value is looked up in ``self.colocation_ids.dispersions`` under
        the key ``(sidx, idx, lemma)``.  ``representations`` is not used.
        """
        if idx in self.corew:
            lemma = words[idx].lemma
            dispersion = self.colocation_ids.dispersions[(sidx, idx, lemma)]
            return [self.stat_str(dispersion)]

        # not a core word
        return [""] * self.length()

    def content_right(self, freq):
        """Return the five per-row statistic cells as strings.

        The ``freq`` argument is not used: the joint frequency cached by
        ``new_match`` under ``"freq_all"`` is used instead.

        NOTE(review): raises ``ZeroDivisionError`` when the relevant word
        frequencies sum to 0, and ``ValueError`` from ``log2`` when the joint
        frequency is 0.
        """
        per_word = self.stats["freq"]
        fx = per_word[self.jppb[0]]
        fy = per_word[self.jppb[1]]
        joint = self.stats['freq_all']
        logdice_core = 14 + log2(2 * joint / (fx + fy))

        # Only core words with a non-zero frequency take part in LogDice_all.
        nonzero = [per_word[idx] for idx in self.corew if per_word[idx] > 0]
        logdice_all = 14 + log2(len(nonzero) * joint / sum(nonzero))

        columns = (
            self.stats["d12"],
            self.stats["d21"],
            logdice_core,
            logdice_all,
            self.stats['df'],
        )
        return [self.stat_str(value) for value in columns]

    def group(self):
        """This formatter's grouping flag is always ``True``."""
        return True

    def __str__(self):
        return "stat"
|
|
|
|
class OutFormatter(Formatter):
    """Composite formatter: ``OutNoStatFormatter`` columns, then ``StatsFormatter`` columns.

    Both wrapped formatters share this instance's ``colocation_ids`` and
    ``word_renderer``.  Structure and match notifications are forwarded only
    to the statistics formatter.
    """

    def additional_init(self):
        """Create the two wrapped formatters."""
        shared = (self.colocation_ids, self.word_renderer)
        self.f1 = OutNoStatFormatter(*shared)
        self.f2 = StatsFormatter(*shared)

    def header_repeat(self):
        """Per-word column names of both formatters, ``f1`` first."""
        left = self.f1.header_repeat()
        right = self.f2.header_repeat()
        return left + right

    def header_right(self):
        """Per-row column names of both formatters, ``f1`` first."""
        left = self.f1.header_right()
        right = self.f2.header_right()
        return left + right

    def content_repeat(self, words, representations, idx, sidx):
        """Per-word cells of both formatters, ``f1`` first."""
        return (
            self.f1.content_repeat(words, representations, idx, sidx)
            + self.f2.content_repeat(words, representations, idx, sidx)
        )

    def content_right(self, freq):
        """Per-row cells of both formatters, ``f1`` first."""
        left = self.f1.content_right(freq)
        right = self.f2.content_right(freq)
        return left + right

    def group(self):
        """Combine both grouping flags with ``and``."""
        return self.f1.group() and self.f2.group()

    def set_structure(self, structure):
        """Forward the structure to the statistics formatter."""
        self.f2.set_structure(structure)

    def new_match(self, match):
        """Forward the match to the statistics formatter."""
        self.f2.new_match(match)

    def __str__(self):
        return "out"