luscenje_struktur/src/formatter.py

202 lines
6.0 KiB
Python
Raw Normal View History

from math import log2
import re
from component import ComponentType
class Formatter:
    """Base class describing the column layout of one output format.

    Subclasses provide the header/content hooks; the ones defined here
    either raise ``NotImplementedError`` or do nothing.
    """

    def __init__(self, colocation_ids, word_renderer):
        """Store both collaborators, then run the subclass set-up hook."""
        self.colocation_ids = colocation_ids
        self.word_renderer = word_renderer
        # Subclasses initialise their own state here instead of
        # overriding __init__.
        self.additional_init()

    # -- optional hooks: no-ops unless a subclass overrides them ----------

    def additional_init(self):
        """Hook called at the end of ``__init__``; does nothing by default."""

    def set_structure(self, structure):
        """Hook receiving a structure; does nothing by default."""

    def new_match(self, match):
        """Hook receiving a match; does nothing by default."""

    # -- mandatory hooks: every concrete formatter must override these ----

    def header_repeat(self):
        """Return the repeated header columns (must be overridden)."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the right-hand header columns (must be overridden)."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the repeated cells for component ``idx`` (must be overridden)."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq):
        """Return the right-hand cells (must be overridden)."""
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return the grouping flag of this formatter (must be overridden)."""
        raise NotImplementedError("Group for formatter not implemented")

    # -- derived helpers ---------------------------------------------------

    def length(self):
        """Return how many columns ``header_repeat`` produces."""
        repeated_columns = self.header_repeat()
        return len(repeated_columns)
class OutNoStatFormatter(Formatter):
    """Formatter emitting lemma / representative-form columns without statistics.

    While the repeated cells are produced, the representative forms are
    accumulated in ``self.representation``; ``content_right`` then emits the
    joined form together with the frequency and resets the accumulator.
    """

    def additional_init(self):
        """Start with an empty joint-representation accumulator."""
        self.representation = ""

    def header_repeat(self):
        """Return the four per-component column names."""
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]

    def header_right(self):
        """Return the two right-hand column names."""
        return ["Joint_representative_form", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return the four per-component cells for component ``idx``.

        Args:
            words: mapping/sequence indexed by ``idx`` giving the word, which
                must expose ``lemma`` and ``msd``.
            representations: mapping from component index to its
                representative form, or ``None`` when no form is available.
            idx: component index to render.
            _sidx: unused.

        Returns:
            A list with exactly as many cells as ``header_repeat`` has
            columns.
        """
        word = words[idx]
        if idx not in representations:
            # Pad to the full header width (4 columns). The row used to have
            # only 3 cells, which shifted every column written after it.
            return [word.lemma, "", "", ""]

        rep = representations[idx]
        if rep is None:
            # No representative form: fall back to the lemma and say so in
            # the scenario column.
            self.representation += " " + word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]

        self.representation += " " + rep
        return [word.lemma, rep, word.msd, "ok"]

    def content_right(self, freq):
        """Return ``[joint representative form, str(freq)]`` and reset state.

        The accumulated text is stripped and runs of spaces are collapsed to
        a single space before it is emitted.
        """
        rep = re.sub(' +', ' ', self.representation.strip())
        result = [rep, str(freq)]
        # Reset so the next match starts with a clean accumulator.
        self.representation = ""
        return result

    def group(self):
        """This formatter reports ``True`` for grouping."""
        return True

    def __str__(self):
        return "out-no-stat"
class AllFormatter(Formatter):
    """Formatter emitting one token per component and no right-hand columns."""

    def header_repeat(self):
        """Return the four per-token column names."""
        columns = ["Token_ID", "Word_form", "Lemma", "Msd"]
        return columns

    def header_right(self):
        """Return an empty list: there are no right-hand columns."""
        return []

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return id, text, lemma and msd of the word at ``idx``.

        ``_representations`` and ``_sidx`` are accepted for interface
        compatibility and are not used.
        """
        token = words[idx]
        row = [token.id, token.text, token.lemma, token.msd]
        return row

    def content_right(self, _freq):
        """Return an empty list; ``_freq`` is not used."""
        return []

    def group(self):
        """This formatter reports ``False`` for grouping."""
        return False

    def __str__(self):
        return "all"
class StatsFormatter(Formatter):
    """Formatter emitting per-word distribution and per-match statistics.

    ``set_structure`` must be called before ``new_match``, and ``new_match``
    before ``content_right``: each step reads state stored by the previous
    one (``jppb``/``corew`` and ``stats`` respectively).
    """

    def additional_init(self):
        """Initialise all cached state to ``None``."""
        self.stats = None
        self.jppb = None
        self.corew = None

    @staticmethod
    def stat_str(num):
        """Render floats with five decimals, everything else via ``str``."""
        if isinstance(num, float):
            return "{:.5f}".format(num)
        return str(num)

    def set_structure(self, structure):
        """Record the component indices used by the statistics.

        ``jppb`` receives the indices of components typed
        ``ComponentType.Core2w`` (exactly two are required), ``corew`` the
        indices of every component whose type is not ``ComponentType.Other``.
        """
        pair_indices = []
        core_indices = []
        for component in structure.components:
            if component.type == ComponentType.Core2w:
                pair_indices.append(component.idx)
            if component.type != ComponentType.Other:
                core_indices.append(component.idx)

        assert len(pair_indices) == 2
        self.jppb = tuple(pair_indices)
        self.corew = tuple(core_indices)

    def new_match(self, match):
        """Compute and cache the statistics for ``match`` in ``self.stats``."""
        self.stats = {"freq": {}}

        for cid in self.corew:
            if cid in match.matches[0]:
                word = match.matches[0][cid]
                word_freq = self.word_renderer.num_words(word.lemma, word.msd[0])
            else:
                # Component absent from the first match entry.
                word_freq = 0
            self.stats["freq"][cid] = word_freq

        freq_first = self.stats["freq"][self.jppb[0]]
        freq_second = self.stats["freq"][self.jppb[1]]
        match_count = len(match)
        total_words = self.word_renderer.num_all_words()

        # Values reported under the "Delta_p12" / "Delta_p21" headers.
        self.stats['d12'] = (
            match_count / freq_first
            - (freq_second - match_count) / (total_words - freq_first)
        )
        self.stats['d21'] = (
            match_count / freq_second
            - (freq_first - match_count) / (total_words - freq_second)
        )
        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = match_count

    def header_repeat(self):
        """Return the single per-component column name."""
        return ["Distribution"]

    def header_right(self):
        """Return the five right-hand column names."""
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the distribution cell for component ``idx``.

        Components outside ``corew`` get empty cells; for the others the
        value is looked up in ``colocation_ids.dispersions`` under the key
        ``(sidx, idx, lemma)``. ``representations`` is not used.
        """
        if idx in self.corew:
            lookup_key = (sidx, idx, words[idx].lemma)
            dispersion = self.colocation_ids.dispersions[lookup_key]
            return [self.stat_str(dispersion)]

        # not a core word
        return [""] * self.length()

    def content_right(self, freq):
        """Return the five right-hand statistic cells as strings.

        The ``freq`` argument is ignored; the count cached by ``new_match``
        under ``'freq_all'`` is used instead.
        """
        word_freqs = self.stats["freq"]
        freq_first = word_freqs[self.jppb[0]]
        freq_second = word_freqs[self.jppb[1]]
        match_count = self.stats['freq_all']

        logdice_core = 14 + log2(2 * match_count / (freq_first + freq_second))

        # Only core words with a positive frequency enter the "all" variant.
        core_freqs = [self.stats["freq"][idx] for idx in self.corew]
        positive_freqs = [value for value in core_freqs if value > 0]
        logdice_all = 14 + log2(len(positive_freqs) * match_count / sum(positive_freqs))

        cells = (
            self.stats["d12"],
            self.stats["d21"],
            logdice_core,
            logdice_all,
            self.stats['df'],
        )
        return [self.stat_str(cell) for cell in cells]

    def group(self):
        """This formatter reports ``True`` for grouping."""
        return True

    def __str__(self):
        return "stat"
class OutFormatter(Formatter):
    """Composite formatter: ``OutNoStatFormatter`` columns followed by
    ``StatsFormatter`` columns."""

    def additional_init(self):
        """Create both sub-formatters with this formatter's collaborators."""
        shared_args = (self.colocation_ids, self.word_renderer)
        self.f1 = OutNoStatFormatter(*shared_args)
        self.f2 = StatsFormatter(*shared_args)

    def set_structure(self, structure):
        """Forward the structure to the statistics sub-formatter only."""
        self.f2.set_structure(structure)

    def new_match(self, match):
        """Forward the match to the statistics sub-formatter only."""
        self.f2.new_match(match)

    def header_repeat(self):
        """Return the repeated headers of ``f1`` followed by those of ``f2``."""
        left = self.f1.header_repeat()
        right = self.f2.header_repeat()
        return left + right

    def header_right(self):
        """Return the right-hand headers of ``f1`` followed by those of ``f2``."""
        left = self.f1.header_right()
        right = self.f2.header_right()
        return left + right

    def content_repeat(self, words, representations, idx, sidx):
        """Return the repeated cells of ``f1`` followed by those of ``f2``."""
        left = self.f1.content_repeat(words, representations, idx, sidx)
        right = self.f2.content_repeat(words, representations, idx, sidx)
        return left + right

    def content_right(self, freq):
        """Return the right-hand cells of ``f1`` followed by those of ``f2``."""
        left = self.f1.content_right(freq)
        right = self.f2.content_right(freq)
        return left + right

    def group(self):
        """Return ``f1.group() and f2.group()``; ``f2`` is only consulted
        when ``f1`` reports a truthy value."""
        first = self.f1.group()
        if not first:
            return first
        return self.f2.group()

    def __str__(self):
        return "out"