You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
luscenje_struktur/luscenje_struktur/formatter.py

228 lines
7.4 KiB

from math import log2
import re
import logging
from luscenje_struktur.component import ComponentType
class Formatter:
    """Base interface shared by all output formatters.

    The constructor stores ``colocation_ids`` and ``word_renderer`` and then
    calls :meth:`additional_init`, which is the hook subclasses override to
    set up their own state instead of overriding ``__init__``.

    Subclasses must implement ``header_repeat``, ``header_right``,
    ``content_repeat``, ``content_right`` and ``group``; the versions here
    raise ``NotImplementedError``.  ``set_structure`` and ``new_match`` are
    optional hooks that do nothing by default.
    """

    def __init__(self, colocation_ids, word_renderer):
        self.colocation_ids = colocation_ids
        self.word_renderer = word_renderer
        self.additional_init()

    def header_repeat(self):
        """Return the list of header cells for the per-word columns."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the list of header cells for the trailing columns."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the per-word cells for the word stored under ``idx``."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq, variable_word_order=None):
        """Return the trailing cells for the current match.

        ``variable_word_order`` is optional so that every formatter can be
        called the same way; concrete formatters that do not need it simply
        ignore it.
        """
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return the formatter's grouping flag (a boolean in subclasses)."""
        raise NotImplementedError("Group for formatter not implemented")

    def additional_init(self):
        """Hook called at the end of ``__init__``; no-op by default."""
        pass

    def length(self):
        """Return the number of per-word columns, i.e. ``len(header_repeat())``."""
        return len(self.header_repeat())

    def set_structure(self, structure):
        """Hook receiving a structure; no-op by default."""
        pass

    def new_match(self, match):
        """Hook receiving a match; no-op by default."""
        pass
class OutNoStatFormatter(Formatter):
    """Formatter that emits lemmas and representative forms, without statistics."""

    def additional_init(self):
        # idx -> text representing that word in the current match.  Filled by
        # content_repeat() and consumed, then reset, by content_right().
        self.representation = {}

    def header_repeat(self):
        """Return the four per-word column headers."""
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]

    def header_right(self):
        """Return the three trailing column headers."""
        return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return the four per-word cells for ``words[idx]``.

        ``representations[idx]`` is unpacked as ``(text, msd)``.  When the
        text is ``None`` the lemma is used instead and the scenario cell is
        ``"lemma_fallback"``; otherwise the scenario cell is ``"ok"``.  Words
        that have no entry in ``representations`` get empty cells and are not
        recorded for the joint representative form.
        """
        word = words[idx]
        if idx not in representations:
            # Emit one cell per header_repeat() column so the cells that
            # follow this word stay aligned with their headers.
            return [word.lemma, "", "", ""]

        rep_text, rep_msd = representations[idx]
        if rep_text is None:
            self.representation[idx] = word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]

        self.representation[idx] = rep_text
        return [word.lemma, rep_text, rep_msd, "ok"]

    def content_right(self, freq, variable_word_order=None):
        """Return ``[fixed-order form, variable-order form, str(freq)]``.

        The fixed order is the sorted order of the recorded indices.  When
        ``variable_word_order`` is ``None`` the fixed order is used for both
        cells.  Indices without a recorded representation are skipped.  The
        recorded representations are cleared before returning.
        """
        fixed_word_order = sorted(self.representation.keys())
        if variable_word_order is None:
            variable_word_order = fixed_word_order

        rep_fixed_word_order = ' '.join(
            self.representation[o] for o in fixed_word_order if o in self.representation
        )
        rep_variable_word_order = ' '.join(
            self.representation[o] for o in variable_word_order if o in self.representation
        )

        result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
        # Reset so the next match starts from an empty mapping.
        self.representation = {}
        return result

    def group(self):
        return True

    def __str__(self):
        return "out-no-stat"
class AllFormatter(Formatter):
    """Formatter that emits the token id, word form, lemma and msd of each word."""

    def header_repeat(self):
        """Return the four per-word column headers."""
        return ["Token_ID", "Word_form", "Lemma", "Msd"]

    def header_right(self):
        """This formatter has no trailing columns."""
        return []

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return ``[id, text, lemma, msd]`` read from ``words[idx]``."""
        token = words[idx]
        return [getattr(token, field) for field in ("id", "text", "lemma", "msd")]

    def content_right(self, _freq, variable_word_order=None):
        """Return no trailing cells; both arguments are ignored."""
        return []

    def group(self):
        return False

    def __str__(self):
        return "all"
class StatsFormatter(Formatter):
    """Formatter that emits dispersion and association statistics.

    ``set_structure`` must be called before ``new_match``, and ``new_match``
    before ``content_right``: each step reads state stored by the previous one.
    """

    # Alternative lemma tried when no dispersion is stored for the word's own
    # lemma.  Presumably these are the Slovene preposition variants k/h and
    # s/z -- TODO confirm.  Any other lemma falls back to the empty string.
    _LEMMA_VARIANTS = {'k': 'h', 'h': 'k', 's': 'z', 'z': 's'}

    def additional_init(self):
        self.stats = None   # per-match statistics, filled by new_match()
        self.jppb = None    # indices of the two Core2w components
        self.corew = None   # indices of every component whose type is not Other

    @staticmethod
    def stat_str(num):
        """Format floats with five decimals; everything else via ``str``."""
        return "{:.5f}".format(num) if isinstance(num, float) else str(num)

    def set_structure(self, structure):
        """Record which component indices of ``structure`` are used for statistics.

        Raises ``AssertionError`` unless exactly two components have type
        ``ComponentType.Core2w``.
        """
        jppb = []
        corew = []

        for component in structure.components:
            if component.type == ComponentType.Core2w:
                jppb.append(component.idx)
            if component.type != ComponentType.Other:
                corew.append(component.idx)

        assert len(jppb) == 2, "expected exactly two Core2w components"

        self.jppb = tuple(jppb)
        self.corew = tuple(corew)

    def new_match(self, match):
        """Compute and store the statistics for ``match`` in ``self.stats``."""
        self.stats = {"freq": {}}

        for cid in self.corew:
            if cid not in match.matches[0]:
                freq = 0
            else:
                word = match.matches[0][cid]
                # Frequency is looked up by lemma and the first msd character.
                freq = self.word_renderer.num_words(word.lemma, word.msd[0])

            self.stats["freq"][cid] = freq

        fx = self.stats["freq"][self.jppb[0]]
        fy = self.stats["freq"][self.jppb[1]]
        freq = len(match)
        N = self.word_renderer.num_all_words()

        # NOTE(review): a zero frequency for either Core2w component (or
        # N == fx / N == fy) makes these divisions fail; not handled here.
        self.stats['d12'] = freq / fx - (fy - freq) / (N - fx)
        self.stats['d21'] = freq / fy - (fx - freq) / (N - fy)

        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = freq

    def header_repeat(self):
        """Return the single per-word column header."""
        return ["Distribution"]

    def header_right(self):
        """Return the five trailing column headers."""
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the dispersion cell for ``words[idx]``.

        Words whose index is not in ``self.corew`` get empty cells.  The
        dispersion is looked up under ``(sidx, idx, lemma)``; if missing, one
        alternative lemma is tried.  If that is missing too, ``1`` is used.
        ``representations`` is not used.
        """
        # not a core word
        if idx not in self.corew:
            return [""] * self.length()

        word = words[idx]
        dispersions = self.colocation_ids.dispersions
        key = (sidx, idx, word.lemma)

        # try to fix missing dispersions
        if key not in dispersions:
            new_key = (sidx, idx, self._LEMMA_VARIANTS.get(word.lemma, ''))
            if new_key in dispersions:
                key = new_key
                logging.info('Dispersions fixed.')
            else:
                logging.info('Dispersions not fixed.')

        distribution = dispersions[key] if key in dispersions else 1
        return [self.stat_str(distribution)]

    def content_right(self, freq, variable_word_order=None):
        """Return the five trailing statistic cells as strings.

        Both arguments are ignored: the frequency stored by ``new_match`` is
        used, and ``variable_word_order`` is accepted only so this formatter
        can be called the same way as the other formatters.
        """
        fx = self.stats["freq"][self.jppb[0]]
        fy = self.stats["freq"][self.jppb[1]]
        freq = self.stats['freq_all']
        logdice_core = 14 + log2(2 * freq / (fx + fy))

        # Components with zero frequency are left out of the all-core variant.
        fi = [self.stats["freq"][idx] for idx in self.corew]
        fi = [f for f in fi if f > 0]
        logdice_all = 14 + log2(len(fi) * freq / sum(fi))

        return [self.stat_str(x) for x in (
            self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df']
        )]

    def group(self):
        return True

    def __str__(self):
        return "stat"
class OutFormatter(Formatter):
    """Composite formatter: OutNoStatFormatter columns followed by StatsFormatter columns."""

    def additional_init(self):
        # Both sub-formatters share this formatter's collaborators.
        shared_args = (self.colocation_ids, self.word_renderer)
        self.f1 = OutNoStatFormatter(*shared_args)
        self.f2 = StatsFormatter(*shared_args)

    def header_repeat(self):
        """Return the per-word headers of ``f1`` followed by those of ``f2``."""
        form_headers = self.f1.header_repeat()
        stat_headers = self.f2.header_repeat()
        return form_headers + stat_headers

    def header_right(self):
        """Return the trailing headers of ``f1`` followed by those of ``f2``."""
        form_headers = self.f1.header_right()
        stat_headers = self.f2.header_right()
        return form_headers + stat_headers

    def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
        """Return the per-word cells of ``f1`` followed by those of ``f2``.

        ``variable_word_order`` is accepted but not used.
        """
        form_cells = self.f1.content_repeat(words, representations, idx, sidx)
        stat_cells = self.f2.content_repeat(words, representations, idx, sidx)
        return form_cells + stat_cells

    def content_right(self, freq, variable_word_order=None):
        """Return the trailing cells of ``f1`` followed by those of ``f2``.

        Only ``f1`` receives ``variable_word_order``.
        """
        form_cells = self.f1.content_right(freq, variable_word_order)
        stat_cells = self.f2.content_right(freq)
        return form_cells + stat_cells

    def group(self):
        """Return ``f1.group() and f2.group()``."""
        first = self.f1.group()
        return self.f2.group() if first else first

    def set_structure(self, structure):
        """Forward the structure to the statistics formatter only."""
        self.f2.set_structure(structure)

    def new_match(self, match):
        """Forward the match to the statistics formatter only."""
        self.f2.new_match(match)

    def __str__(self):
        return "out"