luscenje_struktur/src/representation.py

187 lines
5.9 KiB
Python

import logging
from collections import Counter
from codes_tagset import TAGSET, CODES
from word import WordMsdOnly
class ComponentRepresentation:
def __init__(self, data, word_renderer):
self.data = data
self.word_renderer = word_renderer
self.words = []
self.rendition_text = None
self.agreement = []
def get_agreement(self):
return []
def add_word(self, word):
self.words.append(word)
def render(self):
if self.rendition_text is None:
self.rendition_text = self._render()
def _render(self):
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
class LemmaCR(ComponentRepresentation):
def _render(self):
return self.words[0].lemma if len(self.words) > 0 else None
class LexisCR(ComponentRepresentation):
def _render(self):
return self.data['lexis']
class WordFormAllCR(ComponentRepresentation):
def _render(self):
if len(self.words) == 0:
return None
else:
forms = [w.text.lower() for w in self.words]
return "/".join(set(forms))
class WordFormAnyCR(ComponentRepresentation):
def _render(self):
text_forms = {}
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
text_forms[(msd, lemma)] = text
words_counter = []
for word in self.words:
words_counter.append((word.msd, word.lemma))
sorted_words = sorted(
set(words_counter), key=lambda x: -words_counter.count(x) + (sum(ord(l) for l in x[1]) / 1e5 if x[1] is not None else .5))
# so lets got through all words, sorted by frequency
for word_msd, word_lemma in sorted_words:
# check if agreements match
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
# if we are at the last "backup word", then confirm matches
# that worked for this one and return
if word_lemma is None:
for agr, matched in zip(self.agreement, agreements_matched):
if matched:
agr.confirm_match()
return None
# if all agreements match, we win!
if all(agreements_matched):
for agr in self.agreement:
agr.confirm_match()
return text_forms[(word_msd, word_lemma)]
class WordFormMsdCR(WordFormAnyCR):
def __init__(self, *args):
super().__init__(*args)
self.lemma = None
self.msds = []
def msd(self):
return self.msds[0]
def check_msd(self, word_msd):
if 'msd' not in self.data:
return True
selectors = self.data['msd']
for key, value in selectors.items():
t = word_msd[0]
v = TAGSET[t].index(key.lower())
f1 = word_msd[v + 1]
f2 = CODES[value]
if '-' not in [f1, f2] and f1 != f2:
return False
return True
def add_word(self, word):
if self.lemma is None:
self.lemma = word.lemma
self.msds.append(word.msd)
if self.check_msd(word.msd):
super().add_word(word)
def _render(self):
self.words.append(WordMsdOnly(self._common_msd()))
return super()._render()
def _common_msd(self):
msds = sorted(self.msds, key=len)
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
else msds[0][idx] for idx in range(len(msds[0]))]
common_msd = "".join(common_msd)
iommon_msd = "".join(common_msd)
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
class WordFormAgreementCR(WordFormMsdCR):
def __init__(self, data, word_renderer):
super().__init__(data, word_renderer)
self.rendition_candidate = None
def get_agreement(self):
return self.data['other']
def match(self, word_msd):
existing = [(w.msd, w.text) for w in self.words]
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
for candidate_msd, candidate_text in lemma_available_words:
if self.msd()[0] != candidate_msd[0]:
continue
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
if self.check_msd(candidate_msd):
self.rendition_candidate = candidate_text
return True
return False
def confirm_match(self):
self.rendition_text = self.rendition_candidate
@staticmethod
def check_agreement(msd1, msd2, agreements):
for agr_case in agreements:
t1 = msd1[0]
# if not in msd, some strange msd was tries, skipping...
if agr_case not in TAGSET[t1]:
logging.warning("Cannot do agreement: {} for msd {} not found!"
.format(agr_case, msd1))
return False
v1 = TAGSET[t1].index(agr_case)
# if none specified: nedolocnik, always agrees
if v1 + 1 >= len(msd1):
continue
# first is uppercase, not in TAGSET
m1 = msd1[v1 + 1]
# REPEAT (not DRY!)
t2 = msd2[0]
if agr_case not in TAGSET[t2]:
logging.warning("Cannot do agreement: {} for msd {} not found!"
.format(agr_case, msd2))
return False
v2 = TAGSET[t2].index(agr_case)
if v2 + 1 >= len(msd2):
continue
m2 = msd2[v2 + 1]
# match!
if '-' not in [m1, m2] and m1 != m2:
return False
return True
def render(self):
pass