You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
luscenje_struktur/luscenje_struktur/representation.py

223 lines
7.7 KiB

import logging
from collections import Counter
from codes_tagset import TAGSET, CODES
from word import WordMsdOnly
from word import WordDummy
class ComponentRepresentation:
    """Base class for the representation of a single component.

    Instances collect words through ``add_word`` and, on ``render``, store
    the ``(text, msd)`` pair produced by the subclass-provided ``_render``.
    """

    def __init__(self, data, word_renderer):
        """Store ``data`` and ``word_renderer`` and start with empty state."""
        # Nothing collected and nothing rendered yet.
        self.words = []
        self.agreement = []
        self.rendition_text = None
        self.rendition_msd = None
        self.data = data
        self.word_renderer = word_renderer

    def get_agreement(self):
        """Return an empty list: the base class declares no agreements."""
        return []

    def add_word(self, word):
        """Append ``word`` to the list of collected words."""
        self.words.append(word)

    def render(self, sloleks_db=None):
        """Run ``_render`` and store its result unless a text is already set.

        ``sloleks_db`` is passed through to ``_render`` unchanged.
        """
        if self.rendition_text is not None:
            # A rendition text is already stored; keep it.
            return
        text, msd = self._render(sloleks_db=sloleks_db)
        self.rendition_text = text
        self.rendition_msd = msd

    def _render(self, sloleks_db=None):
        """Produce ``(text, msd)``; subclasses must override this.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Not implemented for class: {}".format(type(self))
        raise NotImplementedError(message)
class LemmaCR(ComponentRepresentation):
    """Representation rendered from the first collected word's lemma."""

    def _render(self, sloleks_db=None):
        """Return ``(lemma, msd)`` of the first word, or ``(None, None)``.

        ``sloleks_db`` is accepted for interface compatibility and not used.
        """
        # TODO FIX THIS TO LEMMA MSD
        if not self.words:
            return None, None
        first = self.words[0]
        return first.lemma, first.msd
class LexisCR(ComponentRepresentation):
    """Representation whose text is taken directly from the component data."""

    def _render(self, sloleks_db=None):
        """Return the ``'lexis'`` entry of ``self.data`` with the msd ``'Q'``.

        ``sloleks_db`` is accepted for interface compatibility and not used.
        """
        fixed_text = self.data['lexis']
        return fixed_text, 'Q'
class WordFormAllCR(ComponentRepresentation):
    """Representation listing every distinct form of the collected words."""

    def _render(self, sloleks_db=None):
        """Return all distinct lower-cased texts and all distinct msds.

        Both values are ``"/"``-joined strings. The unique values are sorted
        before joining so the result is reproducible: joining a bare ``set``
        yields an order that can change between interpreter runs because
        string hashing is randomized.

        ``sloleks_db`` is accepted for interface compatibility and not used.

        Returns:
            ``(texts, msds)``, or ``(None, None)`` when no words were added.
        """
        if not self.words:
            return None, None

        unique_forms = sorted({w.text.lower() for w in self.words})
        unique_msds = sorted({w.msd for w in self.words})
        return "/".join(unique_forms), "/".join(unique_msds)
class WordFormAnyCR(ComponentRepresentation):
    """Representation rendered as one word form that satisfies all agreements."""

    def _render(self, sloleks_db=None):
        """Pick the first candidate (msd, lemma) whose agreements all match.

        Candidates are the distinct ``(msd, lemma)`` pairs of ``self.words``,
        ordered by ``-count + offset``. The offset is the sum of the lemma's
        code points divided by 1e5, or a fixed 0.5 when the lemma is ``None``.
        Counts come from a single ``Counter`` instead of calling
        ``list.count`` for every sort-key evaluation. The candidates are
        iterated from that counter rather than from a ``set``, so pairs with
        equal keys keep first-seen order on Python 3.7+ instead of depending
        on set iteration order.

        Args:
            sloleks_db: optional object providing ``get_word_form``; when
                given, it is asked for a form for every agreement that does
                not match the candidate.

        Returns:
            ``(text, msd)`` of the chosen candidate, where ``text`` is the
            most common text seen for that pair, or ``(None, None)`` when the
            lemma-less candidate is reached or nothing matches.
        """
        # Most common text per (msd, lemma): walking from the least to the
        # most common triplet lets the most common one overwrite the others.
        text_forms = {}
        triplet_counts = Counter((w.msd, w.lemma, w.text) for w in self.words)
        for (msd, lemma, text), _n in reversed(triplet_counts.most_common()):
            text_forms[(msd, lemma)] = text

        pair_counts = Counter((w.msd, w.lemma) for w in self.words)

        def sort_key(pair):
            pair_lemma = pair[1]
            if pair_lemma is None:
                offset = .5
            else:
                offset = sum(ord(ch) for ch in pair_lemma) / 1e5
            return -pair_counts[pair] + offset

        sorted_words = sorted(pair_counts, key=sort_key)

        # so let's go through all words, sorted by frequency
        for word_msd, word_lemma in sorted_words:
            # check if agreements match
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]

            # in case all agreements do not match try to get data from sloleks and change properly
            if sloleks_db is not None and not all(agreements_matched):
                for i, agr in enumerate(self.agreement):
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
                        if msd is not None:
                            agr.msds[0] = msd
                            agr.words.append(WordDummy(msd, lemma, text))
                            # when we find element in sloleks automatically add it (no need for second checks, since msd
                            # is tailored to pass tests by default)
                            agr.rendition_candidate = text
                            agr.rendition_msd_candidate = msd
                            agreements_matched[i] = True
                        else:
                            # nothing found for this agreement: stop asking for the remaining ones
                            break

            # if we are at the last "backup word", then confirm matches
            # that worked for this one and return
            if word_lemma is None:
                for agr, matched in zip(self.agreement, agreements_matched):
                    if matched:
                        agr.confirm_match()
                return None, None

            # if all agreements match, we win!
            if all(agreements_matched):
                for agr in self.agreement:
                    agr.confirm_match()
                return text_forms[(word_msd, word_lemma)], word_msd

        return None, None
class WordFormMsdCR(WordFormAnyCR):
    """Word-form representation restricted by the ``'msd'`` selectors in ``data``."""

    def __init__(self, *args):
        """Forward ``args`` to the parent and start with no lemma and no msds."""
        super().__init__(*args)
        self.msds = []
        self.lemma = None

    def msd(self):
        """Return the first recorded msd (IndexError if none was recorded)."""
        return self.msds[0]

    def check_msd(self, word_msd):
        """Tell whether ``word_msd`` satisfies every selector in ``data['msd']``.

        Returns ``True`` when ``data`` has no ``'msd'`` entry. A selector
        fails when ``word_msd`` is too short to hold the selected position,
        or when the character there differs from the expected code and
        neither of the two is ``'-'``.
        """
        if 'msd' not in self.data:
            return True

        for feature, wanted in self.data['msd'].items():
            category = word_msd[0]
            # +1 because position 0 of the msd holds the category letter.
            position = TAGSET[category].index(feature.lower()) + 1
            if position >= len(word_msd):
                return False

            actual = word_msd[position]
            expected = CODES[wanted]
            if actual != expected and '-' not in (actual, expected):
                return False

        return True

    def add_word(self, word):
        """Record the word's msd and keep the word if it passes ``check_msd``.

        The lemma of the first word seen is remembered; the msd is recorded
        even when the word itself is rejected.
        """
        if self.lemma is None:
            self.lemma = word.lemma

        self.msds.append(word.msd)

        if not self.check_msd(word.msd):
            return
        super().add_word(word)

    def _render(self, sloleks_db=None):
        """Add fallback words, then delegate to the parent ``_render``.

        When no word was kept and ``sloleks_db`` is given, a form is requested
        from it and added if one is found. An msd-only backup word built from
        ``_common_msd`` is always appended.
        """
        nothing_collected = len(self.words) == 0
        if nothing_collected and sloleks_db is not None:
            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
            if msd is not None:
                self.words.append(WordDummy(msd, lemma, text))

        backup = WordMsdOnly(self._common_msd())
        self.words.append(backup)

        return super()._render(sloleks_db)

    def _common_msd(self):
        """Merge the recorded msds and pass the result to the word renderer.

        The merged msd is as long as the shortest recorded msd; a position
        keeps its character when all msds agree on it and becomes ``'-'``
        otherwise. The return value is whatever
        ``word_renderer.common_lemma_msd`` returns for the lemma and the
        merged msd.
        """
        by_length = sorted(self.msds, key=len)
        reference = by_length[0]
        others = by_length[1:]

        merged = []
        for idx, ref_char in enumerate(reference):
            agrees = all(other[idx] == ref_char for other in others)
            merged.append(ref_char if agrees else "-")

        return self.word_renderer.common_lemma_msd(self.lemma, "".join(merged))
class WordFormAgreementCR(WordFormMsdCR):
    """Word-form representation that must agree with another word's msd."""

    def __init__(self, data, word_renderer):
        """Initialise the parent and clear the pending rendition candidate."""
        super().__init__(data, word_renderer)
        self.rendition_candidate = None
        self.rendition_msd_candidate = None

    def get_agreement(self):
        """Return the ``'other'`` entry of ``self.data``."""
        return self.data['other']

    def match(self, word_msd):
        """Look for an available form of this lemma that agrees with ``word_msd``.

        Candidates come from ``word_renderer.available_words``. A candidate is
        accepted when its first msd character equals that of ``self.msd()``,
        it agrees with ``word_msd`` on every feature in ``data['agreement']``
        and it passes ``check_msd``. The first accepted candidate is stored
        as the pending rendition.

        Returns:
            ``True`` if a candidate was stored, ``False`` otherwise.
        """
        existing = [(w.msd, w.text) for w in self.words]
        lemma_available_words = self.word_renderer.available_words(self.lemma, existing)

        for candidate_msd, candidate_text in lemma_available_words:
            if self.msd()[0] != candidate_msd[0]:
                continue
            if not WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                continue
            if not self.check_msd(candidate_msd):
                continue

            self.rendition_candidate = candidate_text
            self.rendition_msd_candidate = candidate_msd
            return True

        return False

    def confirm_match(self):
        """Promote the pending candidate to the final rendition."""
        self.rendition_text = self.rendition_candidate
        self.rendition_msd = self.rendition_msd_candidate

    @staticmethod
    def _agreement_feature(msd, agr_case):
        """Look up the character of ``msd`` that encodes ``agr_case``.

        Returns:
            ``(False, None)`` when ``agr_case`` is not listed in the TAGSET
            entry for the msd's first character (a warning is logged);
            ``(True, None)`` when the msd is too short to specify it;
            ``(True, char)`` otherwise.
        """
        attributes = TAGSET[msd[0]]
        # if not in msd, some strange msd was tried, skipping...
        if agr_case not in attributes:
            logging.warning("Cannot do agreement: {} for msd {} not found!"
                            .format(agr_case, msd))
            return False, None

        # +1 because position 0 of the msd holds the category letter.
        position = attributes.index(agr_case) + 1
        if position >= len(msd):
            return True, None
        return True, msd[position]

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        """Tell whether ``msd1`` and ``msd2`` agree on every listed feature.

        The lookup that used to be duplicated for both msds lives in
        ``_agreement_feature``; evaluation order and results are unchanged.
        ``msd1`` is examined first, and ``msd2`` is not looked at for a
        feature that ``msd1`` does not specify.

        Args:
            msd1: first msd string.
            msd2: second msd string.
            agreements: iterable of feature names to compare.

        Returns:
            ``False`` if a feature is unknown for either msd or both msds
            specify it with different characters other than ``'-'``;
            ``True`` otherwise.
        """
        lookup = WordFormAgreementCR._agreement_feature

        for agr_case in agreements:
            known1, m1 = lookup(msd1, agr_case)
            if not known1:
                return False
            # if none specified: nedolocnik, always agrees
            if m1 is None:
                continue

            known2, m2 = lookup(msd2, agr_case)
            if not known2:
                return False
            if m2 is None:
                continue

            # match!
            if '-' not in (m1, m2) and m1 != m2:
                return False

        return True

    def render(self, sloleks_db=None):
        """Do nothing; this class sets its rendition in ``confirm_match``."""
        pass