2019-06-15 16:55:35 +00:00
|
|
|
import logging
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
from codes_tagset import TAGSET, CODES
|
|
|
|
from word import WordMsdOnly
|
|
|
|
|
|
|
|
class ComponentRepresentation:
    """Base class for the textual representation of one component.

    Subclasses supply the actual text by overriding ``_render``; this base
    class only stores the collected words and caches the rendered text.
    """

    def __init__(self, data, word_renderer):
        """Store the configuration and start with no words collected.

        data: representation settings; subclasses in this file index it
            like a mapping (e.g. ``data['lexis']``).
        word_renderer: helper object kept for subclasses to use.
        """
        self.data = data
        self.word_renderer = word_renderer

        # Words collected through add_word().
        self.words = []
        # Cached result of _render(); None until render() has stored one.
        self.rendition_text = None
        # Agreement objects consulted by some subclasses while rendering.
        self.agreement = []

    def get_agreement(self):
        """Return the agreements this representation asks for (none here)."""
        return []

    def add_word(self, word):
        """Remember ``word`` for later rendering."""
        self.words.append(word)

    def render(self):
        """Compute ``rendition_text`` once, unless it is already set.

        Returns nothing. A ``_render`` result of None is not treated as
        cached, so the next call will invoke ``_render`` again.
        """
        if self.rendition_text is not None:
            return
        self.rendition_text = self._render()

    def _render(self):
        """Produce the text; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Not implemented for class: {}".format(type(self))
        raise NotImplementedError(message)
|
|
|
|
|
|
|
|
class LemmaCR(ComponentRepresentation):
    """Representation rendered as the lemma of the first collected word."""

    def _render(self):
        """Return the first word's lemma, or None when no word was added."""
        if not self.words:
            return None
        first_word = self.words[0]
        return first_word.lemma
|
|
|
|
|
|
|
|
class LexisCR(ComponentRepresentation):
    """Representation rendered as a fixed value taken from ``data``."""

    def _render(self):
        """Return ``data['lexis']``; the collected words are not consulted.

        Raises:
            KeyError: if ``data`` has no ``'lexis'`` entry.
        """
        lexis = self.data['lexis']
        return lexis
|
|
|
|
|
|
|
|
class WordFormAllCR(ComponentRepresentation):
    """Representation listing every distinct lower-cased form that was added."""

    def _render(self):
        """Return the distinct lower-cased word texts joined with "/".

        Returns None when no word has been added. Each form appears once,
        in the order in which it was first added, so the same input always
        produces the same string (joining a ``set`` directly would make the
        order depend on string hash randomization).
        """
        if not self.words:
            return None

        seen = set()
        unique_forms = []
        for word in self.words:
            form = word.text.lower()
            if form not in seen:
                seen.add(form)
                unique_forms.append(form)
        return "/".join(unique_forms)
|
|
|
|
|
|
|
|
class WordFormAnyCR(ComponentRepresentation):
    """Representation rendered as one of the word forms that were added.

    Candidates are tried from the most frequent (msd, lemma) pair to the
    least frequent; the first one accepted by every agreement is used.
    """

    def _render(self):
        """Pick the text of the best (msd, lemma) pair among the added words.

        Returns:
            The chosen word text, or None when the pair with ``lemma is None``
            is reached first or when no candidate satisfies all agreements.

        Side effects:
            Calls ``match`` on every object in ``self.agreement`` for each
            candidate tried, and ``confirm_match`` on those that are accepted.
        """
        # For each (msd, lemma) keep its most frequent text: most_common() is
        # walked backwards so the most frequent triplet is written last.
        text_forms = {}
        triplet_counts = Counter((w.msd, w.lemma, w.text) for w in self.words)
        for (msd, lemma, text), _n in reversed(triplet_counts.most_common()):
            text_forms[(msd, lemma)] = text

        # Count each pair once up front instead of calling list.count() from
        # inside the sort key.
        pair_counts = Counter((w.msd, w.lemma) for w in self.words)

        def sort_key(pair):
            # Most frequent first. The small lemma-derived term separates
            # pairs with equal counts; a pair without a lemma gets +.5, which
            # places it after lemma-bearing pairs of the same count.
            lemma = pair[1]
            if lemma is None:
                tie_break = .5
            else:
                tie_break = sum(ord(ch) for ch in lemma) / 1e5
            return -pair_counts[pair] + tie_break

        # Sorting the Counter keys (first-seen order) rather than a set keeps
        # the result reproducible when two pairs get exactly the same key.
        sorted_words = sorted(pair_counts, key=sort_key)

        # so let's go through all words, sorted by frequency
        for word_msd, word_lemma in sorted_words:
            # check if agreements match
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]

            # if we are at the last "backup word", then confirm matches
            # that worked for this one and return
            # NOTE(review): the backup word is presumably the WordMsdOnly
            # entry appended by WordFormMsdCR._render - confirm it has
            # lemma None.
            if word_lemma is None:
                for agr, matched in zip(self.agreement, agreements_matched):
                    if matched:
                        agr.confirm_match()
                return None

            # if all agreements match, we win!
            if all(agreements_matched):
                for agr in self.agreement:
                    agr.confirm_match()

                return text_forms[(word_msd, word_lemma)]

        # No candidate satisfied every agreement.
        return None
|
2019-06-15 16:55:35 +00:00
|
|
|
|
2019-07-01 15:21:28 +00:00
|
|
|
|
2019-06-15 16:55:35 +00:00
|
|
|
class WordFormMsdCR(WordFormAnyCR):
    """Word-form representation restricted by msd selectors from ``data``.

    Every added word contributes its msd to ``msds``, but only words that
    pass ``check_msd`` are kept as rendering candidates.
    """

    def __init__(self, *args):
        """Forward ``args`` to the base class and reset lemma/msd tracking."""
        super().__init__(*args)
        # Lemma of the first word passed to add_word().
        self.lemma = None
        # msd of every word passed to add_word(), accepted or not.
        self.msds = []

    def msd(self):
        """Return the msd of the first added word.

        Raises:
            IndexError: if no word has been added yet.
        """
        return self.msds[0]

    def check_msd(self, word_msd):
        """Tell whether ``word_msd`` satisfies the selectors in ``data['msd']``.

        Returns True when ``data`` has no ``'msd'`` entry. Otherwise each
        selector (attribute name -> value name) is compared with the
        corresponding character of ``word_msd``; a "-" on either side counts
        as a match.
        """
        if 'msd' not in self.data:
            return True
        selectors = self.data['msd']

        for key, value in selectors.items():
            # First msd character selects the attribute list in TAGSET.
            category = word_msd[0]
            position = TAGSET[category].index(key.lower())
            # +1 because index 0 of the msd holds the category itself.
            # NOTE(review): unlike check_agreement in WordFormAgreementCR,
            # there is no guard here for an msd shorter than position + 1.
            actual = word_msd[position + 1]
            expected = CODES[value]

            if '-' not in [actual, expected] and actual != expected:
                return False

        return True

    def add_word(self, word):
        """Record ``word``'s msd and keep the word if it passes ``check_msd``."""
        if self.lemma is None:
            self.lemma = word.lemma

        self.msds.append(word.msd)
        if self.check_msd(word.msd):
            super().add_word(word)

    def _render(self):
        """Append a fallback built from the common msd, then render as the base.

        Note that each call appends one more ``WordMsdOnly`` to ``self.words``.
        """
        self.words.append(WordMsdOnly(self._common_msd()))
        return super()._render()

    def _common_msd(self):
        """Merge all recorded msds into one and pass it to the word renderer.

        The merged msd is as long as the shortest recorded msd; a position
        keeps its character when every recorded msd agrees there and
        becomes "-" otherwise.

        Returns:
            Whatever ``word_renderer.common_lemma_msd(lemma, merged)`` returns.

        Raises:
            IndexError: if no msd has been recorded.
        """
        # Shortest first, so indexing the others by its positions is safe.
        msds = sorted(self.msds, key=len)
        shortest, others = msds[0], msds[1:]
        common_msd = "".join(
            char if all(other[idx] == char for other in others) else "-"
            for idx, char in enumerate(shortest))
        return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
|
|
|
|
2019-06-15 16:55:35 +00:00
|
|
|
|
|
|
|
class WordFormAgreementCR(WordFormMsdCR):
    """Word-form representation whose form must agree with another word's msd.

    The text is not produced by ``render``; ``match`` proposes a candidate
    and ``confirm_match`` turns it into ``rendition_text``.
    """

    def __init__(self, data, word_renderer):
        """Initialise the base class and start without a candidate text."""
        super().__init__(data, word_renderer)
        # Text proposed by the latest successful match().
        self.rendition_candidate = None

    def get_agreement(self):
        """Return ``data['other']``.

        NOTE(review): presumably identifies the component to agree with;
        confirm against the caller.
        """
        return self.data['other']

    def match(self, word_msd):
        """Look for a form of this lemma that agrees with ``word_msd``.

        Candidates come from ``word_renderer.available_words``. A candidate
        is accepted when it has the same first msd character as this
        representation's first recorded msd, agrees with ``word_msd`` on
        every attribute in ``data['agreement']`` and passes ``check_msd``.

        Returns:
            True if a candidate was found (its text is stored in
            ``rendition_candidate``), otherwise False.
        """
        existing = [(w.msd, w.text) for w in self.words]

        lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
        for candidate_msd, candidate_text in lemma_available_words:
            # Only forms of the same category (first msd character) qualify.
            if self.msd()[0] != candidate_msd[0]:
                continue

            agrees = WordFormAgreementCR.check_agreement(
                word_msd, candidate_msd, self.data['agreement'])
            if agrees and self.check_msd(candidate_msd):
                self.rendition_candidate = candidate_text
                return True

        return False

    def confirm_match(self):
        """Promote the last candidate found by ``match`` to ``rendition_text``."""
        self.rendition_text = self.rendition_candidate

    @staticmethod
    def _agreement_value(msd, agr_case):
        """Look up the character ``msd`` carries for attribute ``agr_case``.

        Returns a ``(known, value)`` pair:
            * ``(False, None)`` - the msd's category does not define the
              attribute; a warning is logged.
            * ``(True, None)`` - the attribute is defined but the msd is too
              short to specify it.
            * ``(True, char)`` - the character found in the msd.
        """
        attributes = TAGSET[msd[0]]
        # if not in msd, some strange msd was tried, skipping...
        if agr_case not in attributes:
            logging.warning("Cannot do agreement: {} for msd {} not found!"
                            .format(agr_case, msd))
            return False, None

        # +1: the first msd character is the (uppercase) category, which is
        # not part of the TAGSET attribute list.
        position = attributes.index(agr_case) + 1
        if position >= len(msd):
            return True, None
        return True, msd[position]

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        """Tell whether two msds agree on every attribute in ``agreements``.

        For each attribute, ``msd1`` is examined before ``msd2``:
            * an attribute unknown to either msd's category -> False
              (a warning is logged);
            * an msd too short to specify the attribute (e.g. an
              infinitive) -> that attribute always agrees;
            * "-" on either side -> agrees;
            * otherwise the two characters must be equal.

        Returns:
            True if no attribute disagrees, otherwise False.
        """
        for agr_case in agreements:
            known1, m1 = WordFormAgreementCR._agreement_value(msd1, agr_case)
            if not known1:
                return False
            # if none specified: always agrees
            if m1 is None:
                continue

            known2, m2 = WordFormAgreementCR._agreement_value(msd2, agr_case)
            if not known2:
                return False
            if m2 is None:
                continue

            # match!
            if '-' not in [m1, m2] and m1 != m2:
                return False

        return True

    def render(self):
        """Do nothing: this class sets ``rendition_text`` in ``confirm_match``."""
        pass
|