import logging
from collections import Counter

from codes_tagset import TAGSET, CODES
from word import WordMsdOnly


class ComponentRepresentation:
    """Base class that turns the words collected for one component into text.

    Subclasses implement ``_render``; ``render`` stores its result in
    ``rendition_text`` the first time it is called.

    Attributes:
        data: mapping with the representation's configuration. Subclasses in
            this module read the keys ``'lexis'``, ``'msd'``, ``'other'`` and
            ``'agreement'`` from it.
        word_renderer: helper object; subclasses call ``get_lemma_msd`` and
            ``available_words`` on it.
        words: words collected through ``add_word``.
        rendition_text: rendered text, ``None`` until rendered.
        agreement: objects exposing ``match(msd)`` and ``confirm_match()``.
            Nothing in this module fills the list (presumably the caller
            wires it up from ``get_agreement()`` -- TODO confirm).
    """

    def __init__(self, data, word_renderer):
        self.data = data
        self.word_renderer = word_renderer
        self.words = []
        self.rendition_text = None
        self.agreement = []

    def get_agreement(self):
        """Return the agreement reference of this representation (none here)."""
        return []

    def add_word(self, word):
        """Collect ``word`` for later rendering."""
        self.words.append(word)

    def render(self):
        """Compute ``rendition_text`` via ``_render`` unless it is already set."""
        if self.rendition_text is None:
            self.rendition_text = self._render()

    def _render(self):
        """Return the rendered text; must be overridden by subclasses."""
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))


class LemmaCR(ComponentRepresentation):
    """Renders the lemma of the first collected word."""

    def _render(self):
        """Return the first word's lemma, or ``None`` when no word was added."""
        return self.words[0].lemma if self.words else None


class LexisCR(ComponentRepresentation):
    """Renders the fixed value stored under ``data['lexis']``."""

    def _render(self):
        """Return ``data['lexis']`` regardless of the collected words."""
        return self.data['lexis']


class WordFormAllCR(ComponentRepresentation):
    """Renders every distinct lower-cased word form, joined with ``/``."""

    def _render(self):
        """Return the distinct lower-cased texts joined by ``/``.

        Returns ``None`` when no word was added. Forms are listed in order of
        first occurrence so the output is identical from run to run (joining
        a plain ``set`` would follow hash order, which is randomised for
        strings).
        """
        if not self.words:
            return None

        # dict keys de-duplicate while keeping insertion order.
        forms = dict.fromkeys(w.text.lower() for w in self.words)
        return "/".join(forms)


class WordFormAnyCR(ComponentRepresentation):
    """Renders the text of the most frequent word that satisfies all agreements."""

    def _render(self):
        """Return the text of the most frequent agreeing (msd, lemma) pair.

        Pairs are tried from most to least frequent; ties are tried in order
        of first occurrence. For the first pair whose msd is accepted by every
        object in ``self.agreement``, ``confirm_match()`` is called on each of
        them and the most frequent text seen for that pair is returned.

        Returns ``None`` when the accepted pair has no lemma or when no pair
        is accepted.
        """
        # Most frequent text for each (msd, lemma): iterate from least to most
        # common so that the most common triplet is written last and wins.
        triplets = Counter((w.msd, w.lemma, w.text) for w in self.words)
        text_forms = {}
        for (msd, lemma, text), _n in reversed(triplets.most_common()):
            text_forms[(msd, lemma)] = text

        # Counter counts in one pass; most_common() keeps first-seen order for
        # equal counts, which makes the choice between ties deterministic.
        pair_counts = Counter((w.msd, w.lemma) for w in self.words)

        for (word_msd, word_lemma), _n in pair_counts.most_common():
            # all() stops at the first agreement that rejects the msd, so
            # match() is called on exactly the same objects as a loop with
            # an early break would call it on.
            if not all(agr.match(word_msd) for agr in self.agreement):
                continue

            for agr in self.agreement:
                agr.confirm_match()

            if word_lemma is None:
                return None
            return text_forms[(word_msd, word_lemma)]

        return None


class WordFormMsdCR(WordFormAnyCR):
    """``WordFormAnyCR`` that only collects words passing the ``data['msd']`` filter."""

    def __init__(self, *args):
        super().__init__(*args)
        # Lemma and msd of the first word offered to add_word(), recorded
        # even when that word is then rejected by the msd filter.
        self.lemma = None
        self.msd = None

    def check_msd(self, word_msd):
        """Return whether ``word_msd`` satisfies the selectors in ``data['msd']``.

        Args:
            word_msd: msd tag. Its first character selects the feature list
                in ``TAGSET``; the feature at index ``i`` of that list is
                stored at position ``i + 1`` of the tag.

        Returns:
            ``True`` when ``data`` has no ``'msd'`` key or when no selector is
            contradicted. A ``'-'`` on either side counts as compatible, and
            so does a tag too short to carry the feature (the same rule
            ``WordFormAgreementCR.check_agreement`` applies to short tags).

        Raises:
            ValueError: if a selector names a feature that is not in the
                feature list of the tag's category.
            KeyError: if the category or a selector value is unknown to
                ``TAGSET`` / ``CODES``.
        """
        if 'msd' not in self.data:
            return True

        for key, value in self.data['msd'].items():
            position = TAGSET[word_msd[0]].index(key.lower()) + 1
            wanted = CODES[value]

            # Tag too short to hold this feature: treat it as unspecified
            # instead of failing with IndexError.
            if position >= len(word_msd):
                continue

            actual = word_msd[position]
            if '-' not in [actual, wanted] and actual != wanted:
                return False

        return True

    def add_word(self, word):
        """Record the first word's lemma/msd and collect ``word`` if it passes ``check_msd``."""
        if self.lemma is None:
            self.lemma = word.lemma
            self.msd = word.msd

        if self.check_msd(word.msd):
            super().add_word(word)

    def _render(self):
        """Add a msd-only word obtained from the renderer, then render as ``WordFormAnyCR``.

        The extra word is appended straight to ``self.words`` and therefore
        is not filtered by ``check_msd``.
        """
        msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
        self.words.append(WordMsdOnly(msd))

        return super()._render()


class WordFormAgreementCR(WordFormMsdCR):
    """Representation whose text is chosen so that it agrees with another word's msd.

    ``match`` proposes a candidate text and ``confirm_match`` commits it to
    ``rendition_text``; ``render`` itself does nothing.
    """

    # Marker returned by _agreement_feature for "tag too short, feature not given".
    _UNSPECIFIED = object()

    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        # Text found by the last successful match(); becomes the final
        # rendition only once confirm_match() is called.
        self.rendition_candidate = None

    def get_agreement(self):
        """Return ``data['other']``."""
        return self.data['other']

    def match(self, word_msd):
        """Look for an available word form that agrees with ``word_msd``.

        Candidates come from ``word_renderer.available_words`` for this
        representation's lemma. A candidate is accepted when its category
        (first msd character) equals that of ``self.msd``, it agrees with
        ``word_msd`` on every feature in ``data['agreement']``, and it passes
        ``check_msd``. The first accepted candidate's text is stored in
        ``rendition_candidate``.

        Returns:
            ``True`` if a candidate was accepted, ``False`` otherwise.
        """
        existing = [(w.msd, w.text) for w in self.words]
        candidates = self.word_renderer.available_words(self.lemma, existing)

        for candidate_msd, candidate_text in candidates:
            if self.msd[0] != candidate_msd[0]:
                continue

            agrees = WordFormAgreementCR.check_agreement(
                word_msd, candidate_msd, self.data['agreement'])
            if agrees and self.check_msd(candidate_msd):
                self.rendition_candidate = candidate_text
                return True

        return False

    def confirm_match(self):
        """Commit the candidate found by ``match`` as the final rendition."""
        self.rendition_text = self.rendition_candidate

    @staticmethod
    def _agreement_feature(msd, agr_case):
        """Look up the value of feature ``agr_case`` in ``msd``.

        Returns:
            ``(False, None)`` when the tag's category has no such feature
            (a warning is logged); ``(True, _UNSPECIFIED)`` when the tag is
            too short to carry the feature; ``(True, value)`` otherwise.
        """
        features = TAGSET[msd[0]]
        if agr_case not in features:
            logging.warning("Cannot do agreement: {} for msd {} not found!"
                            .format(agr_case, msd))
            return False, None

        # Position 0 of the tag is the category letter, which is not listed
        # in TAGSET, hence the + 1.
        position = features.index(agr_case) + 1
        if position >= len(msd):
            return True, WordFormAgreementCR._UNSPECIFIED
        return True, msd[position]

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        """Return whether ``msd1`` and ``msd2`` agree on every feature in ``agreements``.

        For each feature, in order:
          * if the category of either tag does not define the feature, a
            warning is logged and the result is ``False``;
          * if either tag is too short to carry the feature, it is treated
            as unspecified and the feature is skipped;
          * otherwise the two values must be equal unless one of them is
            ``'-'``.

        ``msd2`` is only inspected for a feature once ``msd1`` turned out to
        define and carry it.
        """
        unspecified = WordFormAgreementCR._UNSPECIFIED

        for agr_case in agreements:
            known, m1 = WordFormAgreementCR._agreement_feature(msd1, agr_case)
            if not known:
                return False
            if m1 is unspecified:
                continue

            known, m2 = WordFormAgreementCR._agreement_feature(msd2, agr_case)
            if not known:
                return False
            if m2 is unspecified:
                continue

            if '-' not in [m1, m2] and m1 != m2:
                return False

        return True

    def render(self):
        """Do nothing.

        Overrides the base ``render`` so that ``_render`` is never invoked for
        this class; ``rendition_text`` is assigned by ``confirm_match``.
        """
        pass