From ed83b2b9c4da67fba7d284bcdb6047d2b54e0969 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Sat, 1 Jun 2019 10:36:28 +0200 Subject: [PATCH] implementing multiple agreements to one cid. --- wani.py | 68 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/wani.py b/wani.py index d12b09c..ab99345 100644 --- a/wani.py +++ b/wani.py @@ -17,7 +17,6 @@ from tqdm import tqdm MAX_NUM_COMPONENTS = 5 - CODES = { "Noun": "N", "Verb": "V", @@ -166,10 +165,10 @@ class ComponentRepresentation: self.words = [] self.rendition_text = None - self.agreement = None + self.agreement = [] def get_agreement(self): - return None + return [] def add_word(self, word): self.words.append(word) @@ -210,12 +209,17 @@ class WordFormAnyCR(ComponentRepresentation): sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x)) for word_msd, word_lemma in sorted_words: - if self.agreement is not None: - if self.agreement.match(word_msd): - if word_lemma is None: - return None - else: - return text_forms[(word_msd, word_lemma)] + for agr in self.agreement: + if not agr.match(word_msd): + break + else: + for agr in self.agreement: + agr.confirm_match() + + if word_lemma is None: + return None + else: + return text_forms[(word_msd, word_lemma)] class WordFormMsdCR(WordFormAnyCR): def __init__(self, *args): @@ -253,6 +257,7 @@ class WordFormAgreementCR(ComponentRepresentation): def __init__(self, data, word_renderer): super().__init__(data, word_renderer) self.agree_with, self.data = self.data + self.rendition_candidate = None def get_agreement(self): return self.agree_with @@ -269,10 +274,13 @@ class WordFormAgreementCR(ComponentRepresentation): continue if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, agreements): - self.rendition_text = candidate_text + self.rendition_candidate = candidate_text return True return False + + def confirm_match(self): + self.rendition_text = self.rendition_candidate @staticmethod def check_agreement(msd1, msd2, agreements): @@ -361,20 +369,17 @@ class ComponentRendition: for cid, reps in representations.items(): for rep in reps: - agr = rep.get_agreement() - if agr is None: - continue + for agr in rep.get_agreement(): + if len(representations[agr]) != 1: + n = len(representations[agr]) + raise NotImplementedError( + "Structure {}: ".format(structure.id) + + "component {} has agreement".format(cid) + + " with component {}".format(agr) + + ", however there are {} (!= 1) representations".format(n) + + " of component {}!".format(agr)) - if len(representations[agr]) != 1: - n = len(representations[agr]) - raise NotImplementedError( - "Structure {}: ".format(structure.id) + - "component {} has agreement".format(cid) + - " with component {}".format(agr) + - ", however there are {} (!= 1) representations".format(n) + - " of component {}!".format(agr)) - - representations[agr][0].agreement = rep + representations[agr][0].agreement.append(rep) # representations = { # c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] @@ -494,6 +499,7 @@ class ComponentRendition: # if add: # representations[w_id][0].append(w) + for cid, reps in representations.items(): for rep in reps: rep.render() @@ -1178,16 +1184,17 @@ class Writer: def length(self): return 4 if self.all else 3 - def from_word(self, word, representation): + def from_word(self, word, representation, rep_exists): if word is None: return [""] * self.length() elif self.all: return [word.id, word.text, word.lemma, word.msd] + elif not rep_exists: + return [word.lemma, "", ""] + elif representation is None: + return [word.lemma, word.lemma, "lemma_fallback"] else: - if representation is None: - return [word.lemma, word.lemma, "lemma_fallback"] - else: - return [word.lemma, representation, "ok"] + return [word.lemma, representation, "ok"] def sorted_rows(self, rows): if self.sort_by < 0 or len(rows) < 2: @@ -1218,8 +1225,9 @@ class Writer: for idx, _comp in enumerate(components): idx = str(idx + 1) word = m[idx] if idx in m else None - rep = rprsnt[idx] if idx in rprsnt else None - to_write.extend(self.from_word(word, rep)) + rep_exists = idx in rprsnt + rep = rprsnt[idx] if rep_exists else None + to_write.extend(self.from_word(word, rep, rep_exists)) representation += " " + to_write[-2] # make them equal size