From 3c669c79015200ad9ecea3af751e24c6684af3f7 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Thu, 23 May 2019 08:13:29 +0200 Subject: [PATCH] looking for agreements from the whole corpus --- wani.py | 89 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/wani.py b/wani.py index 3511de4..60d8547 100644 --- a/wani.py +++ b/wani.py @@ -222,30 +222,44 @@ class ComponentRendition: c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] for c in structure.components } - representations_to_check = [] + found_agreements = {} word_component_id = {} # doprint = structure.id == '1' and matches[0]['1'].text.startswith('evrop') and matches[0]['2'].text.startswith('prv') doprint = False - def render_all(lst): - return "/".join([w.text for w in set(lst)]) + def render_all(component_id, lst): + matches.representations[component_id] = "/".join([w.text for w in set(lst)]) - def render_form(lst): + def render_form(component_id, lst): sorted_lst = sorted(set(lst), key=lst.count) for word in sorted_lst: - othw = are_agreements_ok(word, representations_to_check) - if othw is not None: - matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer) - matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer) - return + if component_id in found_agreements: + other_component_id, other_word, agreements = found_agreements[component_id] + print(word.lemma, other_word.lemma, component_id, other_component_id, word.msd, word.msd) + agr = are_agreements_ok(word.msd, other_word.lemma, other_word.msd, agreements) + if agr is None: + continue + matches.representations[other_component_id] = agr + + matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer) + break + + # othw = are_agreements_ok(word, found_agreements) + # if othw is not None: + # matches.representations[word_component_id[othw.id]] = 
othw.most_frequent_text(word_renderer) + # return - def are_agreements_ok(word, words_to_try): - for w_id, other_word, agreements in words_to_try: - if check_agreement(word, other_word, agreements): + def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements): + for w2_msd, w2_txt in word_renderer.available_words(ow_lemma): + if ow_msd[0] != w2_msd[0]: + continue + + print(w1_msd, w2_msd) + if check_agreement(w1_msd, w2_msd, agreements): if doprint: print("GOOD :)") - return other_word + return w2_txt def check_msd(word, selectors): for key, value in selectors.items(): @@ -259,27 +273,32 @@ class ComponentRendition: return True - def check_agreement(w1, w2, agreements): - if doprint: - print("CHECK", w1.text, w1, w2.text, w2) - + def check_agreement(msd1, msd2, agreements): for agr_case in agreements: - t1 = w1.msd[0] + t1 = msd1[0] + # if not in msd, some strange msd was tried, skipping... + if agr_case not in TAGSET[t1]: + logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) + print("BAAAD") + return False + v1 = TAGSET[t1].index(agr_case) - assert(v1 >= 0) # if none specified: nedolocnik, always agrees - if v1 + 1 >= len(w1.msd): + if v1 + 1 >= len(msd1): continue # first is uppercase, not in TAGSET - m1 = w1.msd[v1 + 1] + m1 = msd1[v1 + 1] # REPEAT (not DRY!) - t2 = w2.msd[0] + t2 = msd2[0] + if agr_case not in TAGSET[t2]: + logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) + print("BAAAD") + return False v2 = TAGSET[t2].index(agr_case) - assert(v2 >= 0) - if v2 + 1 >= len(w2.msd): + if v2 + 1 >= len(msd2): continue - m2 = w2.msd[v2 + 1] + m2 = msd2[v2 + 1] # match! 
if '-' not in [m1, m2] and m1 != m2: @@ -321,18 +340,18 @@ class ComponentRendition: else: assert(wf_type is WordFormSelection.Agreement) other_w, agreements = more - representations_to_check.append((other_w, w, agreements)) + found_agreements[other_w] = (w_id, w.lemma, agreements) add = True - func = lambda x: None + func = lambda *x: None if add: representations[w_id][0].append(w) representations[w_id][1] = func if doprint: - print(len(matches), len(representations_to_check)) + print(len(matches), len(found_agreements)) - # for w1i, w2i, agreements in representations_to_check: + # for w1i, w2i, agreements in found_agreements: # w1, w2 = words[w1i], words[w2i] # if doprint: # print("? ", w1.msd, w2.msd, end="") @@ -368,7 +387,7 @@ class ComponentRendition: elif len(data[0]) == 0: matches.representations[w_id] = None else: - data[1](data[0]) + data[1](str(w_id), data[0]) if doprint: print(matches.representations) @@ -882,6 +901,11 @@ class WordMsdRenderer: if lemma in self.rendered_words: if msd in self.rendered_words[lemma]: return self.rendered_words[lemma][msd] + + def available_words(self, lemma): + if lemma in self.rendered_words: + for msd in self.rendered_words[lemma].keys(): + yield (msd, self.rendered_words[lemma][msd]) def is_root_id(id_): return len(id_.split('.')) == 3 @@ -928,7 +952,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status): if lfrom in words: if not skip_id_check and is_root_id(lfrom): - logging.error("NOO: ", lfrom) + logging.error("NOO: {}".format(lfrom)) sys.exit(1) if dest in words: @@ -986,7 +1010,7 @@ class Writer: elif self.all: return [word.id, word.text, word.lemma, word.msd] else: - print("1", word) + # print("1", word) if representation is None: return [word.lemma, word.lemma, "lemma_fallback"] else: @@ -1021,7 +1045,6 @@ class Writer: for idx, _comp in enumerate(components): idx = str(idx + 1) word = m[idx] if idx in m else None - print(rprsnt) rep = rprsnt[idx] if idx in rprsnt else None 
to_write.extend(self.from_word(word, rep)) representation += " " + to_write[-2]