From 3c669c79015200ad9ecea3af751e24c6684af3f7 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Thu, 23 May 2019 08:13:29 +0200
Subject: [PATCH] looking for agreements from the whole corpus

---
 wani.py | 89 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 33 deletions(-)

diff --git a/wani.py b/wani.py
index 3511de4..60d8547 100644
--- a/wani.py
+++ b/wani.py
@@ -222,30 +222,44 @@ class ComponentRendition:
             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
             for c in structure.components
         }
-        representations_to_check = []
+        found_agreements = {}
         word_component_id = {}
 
         # doprint = structure.id == '1' and matches[0]['1'].text.startswith('evrop') and matches[0]['2'].text.startswith('prv')
         doprint = False
 
-        def render_all(lst):
-            return "/".join([w.text for w in set(lst)])
+        def render_all(component_id, lst):
+            matches.representations[component_id] = "/".join([w.text for w in set(lst)])
         
-        def render_form(lst):
+        def render_form(component_id, lst):
             sorted_lst = sorted(set(lst), key=lst.count)
             for word in sorted_lst:
-                othw = are_agreements_ok(word, representations_to_check)
-                if othw is not None:
-                    matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
-                    matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
-                    return
+                if component_id in found_agreements:
+                    other_component_id, other_word, agreements = found_agreements[component_id]
+                    print(word.lemma, other_word.lemma, component_id, other_component_id, word.msd, word.msd)
+                    agr = are_agreements_ok(word.msd, other_word.lemma, other_word.msd, agreements)
+                    if agr is None:
+                        continue
+                    matches.representations[other_component_id] = agr
+
+                matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
+                break
+                
+                # othw = are_agreements_ok(word, found_agreements)
+                # if othw is not None:
+                #     matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
+                #     return
         
-        def are_agreements_ok(word, words_to_try):
-            for w_id, other_word, agreements in words_to_try:
-                if check_agreement(word, other_word, agreements):
+        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements):
+            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma):
+                if ow_msd[0] != w2_msd[0]:
+                    continue
+
+                print(w1_msd, w2_msd)
+                if check_agreement(w1_msd, w2_msd, agreements):
                     if doprint:
                         print("GOOD :)")
-                    return other_word
+                    return w2_txt
 
         def check_msd(word, selectors):
             for key, value in selectors.items():
@@ -259,27 +273,32 @@ class ComponentRendition:
 
             return True
         
-        def check_agreement(w1, w2, agreements):
-            if doprint:
-                print("CHECK", w1.text, w1, w2.text, w2)
-
+        def check_agreement(msd1, msd2, agreements):
             for agr_case in agreements:
-                t1 = w1.msd[0]
+                t1 = msd1[0]
+                # if not in TAGSET for this word type, some strange msd was tried, skipping...
+                if agr_case not in TAGSET[t1]:
+                    logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
+                    print("BAAAD")
+                    return False
+
                 v1 = TAGSET[t1].index(agr_case)
-                assert(v1 >= 0)
                 # if none specified: nedolocnik, always agrees
-                if v1 + 1 >= len(w1.msd): 
+                if v1 + 1 >= len(msd1): 
                     continue 
                 # first is uppercase, not in TAGSET
-                m1 = w1.msd[v1 + 1]
+                m1 = msd1[v1 + 1]
 
                 # REPEAT (not DRY!)
-                t2 = w2.msd[0]
+                t2 = msd2[0]
+                if agr_case not in TAGSET[t2]:
+                    logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
+                    print("BAAAD")
+                    return False
                 v2 = TAGSET[t2].index(agr_case)
-                assert(v2 >= 0)
-                if v2 + 1 >= len(w2.msd): 
+                if v2 + 1 >= len(msd2): 
                     continue 
-                m2 = w2.msd[v2 + 1]
+                m2 = msd2[v2 + 1]
 
                 # match!
                 if '-' not in [m1, m2] and m1 != m2:
@@ -321,18 +340,18 @@ class ComponentRendition:
                     else:
                         assert(wf_type is WordFormSelection.Agreement)
                         other_w, agreements = more
-                        representations_to_check.append((other_w, w, agreements))
+                        found_agreements[other_w] = (w_id, w.lemma, agreements)
                         add = True
-                        func = lambda x: None
+                        func = lambda *x: None
 
                     if add:
                         representations[w_id][0].append(w)
                         representations[w_id][1] = func
 
         if doprint:
-            print(len(matches), len(representations_to_check))
+            print(len(matches), len(found_agreements))
 
-        # for w1i, w2i, agreements in representations_to_check:
+        # for w1i, w2i, agreements in found_agreements:
         #     w1, w2 = words[w1i], words[w2i]
         #     if doprint:
         #         print("? ", w1.msd, w2.msd, end="")
@@ -368,7 +387,7 @@ class ComponentRendition:
             elif len(data[0]) == 0:
                 matches.representations[w_id] = None
             else:
-                data[1](data[0])
+                data[1](str(w_id), data[0])
             
         if doprint:
             print(matches.representations)
@@ -882,6 +901,11 @@ class WordMsdRenderer:
         if lemma in self.rendered_words:
             if msd in self.rendered_words[lemma]:
                 return self.rendered_words[lemma][msd]
+    
+    def available_words(self, lemma):
+        if lemma in self.rendered_words:
+            for msd in self.rendered_words[lemma].keys():
+                yield (msd, self.rendered_words[lemma][msd])
 
 def is_root_id(id_):
     return len(id_.split('.')) == 3
@@ -928,7 +952,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
 
         if lfrom in words:
             if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: ", lfrom)
+                logging.error("NOO: {}".format(lfrom))
                 sys.exit(1)
 
             if dest in words:
@@ -986,7 +1010,7 @@ class Writer:
         elif self.all:
             return [word.id, word.text, word.lemma, word.msd]
         else:
-            print("1", word)
+            # print("1", word)
             if representation is None:
                 return [word.lemma, word.lemma, "lemma_fallback"]
             else:
@@ -1021,7 +1045,6 @@ class Writer:
             for idx, _comp in enumerate(components):
                 idx = str(idx + 1)
                 word = m[idx] if idx in m else None
-                print(rprsnt)
                 rep = rprsnt[idx] if idx in rprsnt else None
                 to_write.extend(self.from_word(word, rep))
                 representation += " " + to_write[-2]