From dce55d04a3d8634c0de247594a20ea861dc05e41 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Mon, 20 May 2019 18:14:11 +0200
Subject: [PATCH] WIP: agreement-based selection in representation (does not work yet)

---
 wani.py | 153 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 97 insertions(+), 56 deletions(-)

diff --git a/wani.py b/wani.py
index f5d89a6..ef49952 100644
--- a/wani.py
+++ b/wani.py
@@ -132,6 +132,11 @@ class Rendition(Enum):
     Lexis = 2
     Unknown = 3
 
+class WordFormSelection(Enum):
+    All = 0
+    Msd = 1
+    Agreement = 2
+
 class Order(Enum):
     FromTo = 0
     ToFrom = 1
@@ -192,9 +197,15 @@ class ComponentRendition:
         elif 'selection' in feature:
             if feature['selection'] == "msd":
                 selectors = {k: v for k, v in feature.items() if k != 'selection'}
-                self._set_more(selectors)
+                self._set_more((WordFormSelection.Msd, selectors))
             elif feature['selection'] == "all":
-                self._set_more("all")
+                self._set_more((WordFormSelection.All, None))
+            elif feature['selection'] == 'agreement':
+                assert(feature['head'][:4] == 'cid_')
+                assert(feature['msd'] is not None)
+
+                self._set_more((WordFormSelection.Agreement,
+                    (feature['head'][4:], feature['msd'].split('+'))))
             else:
                 raise NotImplementedError("Representation selection: {}".format(feature))
 
@@ -211,13 +222,49 @@ class ComponentRendition:
         def render_all(lst):
             return "/".join(set(lst))
         
-        def render_form(_lst):
-            return ":("
+        def render_form(lst):
+            # find most frequent
+            return max(set(lst), key=lst.count)
+        
+        def check_msd(word, selectors):
+            for key, value in selectors.items():
+                t = word.msd[0]
+                v = TAGSET[t].index(key.lower())
+                f1 = word.msd[v + 1]
+                f2 = CODES[value]
 
-        for words, agreement in matches:
-            if not agreement:
-                continue
+                if '-' not in [f1, f2] and f1 != f2:
+                    return False
+
+            return True
+        
+        def check_agreement(w1, w2, agreements):
+            for agr_case in agreements:
+                t1 = w1.msd[0]
+                v1 = TAGSET[t1].index(agr_case)
+                assert(v1 >= 0)
+                # msd too short to carry this feature (e.g. an infinitive, Slovene "nedolocnik"): treat as agreeing
+                if v1 + 1 >= len(w1.msd): 
+                    continue 
+                # msd[0] is the uppercase category letter, which is not listed in TAGSET, hence the +1 offset
+                m1 = w1.msd[v1 + 1]
+
+                # same lookup repeated for w2 (duplicated code, not DRY; TODO: factor into a helper)
+                t2 = w2.msd[0]
+                v2 = TAGSET[t2].index(agr_case)
+                assert(v2 >= 0)
+                if v2 + 1 >= len(w2.msd): 
+                    continue 
+                m2 = w2.msd[v2 + 1]
 
+                # disagreement only if neither value is '-' (presumably "unspecified") and they differ
+                if '-' not in [m1, m2] and m1 != m2:
+                    return False
+
+            return True
+
+
+        for words in matches:
             for w_id, w in words.items():
                 component = structure.get_component(w_id)
                 rep = component.representation
@@ -234,23 +281,46 @@ class ComponentRendition:
                 
                 # it HAS to be word_form now
                 else:
+                    wf_type, more = rep.more
+
                     # set correct type first
                     if type(representations[w_id][1]) is str:
                         representations[w_id] = (
-                            [], render_all if rep.more == "all" else render_form
+                            [], render_all if wf_type is WordFormSelection.All else render_form
                         )
-                    representations[w_id][0].append(w.text)
+                    
+                    if wf_type is WordFormSelection.All:
+                        add = True
+                    elif wf_type is WordFormSelection.Msd:
+                        add = check_msd(w, more)
+                    else:
+                        assert(wf_type is WordFormSelection.Agreement)
+                        other_w, agreements = more
+                        add = check_agreement(w, words[other_w], agreements)
+
+                    if add:
+                        representations[w_id][0].append(w.text)
+
+        doprint = matches[0]['1'].text.startswith('evrop')
 
         # just need to set representation to first group...
-        for w_id, w in matches[0][0].items():
+        for w_id, w in matches[0].items():
             data = representations[w_id]
+            if doprint:
+                print(data)
 
             if type(data[1]) is str:
                 w.representation_failed = data[0]
                 w.representation = w.lemma if w.representation_failed else data[1]
             else:
-                w.representation_failed = len(data[0]) > 0
+                w.representation_failed = len(data[0]) == 0
                 w.representation = w.lemma if w.representation_failed else data[1](data[0])
+            
+            if doprint:
+                print(w.representation_failed, w.representation)
+        
+        if doprint:
+            print('--')
    
     def __str__(self):
         return str(self.rendition)
@@ -563,7 +633,6 @@ class SyntacticStructure:
     def __init__(self):
         self.id = None
         self.lbs = None
-        self.agreements = []
         self.components = []
 
     @staticmethod
@@ -611,38 +680,19 @@ class SyntacticStructure:
             assert(el.tag == "feature")
             if 'rendition' in el.attrib:
                 forms[n].append(el)
-            elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
-                forms[n].append(el)
             elif 'selection' in el.attrib:
-                self.add_agreement(n, el)
+                forms[n].append(el)
             else:
                 logging.warning("Strange representation feature in structure {}. Skipping"
                         .format(self.id))
                 continue
 
-    def add_agreement(self, n, el):
-        assert(el.get('head')[:4] == 'cid_')
-
-        n1 = n
-        n2 = el.get('head')[4:]
-        agreement_str = el.get('msd')
-        assert(agreement_str is not None)
-
-        self.agreements.append({
-            'n1': n1,
-            'n2': n2,
-            'match': agreement_str.split('+')})
-
     def __str__(self):
         comp_str = "\n".join(str(comp) for comp in self.components)
-
-        agrs = "\n".join("({} -[{}]- {}) ".format(
-            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
-
         links_str = "\n".join(self.components[0].tree())
 
-        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
-                self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
+        return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
+                self.id, self.lbs, comp_str, links_str, "-" * 40)
 
     def get_component(self, idx):
         for c in self.components:
@@ -695,21 +745,10 @@ class SyntacticStructure:
 
     def match(self, word):
         matches = self.components[0].match(word)
-        if matches is None:
-            return []
-
-        to_ret = []
-        for m in matches:
-            # if not self.check_agreements(m):
-            #     bad = "Agreement"
-            # elif not self.check_form(m):
-            #     bad = "Form"
-            # else:
-            #     bad = "OK"
-
-            to_ret.append((m, self.check_agreements(m)))
-
-        return to_ret
+        return [] if matches is None else matches
+        
+        # for m in matches:
+        #     to_ret.append((m, self.check_agreements(m)))
 
 
 def build_structures(filename):
@@ -898,11 +937,11 @@ class Writer:
     def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
         rows = []
 
-        for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
+        for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
             to_write = []
             representation = ""
 
-            for idx, comp in enumerate(components):
+            for idx, _comp in enumerate(components):
                 idx = str(idx + 1)
                 word = m[idx] if idx in m else None
                 to_write.extend(self.from_word(word))
@@ -978,21 +1017,23 @@ class ColocationIds:
     def add_matches(self, matches):
         for sid, nms in matches.items():
             for nm in nms:
-                self._add_match(nm[2], sid, (nm[0], nm[1]))
+                self._add_match(nm[1], sid, nm[0])
     
     def get_matches_for(self, structure_id, group):
         for _cid_tup, (cid, cid_matches, sid) in self.data.items():
             if sid != structure_id:
                 continue
 
-            for words, reason in cid_matches:
-                yield (cid, words, reason, len(cid_matches))
+            for words in cid_matches:
+                yield (cid, words, len(cid_matches))
                 if group:
                     break
 
     def set_representations(self, structures):
         components_dict = {structure.id: structure for structure in structures}
         for _1, (_2, cid_matches, sid) in self.data.items():
+            if _2 == '1309':
+                a = 1
             ComponentRendition.set_representations(cid_matches, components_dict[sid])
 
 
@@ -1004,12 +1045,12 @@ def match_file(words, structures):
         for w in words:
             mhere = s.match(w)
             logging.debug("  GOT: {}".format(len(mhere)))
-            for match, reason in mhere: 
+            for match in mhere: 
                 colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                 colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
                 colocation_id = tuple(colocation_id)
 
-                matches[s.id].append((match, reason, colocation_id))
+                matches[s.id].append((match, colocation_id))
 
     return matches