Does not yet work: agreements handled in representation

2019-05-20 18:14:11 +02:00
parent 5bd0b4a064
commit dce55d04a3
1 changed files with 97 additions and 56 deletions
@@ -132,6 +132,11 @@ class Rendition(Enum):
    Lexis = 2
    Unknown = 3

+class WordFormSelection(Enum):
+    All = 0
+    Msd = 1
+    Agreement = 2
+
 class Order(Enum):
    FromTo = 0
    ToFrom = 1
@@ -192,9 +197,15 @@ class ComponentRendition:
        elif 'selection' in feature:
            if feature['selection'] == "msd":
                selectors = {k: v for k, v in feature.items() if k != 'selection'}
-                self._set_more(selectors)
+                self._set_more((WordFormSelection.Msd, selectors))
            elif feature['selection'] == "all":
-                self._set_more("all")
+                self._set_more((WordFormSelection.All, None))
+            elif feature['selection'] == 'agreement':
+                assert(feature['head'][:4] == 'cid_')
+                assert(feature['msd'] is not None)
+
+                self._set_more((WordFormSelection.Agreement,
+                    (feature['head'][4:], feature['msd'].split('+'))))
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))

@@ -211,13 +222,49 @@ class ComponentRendition:
        def render_all(lst):
            return "/".join(set(lst))
        
-        def render_form(_lst):
-            return ":("
+        def render_form(lst):
+            # find most frequent
+            return max(set(lst), key=lst.count)
        
-        for words, agreement in matches:
-            if not agreement:
-                continue
+        def check_msd(word, selectors):
+            for key, value in selectors.items():
+                t = word.msd[0]
+                v = TAGSET[t].index(key.lower())
+                f1 = word.msd[v + 1]
+                f2 = CODES[value]

+                if '-' not in [f1, f2] and f1 != f2:
+                    return False
+
+            return True
+        
+        def check_agreement(w1, w2, agreements):
+            for agr_case in agreements:
+                t1 = w1.msd[0]
+                v1 = TAGSET[t1].index(agr_case)
+                assert(v1 >= 0)
+                # if none specified (e.g. infinitive, "nedolocnik"): always agrees
+                if v1 + 1 >= len(w1.msd): 
+                    continue 
+                # + 1 because msd[0] is the uppercase category letter, which is not in TAGSET
+                m1 = w1.msd[v1 + 1]
+
+                # REPEAT (not DRY!)
+                t2 = w2.msd[0]
+                v2 = TAGSET[t2].index(agr_case)
+                assert(v2 >= 0)
+                if v2 + 1 >= len(w2.msd): 
+                    continue 
+                m2 = w2.msd[v2 + 1]
+
+                # match!
+                if '-' not in [m1, m2] and m1 != m2:
+                    return False
+
+            return True
+
+
+        for words in matches:
            for w_id, w in words.items():
                component = structure.get_component(w_id)
                rep = component.representation
@@ -234,24 +281,47 @@ class ComponentRendition:
                
                # it HAS to be word_form now
                else:
+                    wf_type, more = rep.more
+
                    # set correct type first
                    if type(representations[w_id][1]) is str:
                        representations[w_id] = (
-                            [], render_all if rep.more == "all" else render_form
+                            [], render_all if wf_type is WordFormSelection.All else render_form
                        )
-                    representations[w_id][0].append(w.text)
+                    
+                    if wf_type is WordFormSelection.All:
+                        add = True
+                    elif wf_type is WordFormSelection.Msd:
+                        add = check_msd(w, more)
+                    else:
+                        assert(wf_type is WordFormSelection.Agreement)
+                        other_w, agreements = more
+                        add = check_agreement(w, words[other_w], agreements)
+
+                    if add:
+                        representations[w_id][0].append(w.text)
+
+        doprint = matches[0]['1'].text.startswith('evrop')

        # just need to set representation to first group...
-        for w_id, w in matches[0][0].items():
+        for w_id, w in matches[0].items():
            data = representations[w_id]
+            if doprint:
+                print(data)

            if type(data[1]) is str:
                w.representation_failed = data[0]
                w.representation = w.lemma if w.representation_failed else data[1]
            else:
-                w.representation_failed = len(data[0]) > 0
+                w.representation_failed = len(data[0]) == 0
                w.representation = w.lemma if w.representation_failed else data[1](data[0])
            
+            if doprint:
+                print(w.representation_failed, w.representation)
+        
+        if doprint:
+            print('--')
+   
    def __str__(self):
        return str(self.rendition)

@@ -563,7 +633,6 @@ class SyntacticStructure:
    def __init__(self):
        self.id = None
        self.lbs = None
-        self.agreements = []
        self.components = []

    @staticmethod
@@ -611,38 +680,19 @@ class SyntacticStructure:
            assert(el.tag == "feature")
            if 'rendition' in el.attrib:
                forms[n].append(el)
-            elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
-                forms[n].append(el)
            elif 'selection' in el.attrib:
-                self.add_agreement(n, el)
+                forms[n].append(el)
            else:
                logging.warning("Strange representation feature in structure {}. Skipping"
                        .format(self.id))
                continue

-    def add_agreement(self, n, el):
-        assert(el.get('head')[:4] == 'cid_')
-
-        n1 = n
-        n2 = el.get('head')[4:]
-        agreement_str = el.get('msd')
-        assert(agreement_str is not None)
-
-        self.agreements.append({
-            'n1': n1,
-            'n2': n2,
-            'match': agreement_str.split('+')})
-
    def __str__(self):
        comp_str = "\n".join(str(comp) for comp in self.components)
-
-        agrs = "\n".join("({} -[{}]- {}) ".format(
-            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
-
        links_str = "\n".join(self.components[0].tree())

-        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
-                self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
+        return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
+                self.id, self.lbs, comp_str, links_str, "-" * 40)

    def get_component(self, idx):
        for c in self.components:
@@ -695,21 +745,10 @@ class SyntacticStructure:

    def match(self, word):
        matches = self.components[0].match(word)
-        if matches is None:
-            return []
+        return [] if matches is None else matches
        
-        to_ret = []
-        for m in matches:
-            # if not self.check_agreements(m):
-            #     bad = "Agreement"
-            # elif not self.check_form(m):
-            #     bad = "Form"
-            # else:
-            #     bad = "OK"
-
-            to_ret.append((m, self.check_agreements(m)))
-
-        return to_ret
+        # for m in matches:
+        #     to_ret.append((m, self.check_agreements(m)))


 def build_structures(filename):
@@ -898,11 +937,11 @@ class Writer:
    def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
        rows = []

-        for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
+        for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
            to_write = []
            representation = ""

-            for idx, comp in enumerate(components):
+            for idx, _comp in enumerate(components):
                idx = str(idx + 1)
                word = m[idx] if idx in m else None
                to_write.extend(self.from_word(word))
@@ -978,21 +1017,23 @@ class ColocationIds:
    def add_matches(self, matches):
        for sid, nms in matches.items():
            for nm in nms:
-                self._add_match(nm[2], sid, (nm[0], nm[1]))
+                self._add_match(nm[1], sid, nm[0])
    
    def get_matches_for(self, structure_id, group):
        for _cid_tup, (cid, cid_matches, sid) in self.data.items():
            if sid != structure_id:
                continue

-            for words, reason in cid_matches:
-                yield (cid, words, reason, len(cid_matches))
+            for words in cid_matches:
+                yield (cid, words, len(cid_matches))
                if group:
                    break

    def set_representations(self, structures):
        components_dict = {structure.id: structure for structure in structures}
        for _1, (_2, cid_matches, sid) in self.data.items():
+            if _2 == '1309':
+                a = 1
            ComponentRendition.set_representations(cid_matches, components_dict[sid])


@@ -1004,12 +1045,12 @@ def match_file(words, structures):
        for w in words:
            mhere = s.match(w)
            logging.debug("  GOT: {}".format(len(mhere)))
-            for match, reason in mhere: 
+            for match in mhere: 
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
                colocation_id = tuple(colocation_id)

-                matches[s.id].append((match, reason, colocation_id))
+                matches[s.id].append((match, colocation_id))

    return matches