Does not yet work, agreements in representation

2019-05-20 18:14:11 +02:00
parent 5bd0b4a064
commit dce55d04a3
1 changed files with 97 additions and 56 deletions
@@ -132,6 +132,11 @@ class Rendition(Enum):
    Lexis = 2
    Unknown = 3
 class WordFormSelection(Enum):
    # Strategy used to decide which matched word forms are collected for a
    # word-form representation. The value is stored as the first element of
    # the (selection, payload) tuple built from a feature's 'selection'
    # attribute.

    # selection == "all": every matched form is collected (payload is None).
    All = 0
    # selection == "msd": a form is collected only if it passes check_msd
    # against the selector dict carried as payload.
    Msd = 1
    # selection == "agreement": a form is collected only if it passes
    # check_agreement against another component of the same match; the
    # payload is (other component id, list of attribute names).
    Agreement = 2
 class Order(Enum):
    FromTo = 0
    ToFrom = 1
@@ -192,9 +197,15 @@ class ComponentRendition:
        elif 'selection' in feature:
            if feature['selection'] == "msd":
                selectors = {k: v for k, v in feature.items() if k != 'selection'}
-                self._set_more(selectors)
+                self._set_more((WordFormSelection.Msd, selectors))
            elif feature['selection'] == "all":
-                self._set_more("all")
+                self._set_more((WordFormSelection.All, None))
            elif feature['selection'] == 'agreement':
                assert(feature['head'][:4] == 'cid_')
                assert(feature['msd'] is not None)
                self._set_more((WordFormSelection.Agreement,
                    (feature['head'][4:], feature['msd'].split('+'))))
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))
@@ -211,13 +222,49 @@ class ComponentRendition:
        def render_all(lst):
            """Join the distinct forms in ``lst`` with '/'.

            Duplicates are dropped. The forms are sorted so that the rendered
            string is identical from run to run; plain set iteration order
            depends on string hash randomisation.
            """
            return "/".join(sorted(set(lst)))
-        def render_form(_lst):
+        def render_form(lst):
-            return ":("
+            # find most frequent
            return max(set(lst), key=lst.count)
-        for words, agreement in matches:
+        def check_msd(word, selectors):
-            if not agreement:
+            for key, value in selectors.items():
                t = word.msd[0]
                v = TAGSET[t].index(key.lower())
                f1 = word.msd[v + 1]
                f2 = CODES[value]
                if '-' not in [f1, f2] and f1 != f2:
                    return False
            return True
        def check_agreement(w1, w2, agreements):
            """Return True unless w1 and w2 conflict on one of ``agreements``.

            Args:
                w1: word whose ``msd`` string is compared.
                w2: the word that w1 has to agree with.
                agreements: attribute names to compare. Each name is located
                    with ``TAGSET[<first msd character>].index(name)``, so an
                    unknown name raises ValueError.

            Returns:
                False as soon as both words carry a concrete value for an
                attribute and the values differ, otherwise True. An attribute
                that lies beyond the end of a word's msd, or whose value is
                '-' on either side, never blocks agreement.
            """
            def attribute_value(word, attribute):
                # msd[0] selects the TAGSET entry and is not itself listed in
                # it, hence the +1 when turning a TAGSET position into an msd
                # position.
                position = TAGSET[word.msd[0]].index(attribute) + 1
                # msd too short: the attribute is not specified for this word
                # (the original note mentions the infinitive as such a case).
                if position >= len(word.msd):
                    return None
                return word.msd[position]

            for agr_case in agreements:
                m1 = attribute_value(w1, agr_case)
                if m1 is None:
                    # unspecified on w1: always agrees, w2 is not inspected
                    continue

                m2 = attribute_value(w2, agr_case)
                if m2 is None:
                    continue

                # '-' on either side means "no value", which cannot conflict
                if '-' not in (m1, m2) and m1 != m2:
                    return False

            return True
        for words in matches:
            for w_id, w in words.items():
                component = structure.get_component(w_id)
                rep = component.representation
@@ -234,24 +281,47 @@ class ComponentRendition:
                # it HAS to be word_form now
                else:
                    wf_type, more = rep.more
                    # set correct type first
                    if type(representations[w_id][1]) is str:
                        representations[w_id] = (
-                            [], render_all if rep.more == "all" else render_form
+                            [], render_all if wf_type is WordFormSelection.All else render_form
                        )
                    if wf_type is WordFormSelection.All:
                        add = True
                    elif wf_type is WordFormSelection.Msd:
                        add = check_msd(w, more)
                    else:
                        assert(wf_type is WordFormSelection.Agreement)
                        other_w, agreements = more
                        add = check_agreement(w, words[other_w], agreements)
                    if add:
                        representations[w_id][0].append(w.text)
        doprint = matches[0]['1'].text.startswith('evrop')
        # just need to set representation to first group...
-        for w_id, w in matches[0][0].items():
+        for w_id, w in matches[0].items():
            data = representations[w_id]
            if doprint:
                print(data)
            if type(data[1]) is str:
                w.representation_failed = data[0]
                w.representation = w.lemma if w.representation_failed else data[1]
            else:
-                w.representation_failed = len(data[0]) > 0
+                w.representation_failed = len(data[0]) == 0
                w.representation = w.lemma if w.representation_failed else data[1](data[0])
            if doprint:
                print(w.representation_failed, w.representation)
        if doprint:
            print('--')
    def __str__(self):
        return str(self.rendition)
@@ -563,7 +633,6 @@ class SyntacticStructure:
    def __init__(self):
        self.id = None
        self.lbs = None
        self.agreements = []
        self.components = []
    @staticmethod
@@ -611,38 +680,19 @@ class SyntacticStructure:
            assert(el.tag == "feature")
            if 'rendition' in el.attrib:
                forms[n].append(el)
            elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
                forms[n].append(el)
            elif 'selection' in el.attrib:
-                self.add_agreement(n, el)
+                forms[n].append(el)
            else:
                logging.warning("Strange representation feature in structure {}. Skipping"
                        .format(self.id))
                continue
    def add_agreement(self, n, el):
        assert(el.get('head')[:4] == 'cid_')
        n1 = n
        n2 = el.get('head')[4:]
        agreement_str = el.get('msd')
        assert(agreement_str is not None)
        self.agreements.append({
            'n1': n1,
            'n2': n2,
            'match': agreement_str.split('+')})
    def __str__(self):
        comp_str = "\n".join(str(comp) for comp in self.components)
        agrs = "\n".join("({} -[{}]- {}) ".format(
            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
        links_str = "\n".join(self.components[0].tree())
-        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
+        return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
-                self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
+                self.id, self.lbs, comp_str, links_str, "-" * 40)
    def get_component(self, idx):
        for c in self.components:
@@ -695,21 +745,10 @@ class SyntacticStructure:
    def match(self, word):
        matches = self.components[0].match(word)
-        if matches is None:
+        return [] if matches is None else matches
            return []
-        to_ret = []
+        # for m in matches:
-        for m in matches:
+        #     to_ret.append((m, self.check_agreements(m)))
            # if not self.check_agreements(m):
            #     bad = "Agreement"
            # elif not self.check_form(m):
            #     bad = "Form"
            # else:
            #     bad = "OK"
            to_ret.append((m, self.check_agreements(m)))
        return to_ret
 def build_structures(filename):
@@ -898,11 +937,11 @@ class Writer:
    def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
        rows = []
-        for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
+        for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
            to_write = []
            representation = ""
-            for idx, comp in enumerate(components):
+            for idx, _comp in enumerate(components):
                idx = str(idx + 1)
                word = m[idx] if idx in m else None
                to_write.extend(self.from_word(word))
@@ -978,21 +1017,23 @@ class ColocationIds:
    def add_matches(self, matches):
        for sid, nms in matches.items():
            for nm in nms:
-                self._add_match(nm[2], sid, (nm[0], nm[1]))
+                self._add_match(nm[1], sid, nm[0])
    def get_matches_for(self, structure_id, group):
        for _cid_tup, (cid, cid_matches, sid) in self.data.items():
            if sid != structure_id:
                continue
-            for words, reason in cid_matches:
+            for words in cid_matches:
-                yield (cid, words, reason, len(cid_matches))
+                yield (cid, words, len(cid_matches))
                if group:
                    break
    def set_representations(self, structures):
        """Compute representations for every stored group of matches.

        Args:
            structures: iterable of structure objects; each one is indexed by
                its ``id`` attribute.

        Raises:
            KeyError: if a stored entry refers to a structure id that is not
                among ``structures``.
        """
        structures_by_id = {structure.id: structure for structure in structures}

        # Only the values of self.data are needed here; each value is a
        # 3-tuple whose last two items are the matches and the structure id.
        for _cid, cid_matches, sid in self.data.values():
            ComponentRendition.set_representations(cid_matches, structures_by_id[sid])
@@ -1004,12 +1045,12 @@ def match_file(words, structures):
        for w in words:
            mhere = s.match(w)
            logging.debug("  GOT: {}".format(len(mhere)))
-            for match, reason in mhere: 
+            for match in mhere: 
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
                colocation_id = tuple(colocation_id)
-                matches[s.id].append((match, reason, colocation_id))
+                matches[s.id].append((match, colocation_id))
    return matches