Refactoring representations. The code is now much nicer, but it is not working yet.

Added: multiple representations per component id
2019-05-30 11:34:31 +02:00
parent 307007218d
commit bfd4d4a747
1 changed files with 345 additions and 185 deletions
@@ -126,19 +126,6 @@ class RestrictionType(Enum):
    Lexis = 1
    MatchAll = 2

-
-class Rendition(Enum):
-    Lemma = 0
-    WordForm = 1
-    Lexis = 2
-    Unknown = 3
-
-class WordFormSelection(Enum):
-    All = 0
-    Msd = 1
-    Agreement = 2
-    Any = 3
-
 class Order(Enum):
    FromTo = 0
    ToFrom = 1
@@ -171,14 +158,163 @@ class Order(Enum):
        else:
            raise NotImplementedError("Should not be here: Order match")

+
+class ComponentRepresentation:
+    """Base class: collects the matched words of one component and renders them lazily."""
+    def __init__(self, data, word_renderer):
+        self.data = data  # representation-specific payload, interpreted by subclasses
+        self.word_renderer = word_renderer
+
+        self.words = []
+        self.rendition_text = None  # cache; stays None until render() produced a value
+        self.agreement = None  # assigned from outside when another representation agrees with this one
+    
+    def get_agreement(self):
+        return None  # the base representation does not agree with any component
+
+    def add_word(self, word):
+        self.words.append(word)
+
+    def render(self):
+        if self.rendition_text is None:
+            self.rendition_text = self._render()
+    
+    def rendition(self):
+        return "" if self.rendition_text is None else self.rendition_text
+
+    def _render(self):
+        raise NotImplementedError("Not implemented for class: {}".format(type(self)))
+
+class LemmaCR(ComponentRepresentation):
+    def _render(self):  # lemma of the first collected word, None when nothing was collected
+        return self.words[0].lemma if self.words else None
+
+class LexisCR(ComponentRepresentation):
+    def _render(self):  # ignores the collected words; the payload itself is the rendition
+        return self.data
+    
+class WordFormAllCR(ComponentRepresentation):
+    def _render(self):
+        # all distinct texts joined by "/"; sorted, because set order is not stable across runs
+        return "/".join(sorted({w.text for w in self.words})) if self.words else None
+
+class WordFormAnyCR(ComponentRepresentation):
+    """Renders the most frequent word form; honours an agreeing representation if one is set."""
+    def _render(self):
+        # most common text per (msd, lemma): reversed() lets the top entry overwrite last
+        text_forms = {}
+        msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
+        for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
+            text_forms[(msd, lemma)] = text
+
+        # candidates ordered from most to least frequent (ties keep insertion order)
+        words_counter = Counter((word.msd, word.lemma) for word in self.words)
+
+        for (word_msd, word_lemma), _n in words_counter.most_common():
+            # with an agreeing representation, skip forms it cannot agree with
+            if self.agreement is not None and not self.agreement.match(word_msd):
+                continue
+            # an entry with lemma None carries no text of its own
+            return None if word_lemma is None else text_forms[(word_msd, word_lemma)]
+        return None
+        
+class WordFormMsdCR(WordFormAnyCR):
+    """Word-form representation that only collects words whose msd fits the selectors in data."""
+    def __init__(self, *args):
+        super().__init__(*args)
+        self.backup_word = None  # lemma-only placeholder, built from the first offered word
+
+    def check_msd(self, word):
+        for key, value in self.data.items():
+            t = word.msd[0]
+            v = TAGSET[t].index(key.lower())
+            f1 = word.msd[v + 1]
+            f2 = CODES[value]
+
+            if '-' not in [f1, f2] and f1 != f2:  # '-' on either side acts as a wildcard
+                return False
+
+        return True
+    
+    def add_word(self, word):
+        if self.backup_word is None:
+            msd = self.word_renderer.get_lemma_msd(word.lemma, word.msd)
+            WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
+            self.backup_word = WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)
+
+        if self.check_msd(word):
+            super().add_word(word)
+    
+    def _render(self):
+        if self.backup_word is not None:  # still None when add_word() was never called
+            self.words.append(self.backup_word)
+        return super()._render()
+
+class WordFormAgreementCR(ComponentRepresentation):
+    """Word form chosen so that it agrees, in the categories in data, with component agree_with."""
+    def __init__(self, data, word_renderer):
+        super().__init__(data, word_renderer)
+        self.agree_with, self.data = self.data
+    
+    def get_agreement(self):
+        return self.agree_with
+    
+    def match(self, word_msd):
+        if not self.words:  # nothing collected for this component, so nothing can agree
+            return False
+
+        word_category = self.words[0].msd[0]
+        word_lemma = self.words[0].lemma
+        existing = [(w.msd, w.text) for w in self.words]
+
+        for candidate_msd, candidate_text in self.word_renderer.available_words(word_lemma, existing):
+            if word_category != candidate_msd[0]:
+                continue
+
+            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data):
+                self.rendition_text = candidate_text
+                return True
+
+        return False
+
+    @staticmethod
+    def check_agreement(msd1, msd2, agreements):
+        for agr_case in agreements:
+            t1 = msd1[0]
+            # if not in msd, some strange msd was tried, skipping...
+            if agr_case not in TAGSET[t1]:
+                logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
+                return False
+
+            v1 = TAGSET[t1].index(agr_case)
+            # if none specified (infinitive): always agrees
+            if v1 + 1 >= len(msd1):
+                continue
+            m1 = msd1[v1 + 1]  # +1: the leading (uppercase) category letter is not in TAGSET
+            # same lookup for the second msd
+            t2 = msd2[0]
+            if agr_case not in TAGSET[t2]:
+                logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
+                return False
+            v2 = TAGSET[t2].index(agr_case)
+            if v2 + 1 >= len(msd2):
+                continue
+            m2 = msd2[v2 + 1]
+
+            # mismatch only when both values are set ('-' acts as a wildcard)
+            if '-' not in [m1, m2] and m1 != m2:
+                return False
+
+        return True
+    
+    def render(self):
+        pass
+
+
 class ComponentRendition:
    def __init__(self):
        self.more = None
-        self.rendition = Rendition.Unknown
-    
-    def _set_rendition(self, r):
-        assert(self.rendition is Rendition.Unknown)
-        self.rendition = r
+        self.representation_factory = ComponentRepresentation
    
    def _set_more(self, m):
        self.more = m
@@ -186,190 +322,205 @@ class ComponentRendition:
    def add_feature(self, feature):
        if 'rendition' in feature:
            if feature['rendition'] == "lemma":
-                self._set_rendition(Rendition.Lemma)
+                self.representation_factory = LemmaCR
            elif feature['rendition'] == "word_form":
-                self._set_rendition(Rendition.WordForm)
-                self._set_more((WordFormSelection.Any, None))
+                # just by default, changes with selection
+                self.representation_factory = WordFormAnyCR
            elif feature['rendition'] == "lexis":
-                self._set_rendition(Rendition.Lexis)
-                self._set_more(feature['string'])
+                self.representation_factory = LexisCR
+                self.more = feature['string']  # payload that cr_instance() hands to LexisCR
            else:
                raise NotImplementedError("Representation rendition: {}".format(feature))

        elif 'selection' in feature:
            if feature['selection'] == "msd":
-                selectors = {k: v for k, v in feature.items() if k != 'selection'}
-                self._set_more((WordFormSelection.Msd, selectors))
+                self.representation_factory = WordFormMsdCR
+                self.more = {k: v for k, v in feature.items() if k != 'selection'}
            elif feature['selection'] == "all":
-                self._set_more((WordFormSelection.All, None))
+                self.representation_factory = WordFormAllCR
            elif feature['selection'] == 'agreement':
                assert(feature['head'][:4] == 'cid_')
                assert(feature['msd'] is not None)
-
-                self._set_more((WordFormSelection.Agreement,
-                    (feature['head'][4:], feature['msd'].split('+'))))
+                self.representation_factory = WordFormAgreementCR
+                self.more = (feature['head'][4:], feature['msd'].split('+'))
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))

        else:
            return None
    
-    def isit(self, rendition):
-        return self.rendition is rendition
+    def cr_instance(self, word_renderer):
+        return self.representation_factory(self.more, word_renderer)
    
    @staticmethod
    def set_representations(matches, structure, word_renderer):
-        representations = {
-            c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
-            for c in structure.components
-        }
-        found_agreements = {}
-        word_component_id = {}
-
-        def render_all(component_id, lst, _bw):
-            rep = "/".join(set([w.text for w in set(lst)])) if len(lst) > 0 else None
-            matches.representations[component_id] = rep
+        representations = {}
+        for c in structure.components:
+            representations[c.idx] = []
+            for rep in c.representation:
+                representations[c.idx].append(rep.cr_instance(word_renderer))
        
-        def render_form(component_id, lst, backup_word):
-            if backup_word is not None:
-                lst.append(backup_word)
-
-            text_forms = {}
-            msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
-            for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
-                text_forms[(msd, lemma)] = text
-
-            lst_ctr = []
-            for word in lst:
-                lst_ctr.append((word.msd, word.lemma))
-            sorted_lst = sorted(set(lst_ctr), key=lst.count)
-
-            for word_msd, word_lemma in sorted_lst:
-                if component_id in found_agreements:
-                    other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
-                    agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
-                    if agr is None:
-                        continue
-
-                    matches.representations[other_component_id] = agr
-
-                if word_lemma is not None:
-                    matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd)
-
-                break
-        
-        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
-            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
-                if ow_msd[0] != w2_msd[0]:
+        for cid, reps in representations.items():
+            for rep in reps:
+                agr = rep.get_agreement()
+                if agr is None:
                    continue

-                if check_agreement(w1_msd, w2_msd, agreements):
-                    return w2_txt
+                if len(representations[agr]) != 1:
+                    n = len(representations[agr])
+                    raise NotImplementedError(
+                        "Structure {}: ".format(structure.id) +
+                        "component {} has agreement".format(cid) +
+                        " with component {}".format(agr) +
+                        ", however there are {} (!= 1) representations".format(n) +
+                        " of component {}!".format(agr))

-        def check_msd(word, selectors):
-            for key, value in selectors.items():
-                t = word.msd[0]
-                v = TAGSET[t].index(key.lower())
-                f1 = word.msd[v + 1]
-                f2 = CODES[value]
+                representations[agr][0].agreement = rep

-                if '-' not in [f1, f2] and f1 != f2:
-                    return False
+        # representations = {
+        #     c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
+        #     for c in structure.components
+        # }
+        # found_agreements = {}

-            return True
+        # def render_form(component_id, lst, backup_word):
+        #     if backup_word is not None:
+        #         lst.append(backup_word)
+
+        #     text_forms = {}
+        #     msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
+        #     for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
+        #         text_forms[(msd, lemma)] = text
+
+        #     lst_ctr = []
+        #     for word in lst:
+        #         lst_ctr.append((word.msd, word.lemma))
+        #     sorted_lst = sorted(set(lst_ctr), key=lst.count)
+
+        #     for word_msd, word_lemma in sorted_lst:
+        #         if component_id in found_agreements:
+        #             other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
+        #             agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
+        #             if agr is None:
+        #                 continue
+
+        #             matches.representations[other_component_id] = agr
+
+        #         if word_lemma is not None:
+        #             matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd)
+
+        #         break
        
-        def check_agreement(msd1, msd2, agreements):
-            for agr_case in agreements:
-                t1 = msd1[0]
-                # if not in msd, some strange msd was tries, skipping...
-                if agr_case not in TAGSET[t1]:
-                    logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
-                    return False
+        # def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
+        #     for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
+        #         if ow_msd[0] != w2_msd[0]:
+        #             continue

-                v1 = TAGSET[t1].index(agr_case)
-                # if none specified: nedolocnik, always agrees
-                if v1 + 1 >= len(msd1): 
-                    continue 
-                # first is uppercase, not in TAGSET
-                m1 = msd1[v1 + 1]
+        #         if check_agreement(w1_msd, w2_msd, agreements):
+        #             return w2_txt

-                # REPEAT (not DRY!)
-                t2 = msd2[0]
-                if agr_case not in TAGSET[t2]:
-                    logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
-                    return False
-                v2 = TAGSET[t2].index(agr_case)
-                if v2 + 1 >= len(msd2): 
-                    continue 
-                m2 = msd2[v2 + 1]
+        
+        # def check_agreement(msd1, msd2, agreements):
+        #     for agr_case in agreements:
+        #         t1 = msd1[0]
+        #         # if not in msd, some strange msd was tries, skipping...
+        #         if agr_case not in TAGSET[t1]:
+        #             logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
+        #             return False

-                # match!
-                if '-' not in [m1, m2] and m1 != m2:
-                    return False
+        #         v1 = TAGSET[t1].index(agr_case)
+        #         # if none specified: nedolocnik, always agrees
+        #         if v1 + 1 >= len(msd1): 
+        #             continue 
+        #         # first is uppercase, not in TAGSET
+        #         m1 = msd1[v1 + 1]

-            return True
+        #         # REPEAT (not DRY!)
+        #         t2 = msd2[0]
+        #         if agr_case not in TAGSET[t2]:
+        #             logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
+        #             return False
+        #         v2 = TAGSET[t2].index(agr_case)
+        #         if v2 + 1 >= len(msd2): 
+        #             continue 
+        #         m2 = msd2[v2 + 1]
+
+        #         # match!
+        #         if '-' not in [m1, m2] and m1 != m2:
+        #             return False
+
+        #     return True

        for words in matches.matches:
            # first pass, check everything but agreements
            for w_id, w in words.items():
                component = structure.get_component(w_id)
-                rep = component.representation
-                word_component_id[w.id] = w_id
-
-                if rep.isit(Rendition.Lemma):
-                    representations[w_id][0] = False
-                    representations[w_id][1] = w.lemma
-                elif rep.isit(Rendition.Lexis):
-                    representations[w_id][0] = False
-                    representations[w_id][1] = rep.more
-                elif rep.isit(Rendition.Unknown):
-                    representations[w_id][0] = False
-                    representations[w_id][1] = ""
+                component_representations = representations[component.idx]
+                for representation in component_representations:
+                    representation.add_word(w)
                
-                # it HAS to be word_form now
-                else:
-                    assert(rep.isit(Rendition.WordForm))
-                    wf_type, more = rep.more
-                    add = True
+                # if rep.isit(Rendition.Lemma):
+                #     representations[w_id][0] = False
+                #     representations[w_id][1] = w.lemma
+                # elif rep.isit(Rendition.Lexis):
+                #     representations[w_id][0] = False
+                #     representations[w_id][1] = rep.more
+                # elif rep.isit(Rendition.Unknown):
+                #     representations[w_id][0] = False
+                #     representations[w_id][1] = ""
+                
+                # # it HAS to be word_form now
+                # else:
+                #     assert(rep.isit(Rendition.WordForm))
+                #     wf_type, more = rep.more
+                #     add = True

-                    if wf_type is WordFormSelection.Msd:
-                        add = check_msd(w, more)
-                        func = render_form
-                    elif wf_type is WordFormSelection.All:
-                        func = render_all
-                    elif wf_type is WordFormSelection.Any:
-                        func = render_form
-                    else:
-                        assert(wf_type is WordFormSelection.Agreement)
-                        other_w, agreements = more
-                        if other_w not in found_agreements:
-                            found_agreements[other_w] = (w_id, w, agreements, [])
+                #     if wf_type is WordFormSelection.Msd:
+                #         add = check_msd(w, more)
+                #         func = render_form
+                #     elif wf_type is WordFormSelection.All:
+                #         func = render_all
+                #     elif wf_type is WordFormSelection.Any:
+                #         func = render_form
+                #     else:
+                #         assert(wf_type is WordFormSelection.Agreement)
+                #         other_w, agreements = more
+                #         if other_w not in found_agreements:
+                #             found_agreements[other_w] = (w_id, w, agreements, [])

-                        found_agreements[other_w][-1].append((w.msd, w.text))
-                        func = lambda *x: None
+                #         found_agreements[other_w][-1].append((w.msd, w.text))
+                #         func = lambda *x: None

-                    representations[w_id][1] = func
-                    if add:
-                        representations[w_id][0].append(w)
+                #     representations[w_id][1] = func
+                #     if add:
+                #         representations[w_id][0].append(w)

-        # just need to set representation to first group,
-        # but in correct order, agreements last!
-        representation_sorted_words = []
-        for w_id, w in matches.matches[0].items():
-            rep = component.representation
-            if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement:
-                representation_sorted_words.append((w_id, w))
-            else:
-                representation_sorted_words.insert(0, (w_id, w))
+        for cid, reps in representations.items():
+            for rep in reps:
+                rep.render()

-        for w_id, w in representation_sorted_words:
-            data = representations[w_id]
-            if type(data[1]) is str:
-                matches.representations[w_id] = None if data[0] else data[1]
-            else:
-                backup_msd = word_renderer.get_lemma_msd(w.lemma)
-                backup_word = lemma_only_word(backup_msd)
-                data[1](str(w_id), data[0], backup_word)
+        for cid, reps in representations.items():
+            rep = " ".join(rep.rendition() for rep in reps)
+            matches.representations[cid] = rep
+        
+        # # just need to set representation to first group,
+        # # but in correct order, agreements last!
+        # representation_sorted_words = []
+        # for w_id, w in matches.matches[0].items():
+        #     rep = component.representation
+        #     if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement:
+        #         representation_sorted_words.append((w_id, w))
+        #     else:
+        #         representation_sorted_words.insert(0, (w_id, w))
+
+        # for w_id, w in representation_sorted_words:
+        #     data = representations[w_id]
+        #     if type(data[1]) is str:
+        #         matches.representations[w_id] = None if data[0] else data[1]
+        #     else:
+        #         backup_msd = word_renderer.get_lemma_msd(w.lemma)
+        #         backup_word = lemma_only_word(backup_msd)
+        #         data[1](str(w_id), data[0], backup_word)
            
    def __str__(self):
        return str(self.rendition)
@@ -519,7 +670,7 @@ class Component:
        self.idx = idx
        self.restriction = None
        self.next_element = []
-        self.representation = ComponentRendition()
+        self.representation = []
        self.selection = {}

        self.iter_ctr = 0
@@ -541,8 +692,11 @@ class Component:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
-        for feature in representation:
-            self.representation.add_feature(feature.attrib)
+        for rep in representation:
+            crend = ComponentRendition()
+            for feature in rep:
+                crend.add_feature(feature.attrib)
+            self.representation.append(crend)

    def find_next(self, deps, comps, restrs, reprs):
        to_ret = []
@@ -721,21 +875,17 @@ class SyntacticStructure:
        return st

    def add_representation(self, n, rep_el, forms):
-        if rep_el.tag == "representation_and":
-            rep_el = rep_el[0]
-            logging.warning("Only using first reprentation in representation_and in structure {}".format(self.id))
-
        assert(rep_el.tag == "representation")
+        to_add = []
        for el in rep_el:
            assert(el.tag == "feature")
-            if 'rendition' in el.attrib:
-                forms[n].append(el)
-            elif 'selection' in el.attrib:
-                forms[n].append(el)
+            if 'rendition' in el.attrib or 'selection' in el.attrib:
+                to_add.append(el)
            else:
                logging.warning("Strange representation feature in structure {}. Skipping"
                        .format(self.id))
                continue
+        forms[n].append(to_add)

    def __str__(self):
        comp_str = "\n".join(str(comp) for comp in self.components)
@@ -892,16 +1042,17 @@ class Word:
        return word_renderer.render(self.lemma, self.msd)

 class WordMsdRenderer:
-    def __init__(self):
+    def __init__(self, lemma_features):
        self.all_words = []
        self.rendered_words = {}
        self.frequent_words = {}
        self.lemma_msd = {}
+        self.lemma_features = lemma_features
    
    def add_words(self, words):
        self.all_words.extend(words)
    
-    def generate_renders(self, lemma_features):
+    def generate_renders(self):
        data = defaultdict(lambda: defaultdict(list))
        for w in self.all_words:
            data[w.lemma][w.msd].append(w.text)
@@ -926,11 +1077,12 @@ class WordMsdRenderer:
            for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
                self.frequent_words[lemma].append((msd, txt, n))
        
+        lf = self.lemma_features
        for lemma in self.lemma_msd.keys():
            cmsd = self.lemma_msd[lemma]
-            if cmsd[0] in lemma_features:
+            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
-                    l1 if l1 != "-" else l2 for l1, l2 in zip(lemma_features[cmsd[0]], cmsd)
+                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
                )
        
    @staticmethod
@@ -952,7 +1104,7 @@ class WordMsdRenderer:
    
    def available_words(self, lemma, existing_texts):
        counted_texts = Counter(existing_texts)
-        for (msd, text), n in counted_texts.most_common():
+        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        if lemma in self.frequent_words:
@@ -960,11 +1112,17 @@ class WordMsdRenderer:
                if (msd, text) not in counted_texts:
                    yield (msd, text)
    
-    def get_lemma_msd(self, lemma):
-        if lemma in self.lemma_msd and self.lemma_msd[lemma][0] != '-':
-            return self.lemma_msd[lemma]
+    def get_lemma_msd(self, lemma, word_msd):
+        # the lemma is expected to be present, since every lemma is collected (KeyError otherwise)
+        lemma_msd = self.lemma_msd[lemma]
+
+        if lemma_msd[0] == '-':
+            if word_msd[0] in self.lemma_features:
+                return self.lemma_features[word_msd[0]]
+            else:
+                return '-'
        else:
-            return None
+            return lemma_msd

 def is_root_id(id_):
    return len(id_.split('.')) == 3
@@ -1200,6 +1358,7 @@ class ColocationIds:
        idx = 1
        for _1, sm in tqdm(self.data.items()):
            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
+            print(idx)
            idx += 1


@@ -1227,7 +1386,7 @@ def main(input_file, structures_file, args):
        logging.debug(str(s))

    colocation_ids = ColocationIds()
-    word_renderer = WordMsdRenderer()
+    word_renderer = WordMsdRenderer(lemma_msds)

    # if True:
    #     with open("match_word.p", "rb") as fp:
@@ -1279,13 +1438,14 @@ def main(input_file, structures_file, args):
                word_renderer.add_words(words)

    # get word renders for lemma/msd
-    word_renderer.generate_renders(lemma_msds)
-    # figure out representations!
-    colocation_ids.set_representations(structures, word_renderer)
+    word_renderer.generate_renders()

+    if args.output:
+        # figure out representations!
+        colocation_ids.set_representations(structures, word_renderer)
+        Writer.make_output_writer(args).write_out(structures, colocation_ids)
    if args.all:
        Writer.make_all_writer(args).write_out(structures, colocation_ids)
-    Writer.make_output_writer(args).write_out(structures, colocation_ids)

    logging.debug([(k, len(v)) for k, v in matches.items()])
    logging.debug(sum(len(v) for _, v in matches.items()))