From dce55d04a3d8634c0de247594a20ea861dc05e41 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Mon, 20 May 2019 18:14:11 +0200 Subject: [PATCH] Does not yet work, agreements in representation --- wani.py | 153 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 56 deletions(-) diff --git a/wani.py b/wani.py index f5d89a6..ef49952 100644 --- a/wani.py +++ b/wani.py @@ -132,6 +132,11 @@ class Rendition(Enum): Lexis = 2 Unknown = 3 +class WordFormSelection(Enum): + All = 0 + Msd = 1 + Agreement = 2 + class Order(Enum): FromTo = 0 ToFrom = 1 @@ -192,9 +197,15 @@ class ComponentRendition: elif 'selection' in feature: if feature['selection'] == "msd": selectors = {k: v for k, v in feature.items() if k != 'selection'} - self._set_more(selectors) + self._set_more((WordFormSelection.Msd, selectors)) elif feature['selection'] == "all": - self._set_more("all") + self._set_more((WordFormSelection.All, None)) + elif feature['selection'] == 'agreement': + assert(feature['head'][:4] == 'cid_') + assert(feature['msd'] is not None) + + self._set_more((WordFormSelection.Agreement, + (feature['head'][4:], feature['msd'].split('+')))) else: raise NotImplementedError("Representation selection: {}".format(feature)) @@ -211,13 +222,49 @@ class ComponentRendition: def render_all(lst): return "/".join(set(lst)) - def render_form(_lst): - return ":(" + def render_form(lst): + # find most frequent + return max(set(lst), key=lst.count) + + def check_msd(word, selectors): + for key, value in selectors.items(): + t = word.msd[0] + v = TAGSET[t].index(key.lower()) + f1 = word.msd[v + 1] + f2 = CODES[value] - for words, agreement in matches: - if not agreement: - continue + if '-' not in [f1, f2] and f1 != f2: + return False + + return True + + def check_agreement(w1, w2, agreements): + for agr_case in agreements: + t1 = w1.msd[0] + v1 = TAGSET[t1].index(agr_case) + assert(v1 >= 0) + # if none specified: nedolocnik, always agrees + if v1 + 1 >= len(w1.msd): + continue + # first is uppercase, not in TAGSET + m1 = w1.msd[v1 + 1] + + # REPEAT (not DRY!) + t2 = w2.msd[0] + v2 = TAGSET[t2].index(agr_case) + assert(v2 >= 0) + if v2 + 1 >= len(w2.msd): + continue + m2 = w2.msd[v2 + 1] + # match! + if '-' not in [m1, m2] and m1 != m2: + return False + + return True + + + for words in matches: for w_id, w in words.items(): component = structure.get_component(w_id) rep = component.representation @@ -234,23 +281,46 @@ class ComponentRendition: # it HAS to be word_form now else: + wf_type, more = rep.more + # set correct type first if type(representations[w_id][1]) is str: representations[w_id] = ( - [], render_all if rep.more == "all" else render_form + [], render_all if wf_type is WordFormSelection.All else render_form ) - representations[w_id][0].append(w.text) + + if wf_type is WordFormSelection.All: + add = True + elif wf_type is WordFormSelection.Msd: + add = check_msd(w, more) + else: + assert(wf_type is WordFormSelection.Agreement) + other_w, agreements = more + add = check_agreement(w, words[other_w], agreements) + + if add: + representations[w_id][0].append(w.text) + + doprint = matches[0]['1'].text.startswith('evrop') # just need to set representation to first group... - for w_id, w in matches[0][0].items(): + for w_id, w in matches[0].items(): data = representations[w_id] + if doprint: + print(data) if type(data[1]) is str: w.representation_failed = data[0] w.representation = w.lemma if w.representation_failed else data[1] else: - w.representation_failed = len(data[0]) > 0 + w.representation_failed = len(data[0]) == 0 w.representation = w.lemma if w.representation_failed else data[1](data[0]) + + if doprint: + print(w.representation_failed, w.representation) + + if doprint: + print('--') def __str__(self): return str(self.rendition) @@ -563,7 +633,6 @@ class SyntacticStructure: def __init__(self): self.id = None self.lbs = None - self.agreements = [] self.components = [] @staticmethod @@ -611,38 +680,19 @@ class SyntacticStructure: assert(el.tag == "feature") if 'rendition' in el.attrib: forms[n].append(el) - elif 'selection' in el.attrib and el.attrib["selection"] != "agreement": - forms[n].append(el) elif 'selection' in el.attrib: - self.add_agreement(n, el) + forms[n].append(el) else: logging.warning("Strange representation feature in structure {}. Skipping" .format(self.id)) continue - def add_agreement(self, n, el): - assert(el.get('head')[:4] == 'cid_') - - n1 = n - n2 = el.get('head')[4:] - agreement_str = el.get('msd') - assert(agreement_str is not None) - - self.agreements.append({ - 'n1': n1, - 'n2': n2, - 'match': agreement_str.split('+')}) - def __str__(self): comp_str = "\n".join(str(comp) for comp in self.components) - - agrs = "\n".join("({} -[{}]- {}) ".format( - a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements) - links_str = "\n".join(self.components[0].tree()) - return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format( - self.id, self.lbs, comp_str, agrs, links_str, "-" * 40) + return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format( + self.id, self.lbs, comp_str, links_str, "-" * 40) def get_component(self, idx): for c in self.components: @@ -695,21 +745,10 @@ class SyntacticStructure: def match(self, word): matches = self.components[0].match(word) - if matches is None: - return [] - - to_ret = [] - for m in matches: - # if not self.check_agreements(m): - # bad = "Agreement" - # elif not self.check_form(m): - # bad = "Form" - # else: - # bad = "OK" - - to_ret.append((m, self.check_agreements(m))) - - return to_ret + return [] if matches is None else matches + + # for m in matches: + # to_ret.append((m, self.check_agreements(m))) def build_structures(filename): @@ -898,11 +937,11 @@ class Writer: def write_out_worker(self, file_handler, structure_id, components, colocation_ids): rows = [] - for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all): + for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all): to_write = [] representation = "" - for idx, comp in enumerate(components): + for idx, _comp in enumerate(components): idx = str(idx + 1) word = m[idx] if idx in m else None to_write.extend(self.from_word(word)) @@ -978,21 +1017,23 @@ class ColocationIds: def add_matches(self, matches): for sid, nms in matches.items(): for nm in nms: - self._add_match(nm[2], sid, (nm[0], nm[1])) + self._add_match(nm[1], sid, nm[0]) def get_matches_for(self, structure_id, group): for _cid_tup, (cid, cid_matches, sid) in self.data.items(): if sid != structure_id: continue - for words, reason in cid_matches: - yield (cid, words, reason, len(cid_matches)) + for words in cid_matches: + yield (cid, words, len(cid_matches)) if group: break def set_representations(self, structures): components_dict = {structure.id: structure for structure in structures} for _1, (_2, cid_matches, sid) in self.data.items(): + if _2 == '1309': + a = 1 ComponentRendition.set_representations(cid_matches, components_dict[sid]) @@ -1004,12 +1045,12 @@ def match_file(words, structures): for w in words: mhere = s.match(w) logging.debug(" GOT: {}".format(len(mhere))) - for match, reason in mhere: + for match in mhere: colocation_id = [(idx, w.lemma) for idx, w in match.items()] colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0])) colocation_id = tuple(colocation_id) - matches[s.id].append((match, reason, colocation_id)) + matches[s.id].append((match, colocation_id)) return matches