From 6a221ae8fedd7dab5270b60940c2b9f09a17006a Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Fri, 25 Jan 2019 11:58:40 +0100 Subject: [PATCH] Fixes for msd length matching and pc matching Also some cleanup and fix output formatting --- msd_translate.py | 3 +- wani.py | 80 ++++++++++++++++++++++++------------------------ 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/msd_translate.py b/msd_translate.py index ccab003..016f918 100644 --- a/msd_translate.py +++ b/msd_translate.py @@ -1900,4 +1900,5 @@ MSD_TRANSLATE = { "N": "X", "Nj": "Xf", "Nt": "Xt", - "Np": "Xp"} + "Np": "Xp", + "U": "N"} diff --git a/wani.py b/wani.py index 5da191d..0c9d8cc 100644 --- a/wani.py +++ b/wani.py @@ -8,6 +8,9 @@ import logging from msd_translate import MSD_TRANSLATE +MAX_NUM_COMPONENTS = 5 + + STAVKI = sys.argv[1] STRUKTURE = sys.argv[2] FILE_OUT = sys.argv[3] @@ -215,6 +218,7 @@ def build_morphology_regex(restriction): rgx = [cat_code] + CATEGORY_BASES[cat_code] del restr_dict['POS'] + min_msd_length = 1 for attribute, (value, typ) in restr_dict.items(): index = TAGSET[cat_code].index(attribute.lower()) @@ -228,7 +232,13 @@ def build_morphology_regex(restriction): match = "[{}{}]".format("" if typ else "^", match) rgx[index + 1] = match + if typ: + min_msd_length = max(index + 1, min_msd_length) + def matcher(text): + if len(text) <= min_msd_length: + return False + for c, r in zip(text, rgx): if not re.match(r, c): return False @@ -443,16 +453,6 @@ class Component: logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched)) - # check with status - # if self.status is ComponentStatus.Optional: - # if not matched: - # # nothing to add, but still good... - # return {} - # elif self.status is ComponentStatus.Forbidden: - # # forbiddent is handled at return stage in _match_next - # # just process normally... - # pass - # recurse to next if not matched: return None @@ -465,12 +465,13 @@ class Component: # need to get all links that match for next, link in self.next_element: - logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx)) + next_links = word.get_links(link) + logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, next.idx, len(next_links))) to_ret.append([]) # good flag good = next.status != ComponentStatus.Required - for next_word in word.get_links(link): + for next_word in next_links: logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id)) match = next.match(next_word) @@ -679,6 +680,11 @@ class Word: assert(None not in (self.id, self.lemma, self.msd)) + @staticmethod + def pcWord(pc): + pc.set('lemma', pc.text) + return Word(pc) + def add_link(self, link, to): self.links[link].append(to) @@ -700,14 +706,11 @@ def load_corpus(filename): xmlstring = xmlstring.replace(' xml:', ' ') et = ElementTree.XML(xmlstring) - root_words = set() words = {} for w in et.iter("w"): words[w.get('id')] = Word(w) - - pcs = set() for pc in et.iter("pc"): - pcs.add(pc.get('id')) + words[pc.get('id')] = Word.pcWord(pc) for l in et.iter("link"): if 'dep' in l.keys(): @@ -723,29 +726,20 @@ def load_corpus(filename): if lfrom in words: if is_root_id(lfrom): - logging.error("NOO: ", lfrom, file=sys.stderr) + logging.error("NOO: ", lfrom) sys.exit(1) if dest in words: next_word = words[dest] words[lfrom].add_link(ana, next_word) - - # catch links from root - elif is_root_id(lfrom): - root_words.add(dest) - - # catch links from :S - elif lfrom in pcs: - logging.warning(str(("link from : ", lfrom))) + else: + logging.error("Unknown id: {}".format(dest)) + sys.exit(1) else: # strange errors, just skip... pass - no_root_words = [w for k, w in words.items() if k in root_words] - missing = root_words - set(w.id for w in no_root_words) - # what should i do with this I forgot :( - return list(words.values()) @@ -774,15 +768,16 @@ def main(): logging.debug(" GOT: {}".format(len(mhere))) for match, reason in mhere: matches[s.id].append((match, reason)) + print("") - header = [ - "Structure_ID", "Component_ID", "Token_ID", "Word_form", - "Lemma", "Msd", "Representative_form_1", "Component_ID", - "Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", - "Collocation_ID", "Joint_representative_form"] - csv = [", ".join(header)] + header = ["Structure_ID"] + for i in range(MAX_NUM_COMPONENTS): + header.extend("C{}_{}".format(i + 1, thd) for thd in + ["Token_ID", "Word_form", "Lemma", "Msd", "Representative_form"]) + header.extend(["Collocation_ID", "Joint_representative_form"]) + csv = [", ".join(header)] colocation_ids = {} for s in structures: @@ -790,28 +785,33 @@ def main(): for m, reason in ms: colocation_id = [s.id] - to_print = [s.id] + to_print = [] m_sorted = defaultdict(lambda: None, m.items()) for idx, comp in enumerate(s.components): idx = str(idx + 1) if idx not in m_sorted: - to_print.extend([idx, "", "", "", "", ""]) + to_print.extend(["", "", "", "", ""]) else: w = m_sorted[idx] # if comp.render_word(m_sorted[idx]) is not None: if True: - to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""]) + to_print.extend([w.id, w.text, w.lemma, w.msd, ""]) colocation_id.append(w.lemma) colocation_id = tuple(colocation_id) if colocation_id in colocation_ids: cid = colocation_ids[colocation_id] else: - cid = len(colocation_ids) + cid = len(colocation_ids) + 1 colocation_ids[colocation_id] = cid - + + to_print = [s.id] + to_print + length = 1 + MAX_NUM_COMPONENTS * 5 + # make them equal size + to_print.extend([""] * (length - len(to_print))) to_print.extend([str(cid), ""]) + csv.append(", ".join(to_print))