diff --git a/wani.py b/wani.py index 2e14537..5da191d 100644 --- a/wani.py +++ b/wani.py @@ -3,12 +3,14 @@ import re from enum import Enum from collections import defaultdict import sys +import logging from msd_translate import MSD_TRANSLATE STAVKI = sys.argv[1] -STRUKTURE = sys.argv[2] # "Kolokacije_strukture_09_new-system.xml" +STRUKTURE = sys.argv[2] +FILE_OUT = sys.argv[3] CODES = { "Noun": "N", @@ -98,41 +100,96 @@ TAGSET = { } CATEGORY_BASES = { - "N": ['.', '.', '.', '.', '.?'], - "V": ['.', '.', '.', '.', '.?', '.?', '.?'], - "A": ['.', '.', '.', '.', '.', '.?'], - "R": ['.', '.?'], - "P": ['. ', '.', '.', '.', '.', '.', '.?', '.?'], - "M": ['.', '.', '.', '.?', '.?', '.?'], - "S": ['.'], - "C": ['.'], + "N": ['.'] * 5, + "V": ['.'] * 7, + "A": ['.'] * 6, + "R": ['.'] * 2, + "P": ['.'] * 6, + "M": ['.'] * 6, + "S": ['.'] * 1, + "C": ['.'] * 1, "Q": [], "I": [], "Y": [], - "X": ['.?'] + "X": ['.'] * 1 } class RestrictionType(Enum): Morphology = 0 Lexis = 1 + MatchAll = 2 -class ComponentLevel(Enum): +class Rendition(Enum): Lemma = 0 WordForm = 1 + Unknown = 2 + +class ComponentRendition: + def __init__(self, rendition=Rendition.Unknown): + self.word_form = {} + self.rendition = rendition + + def render(self, word): + if self.rendition == Rendition.Lemma: + return word.lemma + elif self.rendition == Rendition.WordForm: + return word.text + elif self.rendition == Rendition.Unknown: + return None + else: + raise RuntimeError("Unknown rendition: {}".format(self.rendition)) + + def __str__(self): + return str(self.rendition) + + +# dont know... +class StructureSelection(Enum): + All = 0 + Frequency = 1 + +class ComponentRepresentation: + def new(s): + if 'rendition' in s: + if s['rendition'] == "lemma": + return ComponentRendition(Rendition.Lemma) + elif s['rendition'] == "word_form": + return ComponentRendition(Rendition.WordForm) + else: + raise NotImplementedError("Rendition: {}".format(s)) + elif 'selection' in s: + if s['selection'] == "frequency": + return StructureSelection.Frequency + elif s['selection'] == "all": + return StructureSelection.All + else: + return {s['selection']: s['value']} + else: + raise NotImplementedError("Representation: {}".format(s)) + + +class ComponentStatus(Enum): + Optional = 0 + Required = 1 + Forbidden = 2 + + def __str__(self): + if self == ComponentStatus.Optional: + return "?" + elif self == ComponentStatus.Required: + return "!" + else: #Forbidden + return "X" def get_level(restriction): for feature in restriction: if "level" in feature.keys(): lvl = feature.get("level") - if lvl == "lemma": - return ComponentLevel.Lemma - elif lvl == "word_form": - return ComponentLevel.WordForm - else: - continue + else: + continue raise RuntimeError("Unreachable!") @@ -140,28 +197,44 @@ def get_level(restriction): def build_morphology_regex(restriction): restr_dict = {} for feature in restriction: - restr_dict.update(feature.items()) + feature_dict = dict(feature.items()) + + match_type = True + if "filter" in feature_dict: + assert(feature_dict['filter'] == "negative") + match_type = False + del feature_dict['filter'] + + assert(len(feature_dict) == 1) + key, value = next(iter(feature_dict.items())) + restr_dict[key] = (value, match_type) assert('POS' in restr_dict) - category = restr_dict['POS'].capitalize() + category = restr_dict['POS'][0].capitalize() cat_code = CODES[category] rgx = [cat_code] + CATEGORY_BASES[cat_code] del restr_dict['POS'] - del restr_dict['level'] - for attribute, value in restr_dict.items(): + for attribute, (value, typ) in restr_dict.items(): index = TAGSET[cat_code].index(attribute.lower()) assert(index >= 0) if '|' in value: - match = '[' + "".join(CODES[val] for val in value.split('|')) + ']' + match = "".join(CODES[val] for val in value.split('|')) else: match = CODES[value] + match = "[{}{}]".format("" if typ else "^", match) rgx[index + 1] = match - return re.compile("".join(rgx)) + def matcher(text): + for c, r in zip(text, rgx): + if not re.match(r, c): + return False + return True + + return " ".join(rgx), matcher def build_lexis_regex(restriction): @@ -169,18 +242,27 @@ def build_lexis_regex(restriction): for feature in restriction: restr_dict.update(feature.items()) - return re.compile(restr_dict['lemma']) + assert("lemma" in restr_dict) + match_list = restr_dict['lemma'].split('|') + + return match_list, lambda text: text in match_list class Restriction: def __init__(self, restriction_tag): + if restriction_tag is None: + self.type = RestrictionType.MatchAll + self.matcher = None + self.present = None + return + restriction_type = restriction_tag.get('type') if restriction_type == "morphology": self.type = RestrictionType.Morphology - self.matcher = build_morphology_regex(list(restriction_tag)) + self.present, self.matcher = build_morphology_regex(list(restriction_tag)) elif restriction_type == "lexis": self.type = RestrictionType.Lexis - self.matcher = build_lexis_regex(list(restriction_tag)) + self.present, self.matcher = build_lexis_regex(list(restriction_tag)) else: raise NotImplementedError() @@ -189,155 +271,380 @@ class Restriction: match_to = word.msd elif self.type == RestrictionType.Lexis: match_to = word.lemma + elif self.type == RestrictionType.MatchAll: + return True else: raise RuntimeError("Unreachable!") - return self.matcher.match(match_to) + return self.matcher(match_to) def __str__(self): - return "({:s} {})".format(str(self.type).split('.')[1], self.matcher) + return "({:s} {})".format(str(self.type).split('.')[1], self.present) def __repr__(self): return str(self) class Component: - def __init__(self, name, idx): - assert(idx is not None) + def __init__(self, info): + idx = info['cid'] + name = info['name'] if 'name' in info else None - self.name = name if name is not None else "" # for printing... + if 'status' not in info: + status = ComponentStatus.Required + elif info['status'] == 'forbidden': + status = ComponentStatus.Forbidden + elif info['status'] == 'obligatory': + status = ComponentStatus.Required + elif info['status'] == 'optional': + status = ComponentStatus.Optional + else: + raise NotImplementedError("strange status: {}".format(info['status'])) + + self.status = status + self.name = name self.idx = idx self.restriction = None self.next_element = [] - self.level = None + self.rendition = ComponentRendition() + self.selection = {} self.iter_ctr = 0 - def word_to_str(self, word): - if self.level == ComponentLevel.Lemma: - return word.lemma, word.msd - elif self.level == ComponentLevel.WordForm: - return word.text, word.msd - else: - raise RuntimeError("Unreachable") - - def __iter__(self): - self.iter_ctr = 0 - return self - - def __next__(self): - if self.iter_ctr < len(self.next_element): - to_ret = self.next_element[self.iter_ctr] - self.iter_ctr += 1 - return to_ret - else: - raise StopIteration + def render_word(self, word): + return self.rendition.render(word) def add_next(self, next_component, link_label): self.next_element.append((next_component, link_label)) def set_restriction(self, restrictions_tag): - if restrictions_tag.tag == "restriction": + if restrictions_tag is None: + self.restriction = Restriction(None) + + elif restrictions_tag.tag == "restriction": self.restriction = Restriction(restrictions_tag) - self.level = get_level(restrictions_tag) elif restrictions_tag.tag == "restriction_or": self.restriction = [Restriction(el) for el in restrictions_tag] - self.level = get_level(restrictions_tag[0]) - - # same level for every restriction for now and only or available - levels = [get_level(el) for el in restrictions_tag] - assert(len(set(levels)) == 1) else: raise RuntimeError("Unreachable") - def find_next(self, deps, comps, restrs): + def set_representation(self, representation): + cr = None + if representation is not None: + self.representation = [] + + for feature in representation: + f = ComponentRepresentation.new(dict(feature.attrib)) + + if type(f) is StructureSelection: + assert(cr is None) + cr = f + elif type(f) is ComponentRendition: + self.rendition = f + elif type(f) is dict: + self.selection.update(f) + else: + raise RuntimeError("Unreachable: {}".format(f)) + + return cr + + def find_next(self, deps, comps, restrs, reprs): + representation = StructureSelection.All + + to_ret = [] for d in deps: if d[0] == self.idx: _, idx, dep_label = d - next_component = Component(comps[idx], idx) + next_component = Component(comps[idx]) next_component.set_restriction(restrs[idx]) + r1 = next_component.set_representation(reprs[idx]) + to_ret.append(next_component) self.add_next(next_component, dep_label) - next_component.find_next(deps, comps, restrs) + others, r2 = next_component.find_next(deps, comps, restrs, reprs) + to_ret.extend(others) + + if StructureSelection.Frequency in (r1, r2): + representation = StructureSelection.Frequency + + return to_ret, representation + + def name_str(self): + return "_" if self.name is None else self.name + def __str__(self): - el = "({:10} {})".format(self.name, str(self.restriction)) - for next, link in self: - el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(next)) + n = self.name_str() + return "{:s}) {:7s}:{} [{}] :{}".format( + self.idx, n, self.status, self.restriction, self.rendition) + + def tree(self): + el = [] + for next, link in self.next_element: + el.append("{:3} -- {:5} --> {:3}".format(self.idx, link, next.idx)) + el.extend(next.tree()) return el def __repr__(self): return str(self) def match(self, word): + m1 = self._match_self(word) + if m1 is None: + return None + + mn = self._match_next(word) + if mn is None: + return None + + to_ret = [m1] + for cmatch in mn: + # if good match but nothing to add, just continue + if len(cmatch) == 0: + continue + + # if more than one match found for particular component + elif len(cmatch) > 1: + logging.debug("MULTIPLE: {}, {}".format(self.idx, cmatch)) + # if more than one match in multiple components, NOPE! + if len(to_ret) > 1: + logging.warning("Strange multiple match: {}".format( + str([w.id for w in cmatch[0].values()]))) + + for tr in to_ret: + tr.update(cmatch[0]) + continue + + # yeah, so we have found more than one match, => + # more than one element in to_ret + to_ret = [{**dict(to_ret[0]), **m} for m in cmatch] + + else: + for tr in to_ret: + tr.update(cmatch[0]) + + logging.debug("MA: {}".format(str(to_ret))) + return to_ret + + def _match_self(self, word): matched = None # matching if type(self.restriction) is list: for restr in self.restriction: matched = restr.match(word) - if matched is not None: + if matched: # match either break else: matched = self.restriction.match(word) - # recurse to next - if matched: - to_ret = [self.word_to_str(word)] + logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched)) - for next, link in self: - # need to get all links that match - for next_word in word.get_links(link): - match = next.match(next_word) - # if matches, return - if match is not None: - to_ret.extend(match) + # check with status + # if self.status is ComponentStatus.Optional: + # if not matched: + # # nothing to add, but still good... + # return {} + # elif self.status is ComponentStatus.Forbidden: + # # forbiddent is handled at return stage in _match_next + # # just process normally... + # pass + + # recurse to next + if not matched: + return None + else: + return {self.idx: word} + + def _match_next(self, word): + # matches for every component in links from this component + to_ret = [] + + # need to get all links that match + for next, link in self.next_element: + logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx)) + to_ret.append([]) + + # good flag + good = next.status != ComponentStatus.Required + for next_word in word.get_links(link): + logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id)) + match = next.match(next_word) + + if match is not None: + # special treatement for forbidden + if next.status == ComponentStatus.Forbidden: + good = False break - # if none matched, nothing found! - else: - return None + else: + assert(type(match) is list) + to_ret[-1].extend(match) + good = True - return to_ret + # if none matched, nothing found! + if not good: + logging.debug("BAD") + return None - # return None... + return to_ret class SyntacticStructure: def __init__(self): - self.root_component = Component("", 'root') self.id = None self.lbs = None + self.agreements = [] + self.components = [] + self.selection = StructureSelection.All @staticmethod def from_xml(xml): st = SyntacticStructure() st.id = xml.get('id') st.lbs = xml.get('LBS') + + if float(st.id.replace('-','.')) >= 17: + return None - components, system = list(xml) - dependencies, restrictions = list(system) + assert(len(list(xml)) == 1) + system = next(iter(xml)) assert(system.get('type') == 'JOS') + components, dependencies, definitions = list(system) deps = [ (dep.get('from'), dep.get('to'), dep.get('label')) for dep in dependencies ] - comps = { comp.get('cid'): comp.get('name') for comp in components } - restrs = { r.get('cid'): next(iter(r)) for r in restrictions } + comps = { comp.get('cid'): dict(comp.items()) for comp in components } - st.root_component.find_next(deps, comps, restrs) - st.root_component = list(st.root_component)[0][0] # get first next + restrs, forms = {}, {} + for comp in definitions: + n = comp.get('cid') + restrs[n] = None + forms[n] = None + + for el in comp: + if el.tag.startswith("restriction"): + assert(restrs[n] is None) + restrs[n] = el + elif el.tag.startswith("representation"): + st.add_representation(n, el, forms) + else: + raise NotImplementedError("definition??") + + fake_root_component = Component({'cid': '#', 'type': 'other'}) + st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms) return st + def add_representation(self, n, el, forms): + if el.tag == "representation": + els = [el] + elif el.tag == "representation_and": + els = list(el) + else: + raise NotImplementedError("repr what?: {}".format(el.tag)) + + for el in els: + if el.get('basic') == 'form': + assert(forms[n] is None) + forms[n] = el + elif el.get('basic') == "agreement": + self.add_agreement(n, el) + else: + raise NotImplementedError("representation?: {}".format(el.tag)) + + def add_agreement(self, n, el): + assert(el.get('head')[:4] == 'cid_') + + n1 = n + n2 = el.get('head')[4:] + agreement_str = next(iter(el)).get('agreement') + + self.agreements.append({ + 'n1': n1, + 'n2': n2, + 'match': agreement_str.split('|')}) + def __str__(self): - arrow = "root -- modra --> " - return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component)) + comp_str = "\n".join(str(comp) for comp in self.components) + + agrs = "\n".join("({} -[{}]- {}) ".format( + a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements) + + links_str = "\n".join(self.components[0].tree()) + + return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format( + self.id, self.lbs, comp_str, agrs, links_str, "-" * 40) + + def get_component(self, idx): + for c in self.components: + if c.idx == idx: + return c + raise RuntimeError("Unknown component id: {}".format(idx)) + + def check_agreements(self, match): + for agr in self.agreements: + w1 = match[agr['n1']] + w2 = match[agr['n2']] + + for agr_case in agr['match']: + t1 = w1.msd[0] + v1 = TAGSET[t1].index(agr_case) + assert(v1 >= 0) + # if none specified: nedolocnik, always agrees + if v1 + 1 >= len(w1.msd): + continue + # first is uppercase, not in TAGSET + m1 = w1.msd[v1 + 1] + + # REPEAT (not DRY!) + t2 = w2.msd[0] + v2 = TAGSET[t2].index(agr_case) + assert(v2 >= 0) + if v2 + 1 >= len(w2.msd): + continue + m2 = w2.msd[v2 + 1] + + # match! + if '-' not in [m1, m2] and m1 != m2: + return False + + return True + + def check_form(self, match): + for midx, w in match.items(): + c = self.get_component(midx) + for key, value in c.selection.items(): + t = w.msd[0] + v = TAGSET[t].index(key.lower()) + f1 = w.msd[v + 1] + f2 = CODES[value] + + if '-' not in [f1, f2] and f1 != f2: + return False + + return True def match(self, word): - return self.root_component.match(word) + matches = self.components[0].match(word) + if matches is None: + return [] + + to_ret = [] + for m in matches: + if not self.check_agreements(m): + bad = "Agreement" + elif not self.check_form(m): + bad = "Form" + else: + bad = "OK" + + to_ret.append((m, bad)) + + return to_ret def build_structures(filename): @@ -345,14 +652,27 @@ def build_structures(filename): with open(filename, 'r') as fp: et = ElementTree.XML(fp.read()) for structure in et.iter('syntactic_structure'): - structures.append(SyntacticStructure.from_xml(structure)) + to_append = SyntacticStructure.from_xml(structure) + if to_append is None: + continue + structures.append(to_append) return structures +def get_msd(comp): + d = dict(comp.items()) + if 'msd' in d: + return d['msd'] + elif 'ana' in d: + return d['ana'][4:] + else: + logging.error(d, file=sys.stderr) + raise NotImplementedError("MSD?") + class Word: def __init__(self, xml): self.lemma = xml.get('lemma') - self.msd = MSD_TRANSLATE[xml.get('msd')] + self.msd = MSD_TRANSLATE[get_msd(xml)] self.id = xml.get('id') self.text = xml.text self.links = defaultdict(list) @@ -370,6 +690,10 @@ class Word: return self.links[link] +def is_root_id(id_): + return len(id_.split('.')) == 3 + + def load_corpus(filename): with open(filename, 'r') as fp: xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) @@ -381,20 +705,38 @@ def load_corpus(filename): for w in et.iter("w"): words[w.get('id')] = Word(w) + pcs = set() + for pc in et.iter("pc"): + pcs.add(pc.get('id')) + for l in et.iter("link"): - assert('dep' in l.keys() and 'from' in l.keys() and 'afun' in l.keys()) + if 'dep' in l.keys(): + ana = l.get('afun') + lfrom = l.get('from') + dest = l.get('dep') + else: + ana = l.get('ana') + if ana[:4] != 'syn:': # dont bother... + continue + ana = ana[4:] + lfrom, dest = l.get('target').replace('#', '').split() - lfrom = l.get('from') if lfrom in words: - assert(not lfrom.endswith('.0')) - next_word_id = l.get('dep') - if next_word_id in words: - next_word = words[next_word_id] - words[l.get('from')].add_link(l.get('afun'), next_word) + if is_root_id(lfrom): + logging.error("NOO: ", lfrom, file=sys.stderr) + sys.exit(1) - # catch modra links from root - elif lfrom[-1] == '0' and l.get('afun') == 'modra': - root_words.add(l.get('dep')) + if dest in words: + next_word = words[dest] + words[lfrom].add_link(ana, next_word) + + # catch links from root + elif is_root_id(lfrom): + root_words.add(dest) + + # catch links from :S + elif lfrom in pcs: + logging.warning(str(("link from : ", lfrom))) else: # strange errors, just skip... @@ -408,8 +750,6 @@ def load_corpus(filename): def main(): - words = load_corpus(STAVKI) - import time t = time.time() @@ -417,21 +757,86 @@ def main(): for s in structures: print(s) - num_matches = 0 - for w in words: - for s in structures: - m = s.match(w) - if m is not None: - num_matches += 1 - print(s.id, m) + # words = load_corpus(STAVKI) + import pickle + # with open("words.p", "wb") as fp: + # pickle.dump(words, fp) + with open("words.p", "rb") as fp: + words = pickle.load(fp) + print("MATCHES...") + matches = {s.id: [] for s in structures} + + for idx, s in enumerate(structures): + print("\r{}/{}: {:7s}".format(idx, len(structures), s.id)) #, end="") + for w in words: + mhere = s.match(w) + logging.debug(" GOT: {}".format(len(mhere))) + for match, reason in mhere: + matches[s.id].append((match, reason)) + print("") + + header = [ + "Structure_ID", "Component_ID", "Token_ID", "Word_form", + "Lemma", "Msd", "Representative_form_1", "Component_ID", + "Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", + "Collocation_ID", "Joint_representative_form"] + csv = [", ".join(header)] + + colocation_ids = {} + + for s in structures: + ms = matches[s.id] + + for m, reason in ms: + colocation_id = [s.id] + to_print = [s.id] + + m_sorted = defaultdict(lambda: None, m.items()) + for idx, comp in enumerate(s.components): + idx = str(idx + 1) + if idx not in m_sorted: + to_print.extend([idx, "", "", "", "", ""]) + else: + w = m_sorted[idx] + # if comp.render_word(m_sorted[idx]) is not None: + if True: + to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""]) + colocation_id.append(w.lemma) + + colocation_id = tuple(colocation_id) + if colocation_id in colocation_ids: + cid = colocation_ids[colocation_id] + else: + cid = len(colocation_ids) + colocation_ids[colocation_id] = cid + + to_print.extend([str(cid), ""]) + csv.append(", ".join(to_print)) + + + with open(FILE_OUT, "w") as fp: + print("\n".join(csv), file=fp) + + # groups = defaultdict(int) + # for m, reason in ms: + # if reason != "OK": + # continue + # lemmas = [(n, w.lemma) for n, w in m.items()] + # lemmas = tuple(sorted(lemmas, key=lambda x: x[0])) + # groups[lemmas] += 1 + + # print(s.id) + # print(groups) + + + print("") print("TIME", time.time() - t) - print(num_matches) - + print([(k, len(v)) for k, v in matches.items()]) + print(sum(len(v) for _, v in matches.items())) if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) main() - - - +# 6, 7 primeri laznih zadetkov?