"""Match syntactic structures (collocation patterns) against a parsed corpus.

Usage: ``<script> STAVKI STRUKTURE`` where the first argument is the corpus
XML file and the second the XML file describing the syntactic structures.
"""
from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict
import sys

from msd_translate import MSD_TRANSLATE

STAVKI = sys.argv[1]
STRUKTURE = sys.argv[2]  # "Kolokacije_strukture_09_new-system.xml"

# Maps category names and attribute values (as written in the structure
# definitions) to the single-character codes used inside an MSD tag.
CODES = {
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    'common': 'c',
    'proper': 'p',
    'masculine': 'm',
    'feminine': 'f',
    'neuter': 'n',
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}

# For every category code: the attribute stored at each position of the MSD
# tag that follows the category character.
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number',
          'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}

# Default regex fragment for every MSD position of a category; a restriction
# overwrites the fragments of the attributes it constrains.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    # The first fragment used to be '. ' (with a stray space), which made
    # the default pronoun pattern demand a literal space inside the tag.
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?']
}


class RestrictionType(Enum):
    """Which property of a word a restriction is checked against."""
    Morphology = 0
    Lexis = 1


class ComponentLevel(Enum):
    """Which representation of a matched word is reported."""
    Lemma = 0
    WordForm = 1


def get_level(restriction):
    """Return the ComponentLevel declared by the features of a restriction.

    ``restriction`` is an iterable of feature elements; the first feature
    whose ``level`` attribute is ``"lemma"`` or ``"word_form"`` decides.

    Raises RuntimeError when no feature declares a recognised level.
    """
    for feature in restriction:
        lvl = feature.get("level")
        if lvl == "lemma":
            return ComponentLevel.Lemma
        if lvl == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")


def build_morphology_regex(restriction):
    """Compile the MSD regex described by the features of a restriction.

    All feature attributes are merged into one dict. ``POS`` selects the
    category (and thereby the base pattern), ``level`` is ignored here, and
    every remaining attribute replaces the pattern fragment at its position
    in the tag. A value of the form ``a|b`` becomes a character class.

    Raises KeyError for a missing ``level`` or an unknown category/value and
    ValueError for an attribute the category does not have.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    assert 'POS' in restr_dict
    category = restr_dict['POS'].capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    del restr_dict['POS']
    del restr_dict['level']

    for attribute, value in restr_dict.items():
        # list.index raises ValueError for an unknown attribute, so no
        # further range check is needed.
        index = TAGSET[cat_code].index(attribute.lower())

        if '|' in value:
            fragment = '[' + "".join(CODES[val] for val in value.split('|')) + ']'
        else:
            fragment = CODES[value]

        # +1 because position 0 of the pattern is the category code itself.
        rgx[index + 1] = fragment

    return re.compile("".join(rgx))


def build_lexis_regex(restriction):
    """Compile the ``lemma`` attribute of a restriction's features as a regex.

    Raises KeyError when none of the features carries a ``lemma`` attribute.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    return re.compile(restr_dict['lemma'])


class Restriction:
    """A single morphology or lexis condition a word has to satisfy."""

    def __init__(self, restriction_tag):
        """Build the matcher from a ``restriction`` element.

        Raises NotImplementedError for a ``type`` other than ``morphology``
        or ``lexis``.
        """
        restriction_type = restriction_tag.get('type')
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(list(restriction_tag))
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        """Return a match object if ``word`` satisfies the restriction, else None.

        Morphology patterns are matched against the start of ``word.msd``.
        Lexis patterns must match the whole ``word.lemma``; a prefix match
        would let the lemma pattern ``za`` also accept ``zakaj``.
        """
        if self.type == RestrictionType.Morphology:
            return self.matcher.match(word.msd)
        if self.type == RestrictionType.Lexis:
            return self.matcher.fullmatch(word.lemma)
        raise RuntimeError("Unreachable!")

    def __str__(self):
        return "({:s} {})".format(str(self.type).split('.')[1], self.matcher)

    def __repr__(self):
        return str(self)


class Component:
    """One node of a syntactic structure together with its outgoing links.

    Iterating a component yields ``(next_component, link_label)`` pairs. The
    component is its own iterator (position kept in ``iter_ctr``), so two
    simultaneous iterations over the same component are not supported.
    """

    def __init__(self, name, idx):
        assert idx is not None
        # for printing...
        self.name = name if name is not None else ""
        self.idx = idx
        self.restriction = None
        self.next_element = []
        self.level = None
        self.iter_ctr = 0

    def word_to_str(self, word):
        """Return ``(lemma, msd)`` or ``(text, msd)`` depending on the level."""
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def __iter__(self):
        self.iter_ctr = 0
        return self

    def __next__(self):
        if self.iter_ctr < len(self.next_element):
            to_ret = self.next_element[self.iter_ctr]
            self.iter_ctr += 1
            return to_ret
        else:
            raise StopIteration

    def add_next(self, next_component, link_label):
        """Append a child component reached over a link labelled ``link_label``."""
        self.next_element.append((next_component, link_label))

    def set_restriction(self, restrictions_tag):
        """Set restriction(s) and level from a ``restriction``/``restriction_or`` tag.

        For ``restriction_or`` the restriction becomes a list of alternatives
        which all have to declare the same level.
        """
        if restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)
        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
            self.level = get_level(restrictions_tag[0])

            # same level for every restriction for now and only or available
            levels = [get_level(el) for el in restrictions_tag]
            assert len(set(levels)) == 1
        else:
            raise RuntimeError("Unreachable")

    def find_next(self, deps, comps, restrs):
        """Recursively attach all components reachable from this one.

        ``deps`` is a list of ``(from, to, label)`` triples, ``comps`` maps a
        component id to its name and ``restrs`` maps a component id to its
        restriction element.
        """
        for dep_from, idx, dep_label in deps:
            if dep_from != self.idx:
                continue

            next_component = Component(comps[idx], idx)
            next_component.set_restriction(restrs[idx])

            self.add_next(next_component, dep_label)
            next_component.find_next(deps, comps, restrs)

    def __str__(self):
        el = "({:10} {})".format(self.name, str(self.restriction))
        for next_component, link in self:
            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(next_component))
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match the structure rooted at this component against ``word``.

        Returns a list of ``word_to_str`` tuples: this word first, followed,
        for every child component, by the match of the first linked word that
        satisfies that child. Returns None when this component's restriction
        fails or when some child has no matching linked word.
        """
        matched = None

        # matching
        if isinstance(self.restriction, list):
            # restriction_or: the first alternative that matches wins
            for restr in self.restriction:
                matched = restr.match(word)
                if matched is not None:
                    break
        else:
            matched = self.restriction.match(word)

        if not matched:
            return None

        # recurse to next
        to_ret = [self.word_to_str(word)]

        for next_component, link in self:
            # need to get all links that match
            for next_word in word.get_links(link):
                sub_match = next_component.match(next_word)
                # if matches, keep it and move on to the next child
                if sub_match is not None:
                    to_ret.extend(sub_match)
                    break
            else:
                # if none matched, nothing found!
                return None

        return to_ret


class SyntacticStructure:
    """A syntactic structure: an id, an LBS value and a tree of components."""

    def __init__(self):
        self.root_component = Component("", 'root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` element.

        The element must have exactly two children (components and system),
        and the system exactly two (dependencies and restrictions). The
        artificial ``root`` component is replaced by the first component it
        links to.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        components, system = list(xml)
        dependencies, restrictions = list(system)

        assert system.get('type') == 'JOS'

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'))
                for dep in dependencies]
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        # only the first child of every restriction wrapper is used
        restrs = {r.get('cid'): next(iter(r)) for r in restrictions}

        st.root_component.find_next(deps, comps, restrs)
        st.root_component = list(st.root_component)[0][0]  # get first next
        return st

    def __str__(self):
        arrow = "root -- modra --> "
        return "{} LBS {}\n------\n{}{}".format(
            self.id, self.lbs, arrow, str(self.root_component))

    def match(self, word):
        """Match the structure with ``word`` as its first component."""
        return self.root_component.match(word)


def build_structures(filename):
    """Parse ``filename`` and return a SyntacticStructure per definition."""
    # Let the XML parser read the file itself so the encoding declared in
    # the document is honoured instead of the platform default.
    root = ElementTree.parse(filename).getroot()
    return [SyntacticStructure.from_xml(structure)
            for structure in root.iter('syntactic_structure')]


class Word:
    """A corpus token with its lemma, translated MSD and outgoing links."""

    def __init__(self, xml):
        self.lemma = xml.get('lemma')
        self.msd = MSD_TRANSLATE[xml.get('msd')]
        self.id = xml.get('id')
        self.text = xml.text
        self.links = defaultdict(list)

        assert None not in (self.id, self.lemma, self.msd)

    def add_link(self, link, to):
        """Register ``to`` as a word reached over a link labelled ``link``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the words linked from this word by ``link``.

        ``link`` may combine several labels with ``|``; the concatenation of
        their targets is built on first use and cached under the combined key.
        """
        if link not in self.links and "|" in link:
            for label in link.split('|'):
                self.links[link].extend(self.links[label])

        return self.links[link]


def load_corpus(filename):
    """Load the corpus XML and return the list of linked Word objects.

    The first default namespace declaration and every `` xml:`` attribute
    prefix are stripped before parsing so tags and attributes can be
    addressed by their plain names.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    et = ElementTree.XML(xmlstring)

    root_words = set()
    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    for link in et.iter("link"):
        assert 'dep' in link.keys() and 'from' in link.keys() and 'afun' in link.keys()

        lfrom = link.get('from')
        if lfrom in words:
            assert not lfrom.endswith('.0')

            next_word_id = link.get('dep')
            if next_word_id in words:
                words[lfrom].add_link(link.get('afun'), words[next_word_id])

        # catch modra links from root (endswith also copes with an empty id)
        elif lfrom.endswith('0') and link.get('afun') == 'modra':
            root_words.add(link.get('dep'))

        # strange errors, just skip...

    # TODO: root_words (targets of 'modra' links from the sentence root) is
    # collected but not used yet; every word is returned as a candidate.
    return list(words.values())


def main():
    """Match every structure against every corpus word and print the hits."""
    words = load_corpus(STAVKI)

    import time
    t = time.time()

    structures = build_structures(STRUKTURE)
    for s in structures:
        print(s)

    num_matches = 0
    for w in words:
        for s in structures:
            m = s.match(w)
            if m is not None:
                num_matches += 1
                print(s.id, m)

    print("TIME", time.time() - t)
    print(num_matches)


if __name__ == '__main__':
    main()