"""Match syntactic structures, described in XML, against corpus words.

This part of the module holds the MSD lookup tables, the builders that turn
XML ``restriction`` elements into compiled regular expressions, and the
``Restriction`` / ``Component`` classes that apply them to words.
"""
from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict

from msd_translate import MSD_TRANSLATE

# Default input files used by ``main``.
STRUKTURE = "Kolokacije_strukture_09_new-system.xml"
STAVKI = "k2.xml"

# Maps the category / feature-value names found in restriction XML to the
# one-character codes used to build MSD regular expressions.
CODES = {
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    'common': 'c',
    'proper': 'p',
    'masculine': 'm',
    'feminine': 'f',
    'neuter': 'n',
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}

# For every category code: the attribute names in slot order.  The position
# of an attribute in the list selects which regex slot a restriction on that
# attribute overwrites (see ``build_morphology_regex``).
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number',
          'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}

# For every category code: the regex fragment used for each slot when the
# restriction does not constrain it ('.' = any one character, '.?' = an
# optional character).
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    # The first slot used to be '. ' (dot + space); that pattern requires a
    # literal space and so could never match a space-free tag whenever the
    # pronoun type was left unconstrained.
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?']
}


class RestrictionType(Enum):
    """Which attribute of a word a restriction is matched against."""
    Morphology = 0
    Lexis = 1


class ComponentLevel(Enum):
    """Which textual form of a matched word a component reports."""
    Lemma = 0
    WordForm = 1


def get_level(restriction):
    """Return the ``ComponentLevel`` declared by a restriction's features.

    ``restriction`` is an iterable of XML elements; the first one whose
    ``level`` attribute is ``"lemma"`` or ``"word_form"`` decides the result.

    Raises:
        RuntimeError: if no feature carries a recognised ``level`` value.
    """
    for feature in restriction:
        # ``get`` yields None when the attribute is absent, which simply
        # falls through both comparisons.
        lvl = feature.get("level")
        if lvl == "lemma":
            return ComponentLevel.Lemma
        if lvl == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")


def build_morphology_regex(restriction):
    """Compile a regex over MSD tags from a morphology restriction.

    ``restriction`` is an iterable of XML feature elements whose attributes
    are merged into one dictionary.  ``POS`` selects the category; every
    remaining attribute (except ``level``) overwrites the regex slot that
    ``TAGSET`` assigns to it.  A value containing ``|`` becomes a character
    class of the alternatives.

    Raises:
        ValueError: if no ``POS`` attribute is present, or an attribute is
            not listed in ``TAGSET`` for the category.
        KeyError: if the category or a feature value is missing from
            ``CODES``.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    if 'POS' not in restr_dict:
        raise ValueError("morphology restriction without a POS attribute")

    category = restr_dict.pop('POS').capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    # 'level' describes the output form, not the tag, so it is not a slot.
    restr_dict.pop('level', None)

    for attribute, value in restr_dict.items():
        # list.index raises ValueError for an unknown attribute.
        index = TAGSET[cat_code].index(attribute.lower())

        if '|' in value:
            match = '[' + "".join(CODES[val] for val in value.split('|')) + ']'
        else:
            match = CODES[value]

        # Slot 0 of ``rgx`` is the category letter, hence the offset.
        rgx[index + 1] = match

    return re.compile("".join(rgx))


def build_lexis_regex(restriction):
    """Compile the ``lemma`` attribute of a lexis restriction as a regex.

    NOTE(review): the pattern is later applied with ``re.match``, which only
    anchors at the start, so a lemma also matches longer lemmas it is a
    prefix of -- confirm this is intended.

    Raises:
        KeyError: if none of the features carries a ``lemma`` attribute.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    return re.compile(restr_dict['lemma'])


class Restriction:
    """A single compiled constraint on a word's MSD tag or lemma."""

    def __init__(self, restriction_tag):
        """Build the matcher from a ``restriction`` XML element.

        Raises:
            NotImplementedError: if the element's ``type`` attribute is
                neither ``"morphology"`` nor ``"lexis"``.
        """
        restriction_type = restriction_tag.get('type')
        # Element.getchildren() was removed in Python 3.9; list(element)
        # yields the same direct children.
        features = list(restriction_tag)

        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(features)
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(features)
        else:
            raise NotImplementedError()

    def match(self, word):
        """Return the regex match of ``word`` against this restriction.

        Morphology restrictions are applied to ``word.msd`` and lexis
        restrictions to ``word.lemma``.  The result is a match object, or
        ``None`` when the word does not satisfy the restriction.
        """
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher.match(match_to)

    def __str__(self):
        return "({:s} {})".format(self.type.name, self.matcher)

    def __repr__(self):
        return str(self)


class Component:
    """One node in a chain of components that make up a structure.

    A component holds a restriction (or a list of alternative restrictions)
    and, optionally, the next component together with the label of the link
    that leads to it.
    """

    def __init__(self, name):
        self.name = name if name is not None else ""
        self.restriction = None
        # Either None or a (next_component, link_label) tuple.
        self.next_element = None
        self.level = None

    def word_to_str(self, word):
        """Return ``(lemma, msd)`` or ``(text, msd)`` depending on the level.

        Raises:
            RuntimeError: if no level has been set on this component.
        """
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def has_next(self):
        """Return True when a following component has been attached."""
        return self.next_element is not None

    def get_next(self):
        """Return the following component (requires ``has_next()``)."""
        return self.next_element[0]

    def link_label(self):
        """Return the label of the link to the following component."""
        return self.next_element[1]

    def set_next(self, next_component, link_label):
        """Attach ``next_component``, reached through ``link_label``."""
        self.next_element = (next_component, link_label)

    def set_restriction(self, restrictions_tag):
        """Configure restriction and level from an XML element.

        A ``restriction`` element yields a single ``Restriction``; a
        ``restriction_or`` element yields a list of alternatives, all of
        which must declare the same level.

        Raises:
            ValueError: if the alternatives of a ``restriction_or`` do not
                share exactly one level.
            RuntimeError: if the element has any other tag.
        """
        if restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)

        elif restrictions_tag.tag == "restriction_or":
            alternatives = [Restriction(el) for el in restrictions_tag]

            # Only one level per component is supported for now, so every
            # alternative has to agree on it.
            levels = {get_level(el) for el in restrictions_tag}
            if len(levels) != 1:
                raise ValueError(
                    "restriction_or alternatives must share a single level")

            self.restriction = alternatives
            self.level = levels.pop()

        else:
            raise RuntimeError("Unreachable")

    def __str__(self):
        el = "(N.{:7s} {})".format(self.name, str(self.restriction))
        if self.has_next():
            el += "  -- {} -->\n{}".format(self.link_label(),
                                           str(self.get_next()))
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match ``word`` and, recursively, the rest of the chain.

        Returns a list with one ``word_to_str`` tuple per component in the
        chain when ``word`` satisfies this component and some word linked
        from it (through this component's link label) satisfies the rest;
        only the first such linked word is used.  Returns ``None`` otherwise.
        """
        if isinstance(self.restriction, list):
            # Alternatives: the first restriction that matches wins.
            matched = None
            for restr in self.restriction:
                matched = restr.match(word)
                if matched is not None:
                    break
        else:
            matched = self.restriction.match(word)

        if matched is None:
            return None

        to_ret = [self.word_to_str(word)]

        # End of the chain: everything has matched.
        if not self.has_next():
            return to_ret

        # Follow every link with the right label until one of them matches.
        next_component = self.get_next()
        for next_word in word.get_links(self.link_label()):
            rest = next_component.match(next_word)
            if rest is not None:
                to_ret.extend(rest)
                return to_ret

        return None
class SyntacticStructure:
    """A chain of components read from one ``syntactic_structure`` element."""

    def __init__(self):
        # Placeholder head; ``from_xml`` replaces it with the first real
        # component once the chain has been built.
        self.root_component = Component('root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` XML element.

        The element must have exactly two children (components and system);
        the system element must have type ``JOS`` and exactly two children
        (dependencies and restrictions).  Dependencies are followed from the
        id ``'root'`` to form a linear chain of components.

        Raises:
            ValueError: if the child counts are wrong, the system type is
                not ``JOS``, the dependencies form a cycle, or nothing
                depends on ``'root'``.
            KeyError: if a dependency points at a component id that has no
                name or no restriction.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        # Element.getchildren() was removed in Python 3.9; list(element)
        # yields the same direct children.
        components, system = list(xml)
        dependencies, restrictions = list(system)

        if system.get('type') != 'JOS':
            raise ValueError(
                "unsupported system type: {!r}".format(system.get('type')))

        deps = {
            dep.get('from'): (dep.get('to'), dep.get('label'))
            for dep in dependencies
        }
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        restrs = {r.get('cid'): r[0] for r in restrictions}

        current_component = st.root_component
        idx = 'root'
        visited = {idx}

        while idx in deps:
            idx, dep_label = deps[idx]
            # A dependency leading back to an already visited component
            # would otherwise keep this loop running forever.
            if idx in visited:
                raise ValueError(
                    "cyclic dependency at component {!r}".format(idx))
            visited.add(idx)

            next_component = Component(comps[idx])
            next_component.set_restriction(restrs[idx])

            current_component.set_next(next_component, dep_label)
            current_component = next_component

        if not st.root_component.has_next():
            raise ValueError("structure has no dependency from 'root'")

        # Drop the placeholder head.
        st.root_component = st.root_component.get_next()
        return st

    def __str__(self):
        return "{} LBS {}\n------\n{}".format(
            self.id, self.lbs, str(self.root_component))

    def match(self, word):
        """Match ``word`` against the chain; see ``Component.match``."""
        return self.root_component.match(word)


def build_structures(filename):
    """Parse ``filename`` and return one structure per
    ``syntactic_structure`` element found in it."""
    # Parsing straight from the file lets the XML parser pick the encoding
    # instead of depending on the locale's default text encoding.
    root = ElementTree.parse(filename).getroot()
    return [
        SyntacticStructure.from_xml(structure)
        for structure in root.iter('syntactic_structure')
    ]


class Word:
    """A corpus token with its outgoing, labelled links to other words."""

    def __init__(self, xml):
        """Read lemma, translated MSD, id and text from a ``w`` element.

        Raises:
            KeyError: if the element's ``msd`` value is not a key of
                ``MSD_TRANSLATE``.
            ValueError: if id, lemma or the translated MSD is ``None``.
        """
        self.lemma = xml.get('lemma')
        self.msd = MSD_TRANSLATE[xml.get('msd')]
        self.id = xml.get('id')
        self.text = xml.text
        # link label -> list of target Word objects
        self.links = defaultdict(list)

        if None in (self.id, self.lemma, self.msd):
            raise ValueError("word element is missing id, lemma or msd")

    def add_link(self, link, to):
        """Record a link labelled ``link`` from this word to ``to``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the targets linked with label ``link`` (possibly empty)."""
        # Indexing the defaultdict would store a new empty list for every
        # label that is merely looked up; ``get`` leaves the mapping alone.
        return self.links.get(link, [])


def load_corpus(filename):
    """Load all words of a corpus file and connect them by their links.

    The default namespace declaration and ``xml:`` attribute prefixes are
    stripped from the text before parsing so that elements and attributes
    can be addressed by their plain names.  Links whose ``from`` or ``dep``
    id does not belong to a loaded word are ignored.

    Returns:
        A list of ``Word`` objects.

    Raises:
        ValueError: if a ``link`` element lacks ``dep``, ``from`` or
            ``afun``.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    for link in et.iter("link"):
        source_id = link.get('from')
        target_id = link.get('dep')
        label = link.get('afun')
        if None in (source_id, target_id, label):
            raise ValueError("link element is missing dep, from or afun")

        if source_id in words and target_id in words:
            words[source_id].add_link(label, words[target_id])

    return list(words.values())


def main():
    """Load the default corpus and structures, then print every match
    followed by the elapsed time and the number of matches."""
    words = load_corpus(STAVKI)

    import time
    t = time.time()

    structures = build_structures(STRUKTURE)
    for s in structures:
        print(s)

    num_matches = 0
    for w in words:
        for s in structures:
            m = s.match(w)
            if m is not None:
                num_matches += 1
                print(s.id, m)

    print("TIME", time.time() - t)
    print(num_matches)


if __name__ == '__main__':
    main()