First commit

2018-10-29 11:29:51 +01:00
parent 4604ac1878
commit 74a1e4834b
2 changed files with 2275 additions and 0 deletions
@@ -0,0 +1,372 @@
+from xml.etree import ElementTree
+import re
+from enum import Enum
+from collections import defaultdict
+
+from msd_translate import MSD_TRANSLATE
+
+
+# Input files read by main(): STRUKTURE is handed to build_structures()
+# (syntactic structure definitions), STAVKI to load_corpus() (the corpus).
+STRUKTURE = "Kolokacije_strukture_08_new-system.xml"
+STAVKI = "k2.xml"
+
+# Lookup table from the names used in the structure definitions to the
+# single-character codes used in MSD strings.  The same table serves both
+# category names (looked up capitalised) and attribute values, so several
+# values deliberately share a letter; the position in the MSD string
+# disambiguates them.
+CODES = {
+    # Categories (part of speech) -> first character of the MSD.
+    "Noun": "N",
+    "Verb": "V",
+    "Adjective": "A",
+    "Adverb": "R",
+    "Pronoun": "P",
+    "Numeral": "M",
+    "Preposition": "S",
+    "Conjunction": "C",
+    "Particle": "Q",
+    "Interjection": "I",
+    "Abbreviation": "Y",
+    "Residual": "X",
+
+    # Attribute values -> character placed in the attribute's MSD slot.
+    "common": "c",
+    "proper": "p",
+    "masculine": "m",
+    "feminine": "f",
+    "neuter": "n",
+    "singular": "s",
+    "dual": "d",
+    "plural": "p",
+    "nominative": "n",
+    "genitive": "g",
+    "dative": "d",
+    "accusative": "a",
+    "locative": "l",
+    "instrumental": "i",
+    "no": "n",
+    "yes": "y",
+    "main": "m",
+    "auxiliary": "a",
+    "perfective": "e",
+    "progressive": "p",
+    "biaspectual": "b",
+    "infinitive": "n",
+    "supine": "u",
+    "participle": "p",
+    "present": "r",
+    "future": "f",
+    "conditional": "c",
+    "imperative": "m",
+    "first": "1",
+    "second": "2",
+    "third": "3",
+    "general": "g",
+    "possessive": "s",
+    "positive": "p",
+    "comparative": "c",
+    "superlative": "s",
+    "personal": "p",
+    "demonstrative": "d",
+    "relative": "r",
+    "reflexive": "x",
+    "interrogative": "q",
+    "indefinite": "i",
+    "negative": "z",
+    "bound": "b",
+    "digit": "d",
+    "roman": "r",
+    "letter": "l",
+    "cardinal": "c",
+    "ordinal": "o",
+    "pronominal": "p",
+    "special": "s",
+    "coordinating": "c",
+    "subordinating": "s",
+    "foreign": "f",
+    "typo": "t",
+    "program": "p",
+}
+
+# For every category code, the attribute names in the order in which they
+# appear in the MSD string: the attribute at list index i occupies MSD
+# position i + 1 (position 0 is the category code itself).
+TAGSET = {
+    "N": ["type", "gender", "number", "case", "animate"],
+    "V": ["type", "aspect", "vform", "person", "number", "gender", "negative"],
+    "A": ["type", "degree", "gender", "number", "case", "definiteness"],
+    "R": ["type", "degree"],
+    "P": ["type", "person", "gender", "number", "case", "owner_number", "owned_gender", "clitic"],
+    "M": ["form", "type", "gender", "number", "case", "definiteness"],
+    "S": ["case"],
+    "C": ["type"],
+    "Q": [],
+    "I": [],
+    "Y": [],
+    "X": ["type"],
+}
+
+# Default regex fragment for every MSD slot of a category, parallel to TAGSET:
+# "." requires some character in the slot, ".?" lets the slot be absent.
+# build_morphology_regex() copies the list for a category and overwrites the
+# slots for which the structure definition specifies a value.
+CATEGORY_BASES = {
+    "N": ['.', '.', '.', '.', '.?'],
+    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
+    "A": ['.', '.', '.', '.', '.', '.?'],
+    "R": ['.', '.?'],
+    # The first slot used to be '. ' (with a trailing space), which put a
+    # literal space into every pronoun pattern that did not override "type".
+    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
+    "M": ['.', '.', '.', '.?', '.?', '.?'],
+    "S": ['.'],
+    "C": ['.'],
+    "Q": [],
+    "I": [],
+    "Y": [],
+    "X": ['.?']
+}
+
+
+class RestrictionType(Enum):
+    """Kind of restriction attached to a component.
+
+    Component.match() tests a Morphology restriction against the word's MSD
+    and a Lexis restriction against the word's lemma.
+    """
+    Morphology = 0
+    Lexis = 1
+
+
+class ComponentLevel(Enum):
+    """Which representation of a matched word a component reports.
+
+    Component.word_to_str() returns the word's lemma for Lemma and the word's
+    text for WordForm (each paired with the MSD).
+    """
+    Lemma = 0
+    WordForm = 1
+
+
+def get_level(restriction):
+    """Return the ComponentLevel declared by the features of a restriction.
+
+    ``restriction`` is an iterable of feature elements; the first one whose
+    ``level`` attribute is "lemma" or "word_form" decides the result.
+    Features without a ``level`` attribute, or with any other value, are
+    skipped.
+
+    Raises RuntimeError when no feature declares a recognised level.
+    """
+    known_levels = {
+        "lemma": ComponentLevel.Lemma,
+        "word_form": ComponentLevel.WordForm,
+    }
+
+    for feature in restriction:
+        # A missing attribute yields None, which is not a key of the table.
+        level = known_levels.get(feature.get("level"))
+        if level is not None:
+            return level
+
+    raise RuntimeError("Unreachable!")
+
+
+def build_morphology_regex(restriction):
+    """Compile the regex that a morphology restriction imposes on an MSD.
+
+    The attributes of all feature elements in ``restriction`` are merged into
+    one mapping (later features override earlier ones).  ``POS`` selects the
+    category, whose code becomes the first character of the pattern and whose
+    CATEGORY_BASES entry supplies the default pattern for every slot.  Each
+    remaining attribute (``level`` excluded) overwrites the slot given by its
+    position in TAGSET with the code of its value; alternatives separated by
+    ``|`` become a character class.
+
+    Raises AssertionError when ``POS`` is missing, KeyError for an unknown
+    category/value or a missing ``level``, and ValueError for an attribute
+    the category does not have.
+    """
+    attributes = {}
+    for feature in restriction:
+        attributes.update(feature.items())
+
+    assert 'POS' in attributes
+    cat_code = CODES[attributes.pop('POS').capitalize()]
+    pattern = [cat_code] + CATEGORY_BASES[cat_code]
+
+    # "level" says what to report for a match, it is not an MSD attribute.
+    del attributes['level']
+
+    slot_names = TAGSET[cat_code]
+    for attribute, value in attributes.items():
+        # +1 because slot 0 of the pattern holds the category code.
+        slot = slot_names.index(attribute.lower()) + 1
+
+        letters = [CODES[alternative] for alternative in value.split('|')]
+        if len(letters) == 1:
+            pattern[slot] = letters[0]
+        else:
+            pattern[slot] = '[' + "".join(letters) + ']'
+
+    return re.compile("".join(pattern))
+
+
+def build_lexis_regex(restriction):
+    """Compile the ``lemma`` attribute of a lexis restriction into a regex.
+
+    The attributes of all feature elements in ``restriction`` are merged,
+    later features overriding earlier ones; the merged ``lemma`` value is
+    used verbatim as the pattern.  Raises KeyError when no feature carries a
+    ``lemma`` attribute.
+    """
+    merged = dict(
+        pair
+        for feature in restriction
+        for pair in feature.items()
+    )
+    return re.compile(merged['lemma'])
+
+
+class Component:
+    """One node in the chain of components of a syntactic structure.
+
+    A component holds a compiled regular expression (``restriction``) that is
+    tested against a word's MSD or lemma, the level at which a matched word is
+    reported, and optionally the next component together with the label of
+    the link that leads to it.
+    """
+
+    def __init__(self, name):
+        """Create a component without restriction or successor.
+
+        ``name`` may be None, in which case the empty string is stored.
+        """
+        self.name = name if name is not None else ""
+        self.restriction_type = None
+        self.restriction = None
+        # Either None or a (next_component, link_label) tuple.
+        self.next_element = None
+        self.level = None
+
+    def word_to_str(self, word):
+        """Return ``(lemma, msd)`` or ``(text, msd)`` of ``word``, by level.
+
+        Raises RuntimeError when no level has been set.
+        """
+        if self.level == ComponentLevel.Lemma:
+            return word.lemma, word.msd
+        elif self.level == ComponentLevel.WordForm:
+            return word.text, word.msd
+        else:
+            raise RuntimeError("Unreachable")
+
+    def has_next(self):
+        """Return True when a following component has been linked."""
+        return self.next_element is not None
+
+    def get_next(self):
+        """Return the following component; only valid when has_next()."""
+        return self.next_element[0]
+
+    def link_label(self):
+        """Return the label of the link to the following component."""
+        return self.next_element[1]
+
+    def set_next(self, next_component, link_label):
+        """Link ``next_component`` as successor, reached via ``link_label``."""
+        self.next_element = (next_component, link_label)
+
+    def set_restriction(self, restriction_tag):
+        """Configure restriction type, regex and level from a restriction element.
+
+        The element's ``type`` attribute must be "morphology" or "lexis";
+        anything else raises NotImplementedError.
+        """
+        restriction_type = restriction_tag.get('type')
+        # list(element) yields the child elements; Element.getchildren() was
+        # removed from xml.etree in Python 3.9.
+        features = list(restriction_tag)
+
+        if restriction_type == "morphology":
+            self.restriction_type = RestrictionType.Morphology
+            self.restriction = build_morphology_regex(features)
+        elif restriction_type == "lexis":
+            self.restriction_type = RestrictionType.Lexis
+            self.restriction = build_lexis_regex(features)
+        else:
+            raise NotImplementedError()
+
+        self.level = get_level(features)
+
+    def __str__(self):
+        # A component whose restriction has not been set yet has no
+        # restriction type; print "None" for it instead of failing.
+        if self.restriction_type is None:
+            type_name = "None"
+        else:
+            type_name = self.restriction_type.name
+
+        el = "(N.{:7s} {:12s} {})".format(self.name, type_name, self.restriction)
+        if self.has_next():
+            el += "  -- {} -->\n{}".format(self.link_label(), str(self.get_next()))
+        return el
+
+    def __repr__(self):
+        return str(self)
+
+    def match(self, word):
+        """Match ``word`` and, through its links, the rest of the chain.
+
+        Returns a list with one word_to_str() tuple per component of the
+        chain, starting with this one, or None when this component or every
+        candidate for the following component fails to match.  Of several
+        linked words that match, only the first one is used.
+        """
+        if self.restriction_type == RestrictionType.Morphology:
+            match_to = word.msd
+        elif self.restriction_type == RestrictionType.Lexis:
+            match_to = word.lemma
+        else:
+            raise RuntimeError("Unreachable!")
+
+        if self.restriction.match(match_to):
+            to_ret = [self.word_to_str(word)]
+
+            # already matched everything!
+            if not self.has_next():
+                return to_ret
+
+            # try every word reachable over the required link label
+            for next_word in word.get_links(self.link_label()):
+                match = self.get_next().match(next_word)
+                # the first candidate that matches the rest of the chain wins
+                if match is not None:
+                    to_ret.extend(match)
+                    return to_ret
+
+        # no match
+        return None
+
+
+class SyntacticStructure:
+    """A syntactic structure: an id, an LBS value and a chain of components."""
+
+    def __init__(self):
+        """Create an empty structure with a placeholder root component."""
+        self.root_component = Component('root')
+        self.id = None
+        self.lbs = None
+
+    @staticmethod
+    def from_xml(xml):
+        """Build a structure from a ``syntactic_structure`` element.
+
+        The element must have exactly two children (components and system),
+        and the system exactly two (dependencies and restrictions); its
+        ``type`` must be "JOS".  Starting at the dependency whose ``from`` is
+        "root", dependencies are followed one after another and turned into a
+        linked chain of Component objects.
+        """
+        st = SyntacticStructure()
+        st.id = int(xml.get('id'))
+        st.lbs = xml.get('LBS')
+
+        # Elements are iterated/indexed directly: Element.getchildren() was
+        # removed from xml.etree in Python 3.9.
+        components, system = list(xml)
+        dependencies, restrictions = list(system)
+
+        assert(system.get('type') == 'JOS')
+
+        deps = {dep.get('from'): (dep.get('to'), dep.get('label')) for dep in dependencies}
+        comps = {comp.get('cid'): comp.get('name') for comp in components}
+        # Only the first child of each restriction entry is used.
+        restrs = {r.get('cid'): r[0] for r in restrictions}
+
+        current_component = st.root_component
+        idx = 'root'
+
+        while idx in deps:
+            idx, dep_label = deps[idx]
+
+            next_component = Component(comps[idx])
+            next_component.set_restriction(restrs[idx])
+
+            current_component.set_next(next_component, dep_label)
+            current_component = next_component
+
+        # Drop the placeholder: the first real component becomes the root.
+        st.root_component = st.root_component.get_next()
+        return st
+
+    def __str__(self):
+        return "{} LBS {}\n------\n{}".format(self.id, self.lbs, str(self.root_component))
+
+    def match(self, word):
+        """Match the component chain starting at ``word``; see Component.match."""
+        return self.root_component.match(word)
+
+
+def build_structures(filename):
+    """Parse ``filename`` and return its syntactic structures as a list.
+
+    One SyntacticStructure is built for every ``syntactic_structure`` element
+    that is a direct child of the document root.
+    """
+    # Let the XML parser read the file itself, so the encoding declared in the
+    # document is honoured instead of the platform's default text encoding.
+    root = ElementTree.parse(filename).getroot()
+    return [
+        SyntacticStructure.from_xml(structure)
+        for structure in root.iterfind('syntactic_structure')
+    ]
+
+
+class Word:
+    """A corpus token together with its outgoing, labelled links."""
+
+    def __init__(self, xml):
+        """Read lemma, MSD, id and text from a ``w`` element.
+
+        The ``msd`` attribute is mapped through MSD_TRANSLATE, so an MSD that
+        the table does not contain raises KeyError.
+        """
+        self.lemma = xml.get('lemma')
+        self.msd = MSD_TRANSLATE[xml.get('msd')]
+        self.id = xml.get('id')
+        self.text = xml.text
+        # link label -> list of Word objects reached over that label
+        self.links = defaultdict(list)
+
+        assert(None not in (self.id, self.lemma, self.msd))
+
+    def add_link(self, link, to):
+        """Record that ``to`` is reachable from this word over label ``link``."""
+        self.links[link].append(to)
+
+    def get_links(self, link):
+        """Return the words linked over label ``link`` (empty list if none)."""
+        # .get() instead of indexing: indexing a defaultdict would store a new
+        # empty list in the word for every label that is merely queried.
+        return self.links.get(link, [])
+
+
+def load_corpus(filename):
+    """Load the words of the corpus in ``filename`` and connect their links.
+
+    Before parsing, the first default namespace declaration is removed and
+    every `` xml:`` attribute prefix is stripped, so elements and attributes
+    can be addressed by their plain names.  Every ``w`` element becomes a
+    Word keyed by its id; every ``link`` element whose ``from`` and ``dep``
+    both name known words adds a link, labelled with ``afun``, from the
+    former to the latter.  Returns the list of all words.
+    """
+    with open(filename, 'r') as fp:
+        raw = fp.read()
+
+    raw = re.sub(' xmlns="[^"]+"', '', raw, count=1)
+    raw = raw.replace(' xml:', ' ')
+    et = ElementTree.XML(raw)
+
+    words = {w.get('id'): Word(w) for w in et.iter("w")}
+
+    for link in et.iter("link"):
+        attribute_names = link.keys()
+        assert all(name in attribute_names for name in ('dep', 'from', 'afun'))
+
+        source = words.get(link.get('from'))
+        if source is None:
+            continue
+
+        target = words.get(link.get('dep'))
+        if target is None:
+            continue
+
+        source.add_link(link.get('afun'), target)
+
+    return list(words.values())
+
+
+def main():
+    """Load the corpus, build the structures and print every structure.
+
+    NOTE(review): the unconditional ``exit(0)`` below ends the program right
+    after the structures are printed, so the matching loop and the timing
+    output that follow it are currently never executed.
+    """
+    corpus_words = load_corpus(STAVKI)
+
+    import time
+    started = time.time()
+
+    structures = build_structures(STRUKTURE)
+    for structure in structures:
+        print(structure)
+    exit(0)
+
+    print(STAVKI)
+
+    num_matches = 0
+    for word in corpus_words:
+        for structure in structures:
+            matched = structure.match(word)
+            if matched is not None:
+                num_matches += 1
+                print(structure.id, matched)
+
+    print("TIME", time.time() - started)
+    # print(num_matches)
+
+
+# Run main() only when the file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
+
+
+
+