luscenje_struktur/wani.py

from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict

from msd_translate import MSD_TRANSLATE


# Default input files used by main(): STRUKTURE is the XML file handed to
# build_structures(), STAVKI is the corpus XML file handed to load_corpus().
STRUKTURE = "Kolokacije_strukture_09_new-system.xml"
STAVKI = "k2.xml"

# Lookup table from the names used in restriction definitions to
# single-character codes. build_morphology_regex() looks up the capitalized
# POS name to select a category and looks up attribute values to fill
# individual positions of the regular expression it builds.
# Several values share a letter (e.g. "proper", "plural" and "personal" are
# all "p"), so a code is only meaningful at the position it is placed in.
CODES = {
    # Part-of-speech categories (keys are capitalized).
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # Attribute values (keys are lowercase).
    'common': 'c',
    'proper': 'p',
    'masculine': 'm',
    'feminine': 'f',
    'neuter': 'n',
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}

# For every category code, the attribute names in positional order.
# build_morphology_regex() takes the index of a (lowercased) attribute name in
# this list, adds one for the leading category letter, and overwrites that
# position of the regex assembled from CATEGORY_BASES.
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}

# Default regex fragment for every attribute position of a category, in the
# same order as TAGSET. '.' accepts exactly one arbitrary character, '.?'
# accepts one optional character. build_morphology_regex() prefixes the
# category letter and replaces individual fragments with concrete codes.
#
# Every fragment must be exactly '.' or '.?': any extra character (such as a
# stray space) becomes a literal the tag has to contain and makes the
# unrestricted position impossible to match.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?']
}


class RestrictionType(Enum):
    """Kind of a restriction; selects what Restriction.match() tests."""
    Morphology = 0  # regex is applied to word.msd
    Lexis = 1  # regex is applied to word.lemma


class ComponentLevel(Enum):
    """Which form of a matched word Component.word_to_str() reports."""
    Lemma = 0  # report word.lemma
    WordForm = 1  # report word.text


def get_level(restriction):
    """Return the ComponentLevel declared by the features of a restriction.

    ``restriction`` is iterated; every item must offer ``keys()`` and
    ``get()``. The first item whose "level" value is "lemma" or "word_form"
    decides the result; items without a "level" key, or with any other value,
    are skipped.

    Raises:
        RuntimeError: if no item declares a recognised level.
    """
    for feature in restriction:
        if "level" not in feature.keys():
            continue

        level_name = feature.get("level")
        if level_name == "lemma":
            return ComponentLevel.Lemma
        if level_name == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")


def build_morphology_regex(restriction):
    """Compile the regex that a morphology restriction applies to a tag.

    The attributes of all features in ``restriction`` are merged into one
    mapping (later features win). "POS" selects the category; "level" is
    bookkeeping and is discarded. Every remaining attribute replaces the
    default fragment at its TAGSET position with the code of its value, or
    with a character class when the value lists alternatives separated by '|'.

    Raises:
        AssertionError: if no feature carries a "POS" attribute.
        KeyError: for a missing "level" attribute or an unknown name/value.
        ValueError: if an attribute does not belong to the category.
    """
    merged = {}
    for feature in restriction:
        merged.update(feature.items())

    assert 'POS' in merged
    pos_code = CODES[merged['POS'].capitalize()]

    # Slot 0 is the category letter, slots 1.. mirror TAGSET[pos_code].
    slots = [pos_code]
    slots.extend(CATEGORY_BASES[pos_code])

    for bookkeeping_key in ('POS', 'level'):
        del merged[bookkeeping_key]

    for attribute, value in merged.items():
        position = TAGSET[pos_code].index(attribute.lower()) + 1

        alternatives = value.split('|')
        if len(alternatives) > 1:
            fragment = '[' + "".join(CODES[alt] for alt in alternatives) + ']'
        else:
            fragment = CODES[value]

        slots[position] = fragment

    return re.compile("".join(slots))


def build_lexis_regex(restriction):
    """Compile the "lemma" attribute of a lexis restriction into a regex.

    The attributes of all features are merged (a later feature overrides an
    earlier one) and the merged "lemma" value is used as the pattern.

    Raises:
        KeyError: if no feature carries a "lemma" attribute.
    """
    attributes = {
        key: value
        for feature in restriction
        for key, value in feature.items()
    }
    lemma_pattern = attributes['lemma']
    return re.compile(lemma_pattern)


class Restriction:
    """A single compiled restriction of a component.

    Attributes:
        type: RestrictionType deciding which word attribute is tested.
        matcher: compiled regular expression applied to that attribute.
    """

    def __init__(self, restriction_tag):
        """Build the restriction from a ``<restriction>`` XML element.

        The element's "type" attribute must be "morphology" or "lexis"; its
        child elements are the features handed to the regex builder.

        Raises:
            NotImplementedError: for any other restriction type.
        """
        restriction_type = restriction_tag.get('type')
        # list(element) yields the child elements; Element.getchildren() was
        # removed from xml.etree.ElementTree in Python 3.9.
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(list(restriction_tag))
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError(
                "Unsupported restriction type: {!r}".format(restriction_type))

    def match(self, word):
        """Apply the regex to ``word.msd`` or ``word.lemma``.

        Returns the result of ``re.Pattern.match`` (anchored at the start of
        the string only): a match object, or None when the word does not fit.
        """
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher.match(match_to)

    def __str__(self):
        return "({:s} {})".format(self.type.name, self.matcher)

    def __repr__(self):
        return str(self)


class Component:
    """A node in the tree of components that forms one syntactic structure.

    A component holds a restriction (one ``Restriction`` or, for a
    ``restriction_or`` element, a list of them) and a list of
    ``(child_component, link_label)`` pairs. A word matches the component when
    it satisfies the restriction and, for every child, one of the words linked
    to it under that label matches the child.

    Iterating a component yields its ``(child_component, link_label)`` pairs.
    """

    def __init__(self, name, idx):
        assert(idx is not None)

        self.name = name if name is not None else ""  # for printing...
        self.idx = idx
        self.restriction = None
        self.next_element = []
        self.level = None

        # Cursor of the legacy __next__ protocol; loops do not depend on it.
        self.iter_ctr = 0

    def word_to_str(self, word):
        """Return ``(lemma, msd)`` or ``(text, msd)`` depending on the level."""
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def __iter__(self):
        """Return an independent iterator over the child pairs.

        Each call hands out a fresh list iterator: a cursor shared through
        ``self`` would be reset by a nested loop over the same component and
        end the outer loop early.
        """
        self.iter_ctr = 0  # keep the legacy cursor in its historical state
        return iter(self.next_element)

    def __next__(self):
        """Legacy cursor-based access, kept for direct ``next(component)``."""
        if self.iter_ctr < len(self.next_element):
            to_ret = self.next_element[self.iter_ctr]
            self.iter_ctr += 1
            return to_ret
        else:
            raise StopIteration

    def add_next(self, next_component, link_label):
        """Register a child component reachable over ``link_label``."""
        self.next_element.append((next_component, link_label))

    def set_restriction(self, restrictions_tag):
        """Set restriction and level from a restriction XML element.

        Accepts a ``<restriction>`` element or a ``<restriction_or>`` element
        whose children are restrictions; all alternatives must declare the
        same level.
        """
        if restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)

        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
            self.level = get_level(restrictions_tag[0])

            # same level for every restriction for now and only or available
            levels = [get_level(el) for el in restrictions_tag]
            assert(len(set(levels)) == 1)

        else:
            raise RuntimeError("Unreachable")

    def find_next(self, deps, comps, restrs):
        """Recursively attach every component that depends on this one.

        Args:
            deps: iterable of ``(from_idx, to_idx, label)`` triples.
            comps: mapping from component idx to its name.
            restrs: mapping from component idx to its restriction element.
        """
        for dep_from, idx, dep_label in deps:
            if dep_from != self.idx:
                continue

            next_component = Component(comps[idx], idx)
            next_component.set_restriction(restrs[idx])

            self.add_next(next_component, dep_label)
            next_component.find_next(deps, comps, restrs)

    def __str__(self):
        el = "({:10} {})".format(self.name, str(self.restriction))
        for child, link in self:
            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(child))
        return el

    def __repr__(self):
        return str(self)

    def _restriction_matches(self, word):
        """Tell whether ``word`` satisfies the restriction (any alternative)."""
        if isinstance(self.restriction, list):
            return any(restr.match(word) is not None
                       for restr in self.restriction)
        return self.restriction.match(word) is not None

    def match(self, word):
        """Match ``word`` and, recursively, its linked words.

        Returns a flat list of ``word_to_str`` tuples, this component first
        followed by the result of every child, or None when the word or any
        required child cannot be matched.
        """
        if not self._restriction_matches(word):
            return None

        to_ret = [self.word_to_str(word)]

        for child, link in self:
            # the first linked word that satisfies the child is taken
            for next_word in word.get_links(link):
                child_match = child.match(next_word)
                if child_match is not None:
                    to_ret.extend(child_match)
                    break

            # no linked word satisfied this child: the structure is not found
            else:
                return None

        return to_ret


class SyntacticStructure:
    """One syntactic structure: an id, an LBS value and a component tree.

    Attributes:
        root_component: first real component of the tree (after from_xml()).
        id: value of the element's "id" attribute.
        lbs: value of the element's "LBS" attribute.
    """

    def __init__(self):
        # Placeholder with idx 'root'; from_xml() replaces it with the first
        # component that depends on 'root'.
        self.root_component = Component("", 'root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``<syntactic_structure>`` element.

        The element must have exactly two children (components, system) and
        the system element exactly two children (dependencies, restrictions).
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        # list(element) yields the child elements; Element.getchildren() was
        # removed from xml.etree.ElementTree in Python 3.9.
        components, system = list(xml)
        dependencies, restrictions = list(system)

        assert(system.get('type') == 'JOS')

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'))
                for dep in dependencies]
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        # the first child of each entry is the restriction element itself
        restrs = {r.get('cid'): r[0] for r in restrictions}

        st.root_component.find_next(deps, comps, restrs)
        st.root_component = list(st.root_component)[0][0]  # get first next

        return st

    def __str__(self):
        arrow = "root       -- modra      --> "
        return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component))

    def match(self, word):
        """Match ``word`` against the component tree (see Component.match)."""
        return self.root_component.match(word)


def build_structures(filename):
    """Read an XML file and build one SyntacticStructure per definition.

    Every ``syntactic_structure`` element found anywhere in the document is
    converted with SyntacticStructure.from_xml(); the results are returned as
    a list in document order.
    """
    with open(filename, 'r') as fp:
        document = ElementTree.XML(fp.read())

    return [
        SyntacticStructure.from_xml(node)
        for node in document.iter('syntactic_structure')
    ]


class Word:
    """A corpus word together with its outgoing, labelled links.

    Attributes:
        lemma, id: the "lemma" and "id" attributes of the XML element.
        msd: the element's "msd" attribute looked up in MSD_TRANSLATE.
        text: text content of the element.
        links: mapping from link label to the list of linked words.
    """

    def __init__(self, xml):
        word_id = xml.get('id')
        lemma = xml.get('lemma')
        msd = MSD_TRANSLATE[xml.get('msd')]

        self.lemma = lemma
        self.msd = msd
        self.id = word_id
        self.text = xml.text
        self.links = defaultdict(list)

        assert all(value is not None for value in (word_id, lemma, msd))

    def add_link(self, link, to):
        """Record that ``to`` is reachable from this word over ``link``."""
        targets = self.links[link]
        targets.append(to)

    def get_links(self, link):
        """Return the words linked under ``link``.

        A label containing '|' denotes alternatives: on first use the lists of
        the individual labels are concatenated in the order given and stored
        under the combined label.
        """
        is_new_union = "|" in link and link not in self.links
        if is_new_union:
            combined = self.links[link]
            for label in link.split('|'):
                combined.extend(self.links[label])

        return self.links[link]


def load_corpus(filename):
    """Load the words of a corpus XML file and connect them by their links.

    The first default-namespace declaration and every ``xml:`` attribute
    prefix are stripped from the text so that elements and attributes can be
    addressed by their bare names. A ``Word`` is created for every ``w``
    element; every ``link`` element whose "from" and "dep" ids both refer to
    known words adds a link labelled with its "afun" value.

    Returns:
        list of all Word objects, in document order.
    """
    with open(filename, 'r') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    # Ids of words attached to a sentence root by a "modra" link.
    # NOTE(review): collected but not returned or used yet; it has to exist
    # so that the branch below does not raise NameError.
    root_words = set()

    for link in et.iter("link"):
        assert('dep' in link.keys() and 'from' in link.keys() and 'afun' in link.keys())

        lfrom = link.get('from')
        if lfrom in words:
            assert(not lfrom.endswith('.0'))
            next_word_id = link.get('dep')
            if next_word_id in words:
                words[lfrom].add_link(link.get('afun'), words[next_word_id])

        # catch modra links from root; endswith() also copes with an empty id
        elif lfrom.endswith('0') and link.get('afun') == 'modra':
            root_words.add(link.get('dep'))

        # any other link starts at an unknown id: skipped on purpose

    return list(words.values())


def main():
    """Load the corpus, print all structures and every match found.

    Prints each structure, then ``<structure id> <match>`` for every word and
    structure that match, followed by the elapsed time (corpus loading is not
    included) and the number of matches.
    """
    words = load_corpus(STAVKI)

    import time
    started_at = time.time()

    structures = build_structures(STRUKTURE)
    for structure in structures:
        print(structure)

    # Words form the outer loop, structures the inner one.
    attempts = (
        (structure, structure.match(word))
        for word in words
        for structure in structures
    )

    num_matches = 0
    for structure, found in attempts:
        if found is None:
            continue
        num_matches += 1
        print(structure.id, found)

    print("TIME", time.time() - started_at)
    print(num_matches)


if __name__ == '__main__':
    main()
First commit 2018-10-29 10:29:51 +00:00			`from xml.etree import ElementTree`
			`import re`
			`from enum import Enum`
			`from collections import defaultdict`

			`from msd_translate import MSD_TRANSLATE`


Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`STRUKTURE = "Kolokacije_strukture_09_new-system.xml"`
First commit 2018-10-29 10:29:51 +00:00			`STAVKI = "k2.xml"`

			`CODES = {`
			`"Noun": "N",`
			`"Verb": "V",`
			`"Adjective": "A",`
			`"Adverb": "R",`
			`"Pronoun": "P",`
			`"Numeral": "M",`
			`"Preposition": "S",`
			`"Conjunction": "C",`
			`"Particle": "Q",`
			`"Interjection": "I",`
			`"Abbreviation": "Y",`
			`"Residual": "X",`

			`'common': 'c',`
			`'proper': 'p',`
			`'masculine': 'm',`
			`'feminine': 'f',`
			`'neuter': 'n',`
			`"singular": "s",`
			`"dual": "d",`
			`"plural": "p",`
			`"nominative": "n",`
			`"genitive": "g",`
			`"dative": "d",`
			`"accusative": "a",`
			`"locative": "l",`
			`"instrumental": "i",`
			`"no": "n",`
			`"yes": "y",`
			`"main": "m",`
			`"auxiliary": "a",`
			`"perfective": "e",`
			`"progressive": "p",`
			`"biaspectual": "b",`
			`"infinitive": "n",`
			`"supine": "u",`
			`"participle": "p",`
			`"present": "r",`
			`"future": "f",`
			`"conditional": "c",`
			`"imperative": "m",`
			`"first": "1",`
			`"second": "2",`
			`"third": "3",`
			`"general": "g",`
			`"possessive": "s",`
			`"positive": "p",`
			`"comparative": "c",`
			`"superlative": "s",`
			`"personal": "p",`
			`"demonstrative": "d",`
			`"relative": "r",`
			`"reflexive": "x",`
			`"interrogative": "q",`
			`"indefinite": "i",`
			`"negative": "z",`
			`"bound": "b",`
			`"digit": "d",`
			`"roman": "r",`
			`"letter": "l",`
			`"cardinal": "c",`
			`"ordinal": "o",`
			`"pronominal": "p",`
			`"special": "s",`
			`"coordinating": "c",`
			`"subordinating": "s",`
			`"foreign": "f",`
			`"typo": "t",`
			`"program": "p",`
			`}`

			`TAGSET = {`
			`"N": ['type', 'gender', 'number', 'case', 'animate'],`
			`"V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],`
			`"A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],`
			`"R": ['type', 'degree'],`
			`"P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],`
			`"M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],`
			`"S": ['case'],`
			`"C": ['type'],`
			`"Q": [],`
			`"I": [],`
			`"Y": [],`
			`"X": ['type']`
			`}`

			`CATEGORY_BASES = {`
			`"N": ['.', '.', '.', '.', '.?'],`
			`"V": ['.', '.', '.', '.', '.?', '.?', '.?'],`
			`"A": ['.', '.', '.', '.', '.', '.?'],`
			`"R": ['.', '.?'],`
			`"P": ['. ', '.', '.', '.', '.', '.', '.?', '.?'],`
			`"M": ['.', '.', '.', '.?', '.?', '.?'],`
			`"S": ['.'],`
			`"C": ['.'],`
			`"Q": [],`
			`"I": [],`
			`"Y": [],`
			`"X": ['.?']`
			`}`


			`class RestrictionType(Enum):`
			`Morphology = 0`
			`Lexis = 1`


			`class ComponentLevel(Enum):`
			`Lemma = 0`
			`WordForm = 1`


			`def get_level(restriction):`
			`for feature in restriction:`
			`if "level" in feature.keys():`
			`lvl = feature.get("level")`
			`if lvl == "lemma":`
			`return ComponentLevel.Lemma`
			`elif lvl == "word_form":`
			`return ComponentLevel.WordForm`
			`else:`
			`continue`

			`raise RuntimeError("Unreachable!")`


			`def build_morphology_regex(restriction):`
			`restr_dict = {}`
			`for feature in restriction:`
			`restr_dict.update(feature.items())`

			`assert('POS' in restr_dict)`
			`category = restr_dict['POS'].capitalize()`
			`cat_code = CODES[category]`
			`rgx = [cat_code] + CATEGORY_BASES[cat_code]`

			`del restr_dict['POS']`
			`del restr_dict['level']`

			`for attribute, value in restr_dict.items():`
			`index = TAGSET[cat_code].index(attribute.lower())`
			`assert(index >= 0)`

			`if '\|' in value:`
			`match = '[' + "".join(CODES[val] for val in value.split('\|')) + ']'`
			`else:`
			`match = CODES[value]`

			`rgx[index + 1] = match`

			`return re.compile("".join(rgx))`


			`def build_lexis_regex(restriction):`
			`restr_dict = {}`
			`for feature in restriction:`
			`restr_dict.update(feature.items())`

			`return re.compile(restr_dict['lemma'])`


Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`class Restriction:`
			`def __init__(self, restriction_tag):`
			`restriction_type = restriction_tag.get('type')`
			`if restriction_type == "morphology":`
			`self.type = RestrictionType.Morphology`
			`self.matcher = build_morphology_regex(restriction_tag.getchildren())`
			`elif restriction_type == "lexis":`
			`self.type = RestrictionType.Lexis`
			`self.matcher = build_lexis_regex(restriction_tag.getchildren())`
			`else:`
			`raise NotImplementedError()`

			`def match(self, word):`
			`if self.type == RestrictionType.Morphology:`
			`match_to = word.msd`
			`elif self.type == RestrictionType.Lexis:`
			`match_to = word.lemma`
			`else:`
			`raise RuntimeError("Unreachable!")`

			`return self.matcher.match(match_to)`

			`def __str__(self):`
			`return "({:s} {})".format(str(self.type).split('.')[1], self.matcher)`

			`def __repr__(self):`
			`return str(self)`


First commit 2018-10-29 10:29:51 +00:00			`class Component:`
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`def __init__(self, name, idx):`
			`assert(idx is not None)`

			`self.name = name if name is not None else "" # for printing...`
			`self.idx = idx`
First commit 2018-10-29 10:29:51 +00:00			`self.restriction = None`
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`self.next_element = []`
First commit 2018-10-29 10:29:51 +00:00			`self.level = None`

Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`self.iter_ctr = 0`

First commit 2018-10-29 10:29:51 +00:00			`def word_to_str(self, word):`
			`if self.level == ComponentLevel.Lemma:`
			`return word.lemma, word.msd`
			`elif self.level == ComponentLevel.WordForm:`
			`return word.text, word.msd`
			`else:`
			`raise RuntimeError("Unreachable")`

Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`def __iter__(self):`
			`self.iter_ctr = 0`
			`return self`
First commit 2018-10-29 10:29:51 +00:00
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`def __next__(self):`
			`if self.iter_ctr < len(self.next_element):`
			`to_ret = self.next_element[self.iter_ctr]`
			`self.iter_ctr += 1`
			`return to_ret`
			`else:`
			`raise StopIteration`
First commit 2018-10-29 10:29:51 +00:00
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`def add_next(self, next_component, link_label):`
			`self.next_element.append((next_component, link_label))`
First commit 2018-10-29 10:29:51 +00:00
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`def set_restriction(self, restrictions_tag):`
			`if restrictions_tag.tag == "restriction":`
			`self.restriction = Restriction(restrictions_tag)`
			`self.level = get_level(restrictions_tag)`
First commit 2018-10-29 10:29:51 +00:00
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`elif restrictions_tag.tag == "restriction_or":`
			`self.restriction = [Restriction(el) for el in restrictions_tag]`
			`self.level = get_level(restrictions_tag[0])`

			`# same level for every restriction for now and only or available`
			`levels = [get_level(el) for el in restrictions_tag]`
			`assert(len(set(levels)) == 1)`

			`else:`
			`raise RuntimeError("Unreachable")`
First commit 2018-10-29 10:29:51 +00:00
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`def find_next(self, deps, comps, restrs):`
			`for d in deps:`
			`if d[0] == self.idx:`
			`_, idx, dep_label = d`

			`next_component = Component(comps[idx], idx)`
			`next_component.set_restriction(restrs[idx])`

			`self.add_next(next_component, dep_label)`
			`next_component.find_next(deps, comps, restrs)`

First commit 2018-10-29 10:29:51 +00:00			`def __str__(self):`
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`el = "({:10} {})".format(self.name, str(self.restriction))`
			`for next, link in self:`
			`el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(next))`
First commit 2018-10-29 10:29:51 +00:00			`return el`

			`def __repr__(self):`
			`return str(self)`

			`def match(self, word):`
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`matched = None`

			`# matching`
			`if type(self.restriction) is list:`
			`for restr in self.restriction:`
			`matched = restr.match(word)`
			`if matched is not None:`
			`break`
First commit 2018-10-29 10:29:51 +00:00			`else:`
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`matched = self.restriction.match(word)`
First commit 2018-10-29 10:29:51 +00:00
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`# recurse to next`
			`if matched:`
First commit 2018-10-29 10:29:51 +00:00			`to_ret = [self.word_to_str(word)]`

Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`for next, link in self:`
			`# need to get all links that match`
			`for next_word in word.get_links(link):`
			`match = next.match(next_word)`
			`# if matches, return`
			`if match is not None:`
			`to_ret.extend(match)`
			`break`

			`# if none matched, nothing found!`
			`else:`
			`return None`
First commit 2018-10-29 10:29:51 +00:00
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`return to_ret`
First commit 2018-10-29 10:29:51 +00:00
			`# return None...`


			`class SyntacticStructure:`
			`def __init__(self):`
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`self.root_component = Component("", 'root')`
First commit 2018-10-29 10:29:51 +00:00			`self.id = None`
			`self.lbs = None`

			`@staticmethod`
			`def from_xml(xml):`
			`st = SyntacticStructure()`
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`st.id = xml.get('id')`
First commit 2018-10-29 10:29:51 +00:00			`st.lbs = xml.get('LBS')`

			`components, system = xml.getchildren()`
			`dependencies, restrictions = system.getchildren()`

			`assert(system.get('type') == 'JOS')`

Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`deps = [ (dep.get('from'), dep.get('to'), dep.get('label')) for dep in dependencies ]`
First commit 2018-10-29 10:29:51 +00:00			`comps = { comp.get('cid'): comp.get('name') for comp in components }`
			`restrs = { r.get('cid'): r.getchildren()[0] for r in restrictions }`

Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`st.root_component.find_next(deps, comps, restrs)`
			`st.root_component = list(st.root_component)[0][0] # get first next`
First commit 2018-10-29 10:29:51 +00:00
			`return st`

			`def __str__(self):`
Moving from linkedlist of component to tree structure. 2018-10-30 12:33:08 +00:00			`arrow = "root -- modra --> "`
			`return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component))`
First commit 2018-10-29 10:29:51 +00:00
			`def match(self, word):`
			`return self.root_component.match(word)`


			`def build_structures(filename):`
			`structures = []`
			`with open(filename, 'r') as fp:`
			`et = ElementTree.XML(fp.read())`
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`for structure in et.iter('syntactic_structure'):`
First commit 2018-10-29 10:29:51 +00:00			`structures.append(SyntacticStructure.from_xml(structure))`
			`return structures`


			`class Word:`
			`def __init__(self, xml):`
			`self.lemma = xml.get('lemma')`
			`self.msd = MSD_TRANSLATE[xml.get('msd')]`
			`self.id = xml.get('id')`
			`self.text = xml.text`
			`self.links = defaultdict(list)`

			`assert(None not in (self.id, self.lemma, self.msd))`

			`def add_link(self, link, to):`
			`self.links[link].append(to)`

			`def get_links(self, link):`
Links with \| now parsed 2018-10-29 11:43:07 +00:00			`if link not in self.links and "\|" in link:`
			`for l in link.split('\|'):`
			`self.links[link].extend(self.links[l])`

First commit 2018-10-29 10:29:51 +00:00			`return self.links[link]`


			`def load_corpus(filename):`
			`with open(filename, 'r') as fp:`
			`xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)`
			`xmlstring = xmlstring.replace(' xml:', ' ')`
			`et = ElementTree.XML(xmlstring)`

			`words = {}`
			`for w in et.iter("w"):`
			`words[w.get('id')] = Word(w)`

			`for l in et.iter("link"):`
			`assert('dep' in l.keys() and 'from' in l.keys() and 'afun' in l.keys())`

			`lfrom = l.get('from')`
			`if lfrom in words:`
Catching modra links from root 2019-01-08 18:37:28 +00:00			`assert(not lfrom.endswith('.0'))`
First commit 2018-10-29 10:29:51 +00:00			`next_word_id = l.get('dep')`
			`if next_word_id in words:`
			`next_word = words[next_word_id]`
			`words[l.get('from')].add_link(l.get('afun'), next_word)`

Catching modra links from root 2019-01-08 18:37:28 +00:00			`# catch modra links from root`
			`elif lfrom[-1] == '0' and l.get('afun') == 'modra':`
			`root_words.add(l.get('dep'))`
			`pass`
			`else:`
			`# strange errors, just skip...`
			`pass`
First commit 2018-10-29 10:29:51 +00:00			`return list(words.values())`


			`def main():`
			`words = load_corpus(STAVKI)`

			`import time`
			`t = time.time()`

			`structures = build_structures(STRUKTURE)`
			`for s in structures:`
			`print(s)`

			`num_matches = 0`
			`for w in words:`
			`for s in structures:`
			`m = s.match(w)`
			`if m is not None:`
			`num_matches += 1`
			`print(s.id, m)`

			`print("TIME", time.time() - t)`
Two fixes, "10-1"-like structures and restriction_or 2018-10-29 11:16:42 +00:00			`print(num_matches)`
First commit 2018-10-29 10:29:51 +00:00

			`if __name__ == '__main__':`
			`main()`