from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict
import sys

from msd_translate import MSD_TRANSLATE


# Path of the corpus XML file, read from the first command-line argument;
# main() passes it to load_corpus().
STAVKI = sys.argv[1]
# Path of the syntactic-structure definitions XML, read from the second
# command-line argument; main() passes it to build_structures().
# NOTE: both lookups run at import time and raise IndexError when the
# arguments are missing.
STRUKTURE = sys.argv[2] # "Kolokacije_strukture_09_new-system.xml"

# Maps the names used in structure restrictions to the single-character codes
# that make up an MSD tag; build_morphology_regex() looks values up here.
CODES = {
    # Categories (looked up via the capitalised "POS" value); the code becomes
    # the first character of the generated pattern.
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # Attribute values. The same code letter may appear under several names;
    # which attribute a value belongs to is decided by its position (TAGSET).
    "common": "c",
    "proper": "p",
    "masculine": "m",
    "feminine": "f",
    "neuter": "n",
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}

# For every category code: the attribute names in the order in which their
# values appear after the category character. build_morphology_regex() uses
# the position of an attribute in this list to decide which slot of the
# pattern to overwrite.
TAGSET = {
    "N": "type gender number case animate".split(),
    "V": "type aspect vform person number gender negative".split(),
    "A": "type degree gender number case definiteness".split(),
    "R": "type degree".split(),
    "P": "type person gender number case owner_number owned_gender clitic".split(),
    "M": "form type gender number case definiteness".split(),
    "S": "case".split(),
    "C": "type".split(),
    "Q": [],
    "I": [],
    "Y": [],
    "X": "type".split(),
}

# Default regex fragment for every attribute position of a category, in the
# same order as TAGSET. '.' accepts any single character at that position,
# '.?' additionally allows the position to be absent. build_morphology_regex()
# copies the list and overwrites the positions that a restriction constrains.
#
# Every fragment must be exactly '.' or '.?': the patterns are compiled
# without re.VERBOSE, so a stray space would demand a literal space in the tag.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?']
}


class RestrictionType(Enum):
    """Kind of test a Restriction applies to a word."""
    # Restriction.match() tests the pattern against word.msd.
    Morphology = 0
    # Restriction.match() tests the pattern against word.lemma.
    Lexis = 1


class ComponentLevel(Enum):
    """Which form of a matched word a Component reports."""
    # Component.word_to_str() returns (word.lemma, word.msd).
    Lemma = 0
    # Component.word_to_str() returns (word.text, word.msd).
    WordForm = 1


def get_level(restriction):
    """Return the ComponentLevel declared by the features of *restriction*.

    The features are scanned in order; the first one whose "level" attribute
    is "lemma" or "word_form" decides the result. Features without a "level"
    attribute, or with any other value, are skipped.

    Raises:
        RuntimeError: if no feature declares a recognised level.
    """
    for feature in restriction:
        declared = feature.get("level") if "level" in feature.keys() else None
        if declared == "lemma":
            return ComponentLevel.Lemma
        if declared == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")


def build_morphology_regex(restriction):
    """Compile a regex over MSD tags from the features of a morphology restriction.

    The attributes of all features are merged (later features override earlier
    ones). "POS" selects the category; the pattern starts with the category
    code followed by the category's default fragments from CATEGORY_BASES.
    Every remaining attribute except "level" replaces the fragment at its
    TAGSET position with the code of its value, or with a character class
    when the value lists several alternatives separated by '|'.

    Raises:
        KeyError: for an unknown category or value, or a missing "level".
        ValueError: for an attribute the category does not define.
    """
    attributes = {}
    for feature in restriction:
        attributes.update(feature.items())

    assert 'POS' in attributes
    cat_code = CODES[attributes.pop('POS').capitalize()]
    # Concatenation builds a fresh list, so the shared defaults stay untouched.
    slots = [cat_code] + CATEGORY_BASES[cat_code]

    del attributes['level']

    for attribute, value in attributes.items():
        position = TAGSET[cat_code].index(attribute.lower())
        assert position >= 0

        alternatives = value.split('|')
        if len(alternatives) > 1:
            fragment = '[' + "".join(CODES[alt] for alt in alternatives) + ']'
        else:
            fragment = CODES[value]

        # Slot 0 holds the category code, attributes start at slot 1.
        slots[position + 1] = fragment

    return re.compile("".join(slots))


def build_lexis_regex(restriction):
    """Compile the "lemma" attribute of a lexis restriction into a regex.

    The attributes of all features are merged, later features overriding
    earlier ones, and the resulting "lemma" value is used as the pattern.

    Raises:
        KeyError: if none of the features carries a "lemma" attribute.
    """
    merged = dict(pair for feature in restriction for pair in feature.items())
    lemma_pattern = merged['lemma']
    return re.compile(lemma_pattern)


class Restriction:
    """A single compiled condition that a word has to satisfy.

    Built from a restriction element whose "type" attribute is either
    "morphology" (pattern over the word's MSD tag) or "lexis" (pattern over
    the word's lemma).
    """

    def __init__(self, restriction_tag):
        """Compile *restriction_tag* into a matcher.

        Raises:
            NotImplementedError: for any other restriction type.
        """
        kind = restriction_tag.get('type')
        if kind not in ("morphology", "lexis"):
            raise NotImplementedError()

        if kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(list(restriction_tag))
        else:
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(list(restriction_tag))

    def match(self, word):
        """Return the regex match of *word* against this restriction, or None.

        The pattern is applied with ``re.match`` semantics, i.e. anchored at
        the start of the MSD tag or lemma only.
        """
        if self.type == RestrictionType.Lexis:
            subject = word.lemma
        elif self.type == RestrictionType.Morphology:
            subject = word.msd
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher.match(subject)

    def __str__(self):
        """Render as ``(<type name> <matcher>)``."""
        return "({:s} {})".format(self.type.name, self.matcher)

    def __repr__(self):
        return self.__str__()


class Component:
    """A node in the tree that describes one syntactic structure.

    A component carries a restriction (or a list of alternative restrictions)
    that a word has to satisfy, plus a list of ``(child_component, link_label)``
    pairs that have to be satisfied by words linked from that word. Iterating
    over a component yields those pairs.
    """

    def __init__(self, name, idx):
        """Create a component with no restriction and no children.

        Args:
            name: display name; ``None`` is stored as ``""``.
            idx: component id used to look up outgoing dependencies;
                must not be ``None``.
        """
        assert(idx is not None)

        self.name = name if name is not None else ""  # for printing...
        self.idx = idx
        self.restriction = None
        self.next_element = []
        self.level = None

        # Cursor used only by the legacy __next__ protocol below.
        self.iter_ctr = 0

    def word_to_str(self, word):
        """Return ``(lemma, msd)`` or ``(text, msd)`` depending on the level.

        Raises:
            RuntimeError: if no level has been set yet.
        """
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def __iter__(self):
        """Return a fresh iterator over ``(child_component, link_label)`` pairs.

        Every call produces an independent iterator, so nested or overlapping
        loops over the same component each see all children. (Returning
        ``self`` with a shared cursor made an inner loop exhaust the outer one.)
        """
        self.iter_ctr = 0  # keep the legacy cursor in its historical state
        return iter(self.next_element)

    def __next__(self):
        """Legacy cursor-based step, kept for code that calls next(component)."""
        if self.iter_ctr < len(self.next_element):
            to_ret = self.next_element[self.iter_ctr]
            self.iter_ctr += 1
            return to_ret
        else:
            raise StopIteration

    def add_next(self, next_component, link_label):
        """Append a child that must be reachable through *link_label*."""
        self.next_element.append((next_component, link_label))

    def set_restriction(self, restrictions_tag):
        """Set restriction(s) and level from a restriction element.

        A "restriction" element yields a single Restriction; a
        "restriction_or" element yields a list of alternatives, all of which
        must declare the same level.

        Raises:
            RuntimeError: for any other element tag.
        """
        if restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)

        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
            self.level = get_level(restrictions_tag[0])

            # same level for every restriction for now and only or available
            levels = [get_level(el) for el in restrictions_tag]
            assert(len(set(levels)) == 1)

        else:
            raise RuntimeError("Unreachable")

    def find_next(self, deps, comps, restrs):
        """Recursively attach all components that depend on this one.

        Args:
            deps: iterable of ``(from_id, to_id, label)`` triples.
            comps: mapping of component id to component name.
            restrs: mapping of component id to its restriction element.

        NOTE: there is no cycle detection; cyclic dependencies recurse forever.
        """
        for dep in deps:
            if dep[0] != self.idx:
                continue
            _, child_idx, dep_label = dep

            child = Component(comps[child_idx], child_idx)
            child.set_restriction(restrs[child_idx])

            self.add_next(child, dep_label)
            child.find_next(deps, comps, restrs)

    def __str__(self):
        """Render this component and, one per line, every outgoing link."""
        el = "({:10} {})".format(self.name, str(self.restriction))
        for child, link in self:
            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(child))
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match *word* and, recursively, the words linked from it.

        Returns:
            A flat list with this word's ``word_to_str`` tuple followed by the
            tuples of the matched descendants, or ``None`` when the word fails
            the restriction or some child has no matching linked word. For
            each child only the first matching linked word is used.
        """
        # With alternatives, one satisfied restriction is enough.
        if isinstance(self.restriction, list):
            satisfied = any(restr.match(word) is not None
                            for restr in self.restriction)
        else:
            satisfied = self.restriction.match(word) is not None

        if not satisfied:
            return None

        to_ret = [self.word_to_str(word)]

        for child, link in self:
            # need to get all links that match
            for linked_word in word.get_links(link):
                child_match = child.match(linked_word)
                # if matches, take it and move on to the next child
                if child_match is not None:
                    to_ret.extend(child_match)
                    break

            # if none matched, nothing found!
            else:
                return None

        return to_ret


class SyntacticStructure:
    """One syntactic structure: an id, an LBS label and a tree of components."""

    def __init__(self):
        # Placeholder with id 'root'; from_xml() replaces it with the first
        # component that depends on 'root'.
        self.root_component = Component("", 'root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` element.

        The element must have exactly two children (components, system) and
        the system exactly two children (dependencies, restrictions); the
        system type must be 'JOS'.
        """
        structure = SyntacticStructure()
        structure.id = xml.get('id')
        structure.lbs = xml.get('LBS')

        components, system = list(xml)
        dependencies, restrictions = list(system)

        assert system.get('type') == 'JOS'

        deps = []
        for dep in dependencies:
            deps.append((dep.get('from'), dep.get('to'), dep.get('label')))

        comps = {}
        for comp in components:
            comps[comp.get('cid')] = comp.get('name')

        restrs = {}
        for restriction in restrictions:
            # the first child is the restriction / restriction_or element
            restrs[restriction.get('cid')] = next(iter(restriction))

        structure.root_component.find_next(deps, comps, restrs)

        # Drop the placeholder: keep the first component linked from 'root'.
        children = list(structure.root_component)
        structure.root_component = children[0][0]

        return structure

    def __str__(self):
        """Render id, LBS and the component tree hanging off the root link."""
        header = "{} LBS {}".format(self.id, self.lbs)
        arrow = "root       -- modra      --> "
        return header + "\n------\n" + arrow + str(self.root_component)

    def match(self, word):
        """Delegate matching of *word* to the root component."""
        root = self.root_component
        return root.match(word)


def build_structures(filename):
    """Load every ``syntactic_structure`` element of an XML file.

    The file is handed to the XML parser directly rather than being decoded
    in text mode first, so the encoding declared in the document (UTF-8 by
    default) is used instead of the platform's locale encoding.

    Args:
        filename: path of the structure definitions XML.

    Returns:
        A list of SyntacticStructure objects in document order.
    """
    root = ElementTree.parse(filename).getroot()
    return [SyntacticStructure.from_xml(structure)
            for structure in root.iter('syntactic_structure')]


class Word:
    """A corpus token with its lemma, MSD tag and outgoing dependency links."""

    def __init__(self, xml):
        """Read id, lemma, text and (translated) MSD from a ``w`` element.

        The element's "msd" attribute is translated through MSD_TRANSLATE;
        an unknown value raises KeyError.
        """
        self.lemma = xml.get('lemma')
        self.msd = MSD_TRANSLATE[xml.get('msd')]
        self.id = xml.get('id')
        self.text = xml.text
        # link label -> list of target Word objects
        self.links = defaultdict(list)

        required = (self.id, self.lemma, self.msd)
        assert None not in required

    def add_link(self, link, to):
        """Record that this word points to *to* through a link labelled *link*."""
        targets = self.links[link]
        targets.append(to)

    def get_links(self, link):
        """Return the words linked through *link*.

        *link* may combine several labels with '|'; the targets of all parts
        are then concatenated in order and cached under the combined label on
        first use. Looking up an unknown label stores and returns an empty list.
        """
        is_known = link in self.links
        if not is_known and "|" in link:
            combined = self.links[link]
            for label in link.split('|'):
                combined.extend(self.links[label])

        return self.links[link]


def load_corpus(filename):
    """Load all words of a corpus XML file and connect them by their links.

    The file is read as bytes and handed to the XML parser undecoded, so the
    encoding declared in the document is used rather than the platform's
    locale encoding. Before parsing, the first default-namespace declaration
    is removed and every ``xml:`` attribute prefix is dropped, so elements
    and attributes such as ``xml:id`` can be addressed by their plain names.

    Every ``link`` element must carry "from", "dep" and "afun" attributes.
    A link is recorded on its source word when both ends are known words;
    links whose source is not a known word are skipped.

    Args:
        filename: path of the corpus XML.

    Returns:
        A list of all Word objects, with their links populated.
    """
    with open(filename, 'rb') as fp:
        raw = fp.read()

    raw = re.sub(rb' xmlns="[^"]+"', b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    et = ElementTree.XML(raw)

    words = {w.get('id'): Word(w) for w in et.iter("w")}

    # Ids of words attached to the sentence root through a 'modra' link.
    # TODO: collected but not used by anything yet.
    root_words = set()

    for link in et.iter("link"):
        assert('dep' in link.keys() and 'from' in link.keys() and 'afun' in link.keys())

        lfrom = link.get('from')
        label = link.get('afun')
        dep_id = link.get('dep')

        if lfrom in words:
            assert(not lfrom.endswith('.0'))
            if dep_id in words:
                words[lfrom].add_link(label, words[dep_id])

        # catch modra links from root (endswith is safe for an empty id)
        elif lfrom.endswith('0') and label == 'modra':
            root_words.add(dep_id)

        # anything else: source is neither a word nor the root, just skip

    return list(words.values())


def main():
    """Match every structure against every corpus word and print the results.

    Prints all loaded structures first, then one line per match (structure id
    and matched words), and finally the elapsed time (measured from just
    before the structures are loaded) and the number of matches.
    """
    corpus_words = load_corpus(STAVKI)

    import time
    started = time.time()

    structures = build_structures(STRUKTURE)
    for structure in structures:
        print(structure)

    hits = 0
    for word in corpus_words:
        for structure in structures:
            found = structure.match(word)
            if found is None:
                continue
            hits += 1
            print(structure.id, found)

    elapsed = time.time() - started
    print("TIME", elapsed)
    print(hits)


# Run the matcher only when the file is executed as a script.
if __name__ == '__main__':
    main()