luscenje_struktur/wani.py

from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict, namedtuple, Counter
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile

from msd_translate import MSD_TRANSLATE
from tqdm import tqdm


# NOTE(review): presumably the largest number of components a structure may
# have; it is not referenced in the visible part of this file -- confirm
# where it is used before changing it.
MAX_NUM_COMPONENTS = 5


# Maps the English names used in the structure definitions to the single
# characters that appear in an MSD string.  Several names share a character
# on purpose: the character is only meaningful together with its position.
CODES = {
    # part-of-speech categories (looked up from the capitalised 'POS' value)
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # attribute values
    "common": "c",
    "proper": "p",
    "masculine": "m",
    "feminine": "f",
    "neuter": "n",
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}

# For every category letter: the attribute names in the order in which they
# follow the category letter inside an MSD string (attribute i sits at
# string position i + 1).
TAGSET = {
    "N": ["type", "gender", "number", "case", "animate"],
    "V": [
        "type", "aspect", "vform", "person",
        "number", "gender", "negative",
    ],
    "A": ["type", "degree", "gender", "number", "case", "definiteness"],
    "R": ["type", "degree"],
    "P": [
        "type", "person", "gender", "number",
        "case", "owner_number", "owned_gender", "clitic",
    ],
    "M": ["form", "type", "gender", "number", "case", "definiteness"],
    "S": ["case"],
    "C": ["type"],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ["type"],
}

# For every category letter: one regex wildcard per attribute position.
# build_morphology_regex() prepends the category letter and then overwrites
# position ``index + 1`` for each restricted attribute, so every list must be
# exactly as long as the attribute list of the same category in TAGSET.
CATEGORY_BASES = {
    "N": ['.'] * 5,
    "V": ['.'] * 7,
    "A": ['.'] * 6,
    "R": ['.'] * 2,
    # Pronouns have 8 attributes in TAGSET; with only 6 wildcards a restriction
    # on 'owned_gender' or 'clitic' indexed past the end of the list.
    "P": ['.'] * 8,
    "M": ['.'] * 6,
    "S": ['.'] * 1,
    "C": ['.'] * 1,
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.'] * 1
}


class RestrictionType(Enum):
    """Kind of test a ``Restriction`` applies to a word."""
    Morphology = 0  # the matcher is run against the word's msd
    Lexis = 1  # the matcher is run against the word's lemma
    MatchAll = 2  # no restriction tag was given; every word matches

class Order(Enum):
    """Required relative position of the two words joined by a link."""
    FromTo = 0
    ToFrom = 1
    Any = 2

    @staticmethod
    def new(order):
        """Translate the textual ``order`` value into an ``Order`` member.

        ``None`` means no constraint (``Order.Any``); any string other than
        "from-to" / "to-from" raises ``NotImplementedError``.
        """
        if order is None:
            return Order.Any
        if order == "from-to":
            return Order.FromTo
        if order == "to-from":
            return Order.ToFrom
        raise NotImplementedError("What kind of ordering is: {}".format(order))

    def match(self, from_w, to_w):
        """Tell whether the two words respect this ordering.

        Positions are compared through the words' ``int_id`` attribute;
        ``Order.Any`` accepts everything without looking at the words.
        """
        if self is Order.Any:
            return True

        from_pos, to_pos = from_w.int_id, to_w.int_id
        if self is Order.FromTo:
            return from_pos < to_pos
        if self is Order.ToFrom:
            return to_pos < from_pos
        raise NotImplementedError("Should not be here: Order match")


class ComponentRepresentation:
    """Base class for the textual representation of one component.

    Words are collected with ``add_word``; ``render`` then computes the text
    once (through the subclass hook ``_render``) and ``rendition`` returns it.
    """

    def __init__(self, data, word_renderer):
        self.words = []
        self.agreement = None
        self.rendition_text = None

        self.word_renderer = word_renderer
        self.data = data

    def get_agreement(self):
        """Id of the component this one must agree with; none by default."""
        return None

    def add_word(self, word):
        """Remember ``word`` as one occurrence of this component."""
        self.words.append(word)

    def render(self):
        """Compute the text unless one has already been set."""
        if self.rendition_text is not None:
            return
        self.rendition_text = self._render()

    def rendition(self):
        """Return the rendered text, or an empty string when there is none."""
        if self.rendition_text is None:
            return ""
        return self.rendition_text

    def _render(self):
        """Subclass hook producing the text (or ``None``)."""
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))

class LemmaCR(ComponentRepresentation):
    """Representation that shows the lemma of the first collected word."""

    def _render(self):
        if not self.words:
            return None
        return self.words[0].lemma

class LexisCR(ComponentRepresentation):
    """Representation whose text is the fixed value stored in ``self.data``."""
    def _render(self):
        # ``data`` is the constructor argument; for the "lexis" rendition
        # ComponentRendition passes the feature's 'string' attribute here.
        return self.data
    
class WordFormAllCR(ComponentRepresentation):
    """Representation listing every distinct word form, joined by "/"."""

    def _render(self):
        """Return the distinct texts of the collected words as "a/b/c".

        Returns ``None`` when no word was collected.  The forms are sorted so
        that the output is identical from run to run; joining a bare ``set``
        made the order depend on string hash randomisation.
        """
        if not self.words:
            return None
        return "/".join(sorted({w.text for w in self.words}))

class WordFormAnyCR(ComponentRepresentation):
    """Representation that shows one word form of the component.

    The most frequent (msd, lemma) pair among the collected words is used.
    When another representation has to agree with this one (``self.agreement``
    is set), pairs are tried in order of decreasing frequency until one is
    found for which the agreeing representation can supply a matching form.
    """

    def _render(self):
        """Return the chosen word form, or ``None`` when nothing is suitable.

        ``None`` is also returned when the chosen candidate has no lemma
        (the lemma-only backup word appended by ``WordFormMsdCR``).
        """
        # Most frequent text for every (msd, lemma): walking the counts from
        # least to most common lets the most common text overwrite the rest.
        text_forms = {}
        triplets = Counter((w.msd, w.lemma, w.text) for w in self.words)
        for (msd, lemma, text), _n in reversed(triplets.most_common()):
            text_forms[(msd, lemma)] = text

        # Candidates, most frequent first (ties keep first-seen order on
        # Python 3.7+).  Previously the candidates were sorted ascending, so
        # the rarest form won, and nothing at all was returned when there
        # was no agreement to satisfy.
        candidates = Counter((w.msd, w.lemma) for w in self.words)
        for (word_msd, word_lemma), _n in candidates.most_common():
            if self.agreement is not None and not self.agreement.match(word_msd):
                continue

            if word_lemma is None:
                return None
            return text_forms[(word_msd, word_lemma)]

        return None
        
class WordFormMsdCR(WordFormAnyCR):
    """Word-form representation restricted to words with given msd values.

    ``self.data`` maps attribute names to required values (English names that
    are translated through ``CODES``).  Words that do not satisfy the
    selectors are ignored.
    """

    def __init__(self, *args):
        super().__init__(*args)
        # lemma-only stand-in built from the first word seen, see add_word()
        self.backup_word = None

    def check_msd(self, word):
        """Tell whether ``word.msd`` satisfies every selector in ``self.data``.

        A "-" on either side counts as a match for that attribute.
        """
        selectors = self.data
        for key, value in selectors.items():
            category = word.msd[0]
            position = TAGSET[category].index(key.lower())
            actual = word.msd[position + 1]
            wanted = CODES[value]

            if '-' not in [actual, wanted] and actual != wanted:
                return False

        return True

    def add_word(self, word):
        """Collect ``word`` if it passes ``check_msd``.

        The first word offered (whether it passes or not) is also used to
        build the backup word from the renderer's lemma msd.
        """
        if self.backup_word is None:
            msd = self.word_renderer.get_lemma_msd(word.lemma, word.msd)
            WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
            self.backup_word = WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

        if self.check_msd(word):
            super().add_word(word)

    def _render(self):
        """Render like ``WordFormAnyCR`` with the backup word as a candidate."""
        # No word was ever offered to this component (e.g. it never matched):
        # there is no backup word, and appending ``None`` would crash the
        # parent implementation when it reads ``.msd``.
        if self.backup_word is not None:
            self.words.append(self.backup_word)
        return super()._render()

class WordFormAgreementCR(ComponentRepresentation):
    """Word-form representation that must agree with another component.

    ``data`` is a pair ``(head component id, list of attribute names)``.
    The text is not computed by ``render``; it is set as a side effect of
    ``match`` when the head representation tries its candidate forms.
    """

    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        self.agree_with, self.data = self.data

    def get_agreement(self):
        """Return the id of the component this representation agrees with."""
        return self.agree_with

    def match(self, word_msd):
        """Look for a form of this component that agrees with ``word_msd``.

        Candidate forms come from ``word_renderer.available_words`` for the
        lemma of the first collected word; only candidates of the same
        category as that word are considered.  On success the candidate text
        becomes the rendition and ``True`` is returned.

        Returns ``False`` when no candidate agrees, and also when no word was
        collected for this component (previously an ``IndexError``).
        """
        if not self.words:
            return False

        first_word = self.words[0]
        word_category = first_word.msd[0]
        agreements = self.data

        existing = [(w.msd, w.text) for w in self.words]

        for candidate_msd, candidate_text in self.word_renderer.available_words(first_word.lemma, existing):
            if word_category != candidate_msd[0]:
                continue

            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, agreements):
                self.rendition_text = candidate_text
                return True

        return False

    @staticmethod
    def _attribute_position(msd, agr_case):
        """Return the string position of ``agr_case`` inside ``msd``.

        Returns ``None`` (after logging a warning) when the category of
        ``msd`` has no such attribute; an unknown category letter is treated
        the same way instead of raising ``KeyError``.
        """
        attributes = TAGSET.get(msd[0], [])
        if agr_case not in attributes:
            logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd))
            return None
        # +1: the first character of an msd is the category letter
        return attributes.index(agr_case) + 1

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        """Tell whether ``msd1`` and ``msd2`` agree in every listed attribute.

        An attribute that a category does not have makes the check fail.  An
        attribute position beyond the end of an msd (value not specified) or
        a "-" on either side counts as agreeing.
        """
        position_of = WordFormAgreementCR._attribute_position

        for agr_case in agreements:
            pos1 = position_of(msd1, agr_case)
            if pos1 is None:
                return False
            # value not specified in msd1: always agrees
            if pos1 >= len(msd1):
                continue

            pos2 = position_of(msd2, agr_case)
            if pos2 is None:
                return False
            if pos2 >= len(msd2):
                continue

            m1, m2 = msd1[pos1], msd2[pos2]
            if '-' not in [m1, m2] and m1 != m2:
                return False

        return True

    def render(self):
        """Do nothing: the text is assigned by ``match``."""
        pass


class ComponentRendition:
    """Recipe for one representation of a component.

    Features read from the structure definition select which
    ``ComponentRepresentation`` subclass is used (``representation_factory``)
    and what extra data it receives (``more``).
    """

    def __init__(self):
        self.more = None
        self.representation_factory = ComponentRepresentation

    def _set_more(self, m):
        self.more = m

    def add_feature(self, feature):
        """Update the recipe from one feature (a mapping of attributes).

        A 'rendition' attribute picks the basic kind (lemma / word_form /
        lexis); a 'selection' attribute refines a word form (msd / all /
        agreement).  Features with neither attribute are ignored.

        Raises ``NotImplementedError`` for an unknown rendition or selection.
        """
        if 'rendition' in feature:
            rendition = feature['rendition']
            if rendition == "lemma":
                self.representation_factory = LemmaCR
            elif rendition == "word_form":
                # just the default, a later 'selection' feature may change it
                self.representation_factory = WordFormAnyCR
            elif rendition == "lexis":
                self.representation_factory = LexisCR
                self.more = feature['string']
            else:
                raise NotImplementedError("Representation rendition: {}".format(feature))

        elif 'selection' in feature:
            selection = feature['selection']
            if selection == "msd":
                self.representation_factory = WordFormMsdCR
                self.more = {k: v for k, v in feature.items() if k != 'selection'}
            elif selection == "all":
                self.representation_factory = WordFormAllCR
            elif selection == 'agreement':
                assert(feature['head'][:4] == 'cid_')
                assert(feature['msd'] is not None)
                self.representation_factory = WordFormAgreementCR
                # (id of the head component, attributes that have to agree)
                self.more = (feature['head'][4:], feature['msd'].split('+'))
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))

        else:
            return None

    def cr_instance(self, word_renderer):
        """Create a fresh representation object from this recipe."""
        return self.representation_factory(self.more, word_renderer)

    @staticmethod
    def set_representations(matches, structure, word_renderer):
        """Compute the representation text of every component.

        Fills ``matches.representations`` with one string per component id:
        the renditions of all of the component's representations joined by a
        space.

        Raises ``NotImplementedError`` when a representation has to agree
        with a component that does not have exactly one representation.
        """
        # one representation object per recipe, grouped by component id
        representations = {}
        for c in structure.components:
            representations[c.idx] = [
                recipe.cr_instance(word_renderer) for recipe in c.representation
            ]

        # tell every head representation who has to agree with it
        for cid, reps in representations.items():
            for rep in reps:
                agr = rep.get_agreement()
                if agr is None:
                    continue

                if len(representations[agr]) != 1:
                    n = len(representations[agr])
                    raise NotImplementedError(
                        "Structure {}: ".format(structure.id) +
                        "component {} has agreement".format(cid) +
                        " with component {}".format(agr) +
                        ", however there are {} (!= 1) representations".format(n) +
                        " of component {}!".format(agr))

                representations[agr][0].agreement = rep

        # hand every matched word to all representations of its component
        for words in matches.matches:
            for w_id, w in words.items():
                component = structure.get_component(w_id)
                for representation in representations[component.idx]:
                    representation.add_word(w)

        for reps in representations.values():
            for rep in reps:
                rep.render()

        for cid, reps in representations.items():
            matches.representations[cid] = " ".join(rep.rendition() for rep in reps)

    def __str__(self):
        # there is no ``rendition`` attribute on this class; describe the
        # recipe through the attributes that do exist
        return "{}({!r})".format(self.representation_factory.__name__, self.more)

    def __repr__(self):
        return str(self)


class ComponentStatus(Enum):
    """Whether a component may, must or must not be present in a match."""
    Optional = 0
    Required = 1
    Forbidden = 2

    def __str__(self):
        """One-character marker: "?" optional, "!" required, "X" forbidden."""
        markers = {
            ComponentStatus.Optional: "?",
            ComponentStatus.Required: "!",
        }
        # everything else is Forbidden
        return markers.get(self, "X")


def get_level(restriction):
    """Return the 'level' value of the first feature that has one.

    ``restriction`` is an iterable of features, each offering ``keys()`` and
    ``get()``.  The value used to be looked up but never returned, so the
    function raised unconditionally.

    Raises ``RuntimeError`` when no feature carries a 'level'.
    """
    for feature in restriction:
        if "level" in feature.keys():
            return feature.get("level")

    raise RuntimeError("No feature with a 'level' attribute found")


def build_morphology_regex(restriction):
    """Build a per-character msd matcher from morphology features.

    Every feature must carry exactly one attribute (plus an optional
    ``filter="negative"``); one of the features must be 'POS'.

    Returns ``(rgx, matcher)``: ``rgx`` is a list with one single-character
    regex per msd position (category letter first) and ``matcher(text)``
    tells whether the msd ``text`` satisfies it.
    """
    constraints = {}
    for feature in restriction:
        attributes = dict(feature.items())

        positive = "filter" not in attributes
        if not positive:
            assert(attributes['filter'] == "negative")
            del attributes['filter']

        assert(len(attributes) == 1)
        key, value = next(iter(attributes.items()))
        constraints[key] = (value, positive)

    assert('POS' in constraints)
    cat_code = CODES[constraints['POS'][0].capitalize()]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]
    del constraints['POS']

    # an msd has to be strictly longer than this to be accepted
    min_msd_length = 1

    for attribute, (value, positive) in constraints.items():
        index = TAGSET[cat_code].index(attribute.lower())

        # "a|b" lists alternatives; a single value is a one-element list
        letters = "".join(CODES[val] for val in value.split('|'))
        rgx[index + 1] = "[{}{}]".format("" if positive else "^", letters)

        # only positive constraints require the position to exist
        if positive:
            min_msd_length = max(index + 1, min_msd_length)

    def matcher(text):
        if len(text) <= min_msd_length:
            return False
        return all(re.match(pattern, char) for char, pattern in zip(text, rgx))

    return rgx, matcher


def build_lexis_regex(restriction):
    """Build a lemma matcher from lexis features.

    The attributes of all features are merged (later ones win); the merged
    'lemma' value is a "|"-separated list of accepted lemmas.

    Returns ``(lemmas, matcher)`` where ``matcher(text)`` is true when
    ``text`` is one of ``lemmas``.
    """
    merged = {}
    for feature in restriction:
        for key, value in feature.items():
            merged[key] = value

    assert("lemma" in merged)
    lemmas = merged['lemma'].split('|')

    def matcher(text):
        return text in lemmas

    return lemmas, matcher


class Restriction:
    """Condition a word has to satisfy to fill a component.

    Built from a restriction tag of type "morphology" or "lexis"; ``None``
    yields a restriction that accepts every word.
    """

    def __init__(self, restriction_tag):
        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "lexis":
            self.type = RestrictionType.Lexis
            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
        elif kind == "morphology":
            self.type = RestrictionType.Morphology
            rgx, self.matcher = build_morphology_regex(list(restriction_tag))
            # shown by __str__ as space separated per-position patterns
            self.present = " ".join(rgx)
        else:
            raise NotImplementedError()

    def match(self, word):
        """Tell whether ``word`` satisfies the restriction."""
        if self.type == RestrictionType.MatchAll:
            return True

        if self.type == RestrictionType.Morphology:
            subject = word.msd
        elif self.type == RestrictionType.Lexis:
            subject = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(subject)

    def __str__(self):
        type_name = str(self.type).split('.')[1]
        return "({:s} {})".format(type_name, self.present)

    def __repr__(self):
        return str(self)


class Component:
    """One node of a syntactic structure.

    A component has a restriction deciding which words can fill it, a list
    of outgoing links to further components and a list of representation
    recipes.
    """

    def __init__(self, info):
        idx = info['cid']
        if 'name' in info:
            name = info['name']
        else:
            name = None

        known_statuses = {
            'forbidden': ComponentStatus.Forbidden,
            'obligatory': ComponentStatus.Required,
            'optional': ComponentStatus.Optional,
        }
        if 'status' not in info:
            status = ComponentStatus.Required
        elif info['status'] in known_statuses:
            status = known_statuses[info['status']]
        else:
            raise NotImplementedError("strange status: {}".format(info['status']))

        self.idx = idx
        self.name = name
        self.status = status
        self.restriction = None
        self.next_element = []
        self.representation = []
        self.selection = {}

        self.iter_ctr = 0

    def add_next(self, next_component, link_label, order):
        """Register a link from this component to ``next_component``."""
        link = (next_component, link_label, Order.new(order))
        self.next_element.append(link)

    def set_restriction(self, restrictions_tag):
        """Set the restriction from a tag.

        ``None`` gives a match-all restriction, a "restriction" tag a single
        one and a "restriction_or" tag a list of alternatives.
        """
        if restrictions_tag is None:
            self.restriction = Restriction(None)
            return

        tag = restrictions_tag.tag
        if tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
        elif tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
        else:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
        """Build one ``ComponentRendition`` per group of features."""
        for feature_group in representation:
            recipe = ComponentRendition()
            for feature in feature_group:
                recipe.add_feature(feature.attrib)
            self.representation.append(recipe)

    def find_next(self, deps, comps, restrs, reprs):
        """Recursively create the components reachable from this one.

        ``deps`` holds ``(from, to, label, order)`` tuples; the other three
        arguments are looked up by component id.  Returns the created
        components, each directly followed by its own descendants.
        """
        created = []
        for dep in deps:
            if dep[0] != self.idx:
                continue

            _, idx, dep_label, order = dep

            child = Component(comps[idx])
            child.set_restriction(restrs[idx])
            child.set_representation(reprs[idx])
            created.append(child)

            self.add_next(child, dep_label, order)
            created.extend(child.find_next(deps, comps, restrs, reprs))

        return created

    def name_str(self):
        """Return the name, or "_" when the component has none."""
        if self.name is None:
            return "_"
        return self.name

    def __str__(self):
        return "{:s}) {:7s}:{} [{}] :{}".format(
                self.idx, self.name_str(), self.status,
                self.restriction, self.representation)

    def tree(self):
        """Return printable lines describing all links below this component."""
        lines = []
        for child, link, order in self.next_element:
            line = "{:3} -- {:5} --> {:3}".format(self.idx, link, child.idx)
            if order != Order.Any:
                # str(order) is "Order.<name>"; keep only the member name
                line += " " + str(order)[6:]

            lines.append(line)
            lines.extend(child.tree())
        return lines

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match the structure rooted at this component against ``word``.

        Returns ``None`` when there is no match, otherwise a list of dicts
        mapping component ids to words (one dict per alternative match).
        """
        own = self._match_self(word)
        if own is None:
            return None

        below = self._match_next(word)
        if below is None:
            return None

        results = [own]
        for candidates in below:
            # a good match with nothing to add
            if not candidates:
                continue

            if len(candidates) == 1:
                for partial in results:
                    partial.update(candidates[0])
                continue

            # more than one match found for this linked component
            logging.debug("MULTIPLE: {}, {}".format(self.idx, candidates))

            if len(results) > 1:
                # alternatives already exist from another link: do not
                # multiply them, only the first candidate is used
                logging.warning("Strange multiple match: {}".format(
                    str([w.id for w in candidates[0].values()])))

                for partial in results:
                    partial.update(candidates[0])
                continue

            # fan out: one result per candidate
            results = [{**dict(results[0]), **m} for m in candidates]

        logging.debug("MA: {}".format(str(results)))
        return results

    def _match_self(self, word):
        """Return ``{idx: word}`` if ``word`` passes the restriction, else ``None``."""
        matched = None

        if type(self.restriction) is list:
            # alternatives: the first one that matches is enough
            for alternative in self.restriction:
                matched = alternative.match(word)
                if matched:
                    break
        else:
            matched = self.restriction.match(word)

        logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))

        if matched:
            return {self.idx: word}
        return None

    def _match_next(self, word):
        """Match every outgoing link of this component from ``word``.

        Returns a list with one list of matches per link, or ``None`` when a
        required component found nothing or a forbidden one was found.
        """
        per_link = []

        for child, link, order in self.next_element:
            linked_words = word.get_links(link)
            logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, child.idx, len(linked_words)))
            found_here = []
            per_link.append(found_here)

            # non-required components are fine even when nothing matches
            satisfied = child.status != ComponentStatus.Required
            for linked in linked_words:
                logging.debug("link: {}: {} -> {}".format(link, word.id, linked.id))
                if not order.match(word, linked):
                    continue

                found = child.match(linked)
                if found is None:
                    continue

                # a forbidden component that matches spoils everything
                if child.status == ComponentStatus.Forbidden:
                    satisfied = False
                    break

                assert(type(found) is list)
                found_here.extend(found)
                satisfied = True

            if not satisfied:
                logging.debug("BAD")
                return None

        return per_link


class SyntacticStructure:
    """A syntactic structure: an id, an LBS label and its components."""

    def __init__(self):
        self.id = None
        self.lbs = None
        self.components = []

    @staticmethod
    def from_xml(xml):
        """Build a structure from its XML element.

        The element must have exactly one child of type 'JOS' which in turn
        has three children: components, dependencies and definitions.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        systems = list(xml)
        assert(len(systems) == 1)
        system = systems[0]

        assert(system.get('type') == 'JOS')
        components, dependencies, definitions = list(system)

        deps = [
            (dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
            for dep in dependencies
        ]
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        restrs, forms = {}, {}

        for definition in definitions:
            cid = definition.get('cid')
            restrs[cid] = None
            forms[cid] = []

            for el in definition:
                tag = el.tag
                if tag.startswith("restriction"):
                    # at most one restriction element per component
                    assert(restrs[cid] is None)
                    restrs[cid] = el
                elif tag.startswith("representation"):
                    st.add_representation(cid, el, forms)
                else:
                    raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id))

        # artificial root with id '#'; the real components hang below it
        root = Component({'cid': '#', 'type': 'other'})
        st.components = root.find_next(deps, comps, restrs, forms)
        return st

    def add_representation(self, n, rep_el, forms):
        """Append the usable features of ``rep_el`` as one group to ``forms[n]``.

        Features without a 'rendition' or 'selection' attribute are skipped
        with a warning.
        """
        assert(rep_el.tag == "representation")
        kept = []
        for el in rep_el:
            assert(el.tag == "feature")
            if 'rendition' not in el.attrib and 'selection' not in el.attrib:
                logging.warning("Strange representation feature in structure {}. Skipping"
                        .format(self.id))
                continue
            kept.append(el)
        forms[n].append(kept)

    def __str__(self):
        component_lines = "\n".join(str(comp) for comp in self.components)
        link_lines = "\n".join(self.components[0].tree())
        separator = "-" * 40

        return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
                self.id, self.lbs, component_lines, link_lines, separator)

    def get_component(self, idx):
        """Return the component with id ``idx``; ``RuntimeError`` if unknown."""
        found = [c for c in self.components if c.idx == idx]
        if not found:
            raise RuntimeError("Unknown component id: {}".format(idx))
        return found[0]

    def match(self, word):
        """Return all matches of the structure starting at ``word`` (maybe empty)."""
        found = self.components[0].match(word)
        if found is None:
            return []
        return found

def load_structures(filename):
    """Read the structure definitions from the XML file ``filename``.

    Returns ``(structures, lemma_features)`` as produced by
    ``build_structures`` and ``get_lemma_features``.
    """
    # Let the XML parser read the file itself: it honours the encoding
    # given in the XML declaration, whereas open(filename, 'r') decoded the
    # file with the platform's locale encoding.
    et = ElementTree.parse(filename).getroot()

    return build_structures(et), get_lemma_features(et)

def build_structures(et):
    """Create a ``SyntacticStructure`` for every 'syntactic_structure' element.

    Elements for which ``from_xml`` yields ``None`` are left out.
    """
    parsed = (
        SyntacticStructure.from_xml(element)
        for element in et.iter('syntactic_structure')
    )
    return [structure for structure in parsed if structure is not None]

def get_lemma_features(et):
    """Read the optional 'lemma_features' section of the definitions.

    Returns a dict from category letter to an msd template in which
    unconstrained positions are written as "-" (trailing unconstrained
    positions are dropped).  Empty when there is no such section.
    """
    lf = et.find('lemma_features')
    if lf is None:
        return {}

    def template_char(position):
        # translate one per-position regex into one template character
        if position == ".":
            return " "
        if len(position) == 1:
            return position
        if len(position) == 3 and position[0] == "[" and position[2] == "]":
            return position[1]
        raise RuntimeError("Strange rgx for lemma_feature...")

    result = {}
    for pos in lf.iter('POS'):
        rgx_list, _ = build_morphology_regex(pos)
        rgx_str = "".join(template_char(position) for position in rgx_list)

        assert(rgx_str[0].isupper())
        result[rgx_str[0]] = rgx_str.strip().replace(' ', '-')

    return result

def get_msd(comp):
    """Return the msd of ``comp`` taken from its attributes.

    The 'msd' attribute is used as is; otherwise the 'ana' attribute without
    its first four characters (presumably a prefix such as "msd:" -- confirm
    against the input format).

    Raises ``NotImplementedError`` when neither attribute is present.
    """
    d = dict(comp.items())
    if 'msd' in d:
        return d['msd']
    elif 'ana' in d:
        return d['ana'][4:]
    else:
        # logging functions take no ``file`` argument; passing one raised a
        # TypeError that hid the intended NotImplementedError below
        logging.error("Cannot determine MSD from attributes: %s", d)
        raise NotImplementedError("MSD?")

def lemma_only_word(msd):
    """Return a word-like stand-in that only carries ``msd``.

    Its lemma and text are ``None`` and its ``most_frequent_text`` callable
    returns ``None``.  Returns ``None`` when ``msd`` is ``None``.
    """
    if msd is None:
        return None

    WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
    return WordLemma(
        msd=msd,
        most_frequent_text=lambda *x: None,
        lemma=None,
        text=None,
    )

class Word:
    """One token of the corpus with its lemma, msd, id, text and links."""

    def __init__(self, xml, do_msd_translate):
        self.lemma = xml.get('lemma')

        raw_msd = get_msd(xml)
        if do_msd_translate:
            self.msd = MSD_TRANSLATE[raw_msd]
        else:
            self.msd = raw_msd

        self.id = xml.get('id')
        self.text = xml.text
        # link label -> list of words this word is linked to
        self.links = defaultdict(list)

        # position of the word: the last dot-separated part of the id,
        # without a leading non-digit character if there is one
        tail = self.id.split('.')[-1]
        if tail[0] not in '0123456789':
            tail = tail[1:]
        self.int_id = int(tail)

        assert(None not in (self.id, self.lemma, self.msd))

    @staticmethod
    def pcWord(pc, do_msd_translate):
        """Create a ``Word`` from an element that has no lemma/msd of its own.

        The element's text becomes its lemma and a fixed msd is assigned
        ("N" when msds are translated, "U" otherwise).
        """
        if do_msd_translate:
            msd = "N"
        else:
            msd = "U"
        pc.set('lemma', pc.text)
        pc.set('msd', msd)
        return Word(pc, do_msd_translate)

    def add_link(self, link, to):
        """Record a link labelled ``link`` from this word to ``to``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the words linked with label ``link``.

        A label of the form "a|b" stands for the union of its parts; the
        combined list is built on first use and cached under the full label.
        """
        is_new_union = "|" in link and link not in self.links
        if is_new_union:
            combined = self.links[link]
            for part in link.split('|'):
                combined.extend(self.links[part])

        return self.links[link]

    def most_frequent_text(self, word_renderer):
        """Ask ``word_renderer`` for the text of this word's lemma and msd."""
        return word_renderer.render(self.lemma, self.msd)

class WordMsdRenderer:
    """Collect words and derive per-lemma surface forms and MSD summaries.

    Words are accumulated with :meth:`add_words`; :meth:`generate_renders`
    then fills the lookup tables used by the remaining methods.
    """

    def __init__(self, lemma_features):
        """
        :param lemma_features: mapping from an MSD's first character to a
            feature pattern in which ``-`` marks positions left open
        """
        self.all_words = []
        # lemma -> msd -> (most frequent text, number of occurrences)
        self.rendered_words = {}
        # lemma -> [(msd, text, count), ...] ordered by descending count
        self.frequent_words = {}
        # lemma -> MSD merged over all occurrences ('-' where they differ)
        self.lemma_msd = {}
        self.lemma_features = lemma_features

    def add_words(self, words):
        """Queue ``words`` for the next :meth:`generate_renders` call."""
        self.all_words.extend(words)

    def generate_renders(self):
        """Build the render, frequency and merged-MSD tables from all words."""
        texts_by_lemma = defaultdict(lambda: defaultdict(list))
        for w in self.all_words:
            texts_by_lemma[w.lemma][w.msd].append(w.text)

        for lemma, by_msd in texts_by_lemma.items():
            renders = {}
            counted = []
            common_msd = "*" * 10

            for msd, texts in by_msd.items():
                # Counter is linear in len(texts) and resolves ties to the
                # form seen first, so results do not vary between runs
                # (max over a set depended on string hash ordering).
                text_counts = Counter(texts)
                representative = text_counts.most_common(1)[0][0]
                renders[msd] = (representative, len(texts))

                counted.extend((msd, txt, n) for txt, n in text_counts.items())
                common_msd = self.merge_msd(common_msd, msd)

            self.rendered_words[lemma] = renders
            self.lemma_msd[lemma] = common_msd
            # stable sort: equally frequent forms keep first-seen order
            self.frequent_words[lemma] = sorted(counted, key=lambda item: -item[2])

        # Overlay the structure-defined features: every fixed (non '-')
        # position of the pattern replaces the merged value.
        lf = self.lemma_features
        for lemma, cmsd in self.lemma_msd.items():
            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
                )

    @staticmethod
    def merge_msd(common_msd, new_msd):
        """Merge two MSDs position by position.

        ``*`` in ``common_msd`` adopts the new letter, differing letters
        become ``-``, equal letters are kept.  The result is as long as the
        shorter argument.
        """
        def merge_letter(l1, l2):
            if l1 == "*":
                return l2
            return l1 if l1 == l2 else "-"

        return "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))

    def render(self, lemma, msd):
        """Return the most frequent text for ``lemma``/``msd`` or ``None``."""
        entry = self.rendered_words.get(lemma, {}).get(msd)
        return None if entry is None else entry[0]

    def available_words(self, lemma, existing_texts):
        """Yield candidate ``(msd, text)`` pairs for ``lemma``.

        Pairs from ``existing_texts`` come first, most common first,
        followed by the lemma's known forms that were not among them.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        if lemma in self.frequent_words:
            for msd, text, _ in self.frequent_words[lemma]:
                if (msd, text) not in counted_texts:
                    yield (msd, text)

    def get_lemma_msd(self, lemma, word_msd):
        """Return the merged MSD for ``lemma``.

        When the merged MSD starts with ``-`` (occurrences disagree on the
        first letter), fall back to the feature pattern registered for
        ``word_msd``'s first letter, or ``'-'`` if there is none.

        :raises KeyError: if ``lemma`` was never collected
        """
        # should be here, since we collect every lemma
        lemma_msd = self.lemma_msd[lemma]

        if lemma_msd[0] != '-':
            return lemma_msd
        if word_msd[0] in self.lemma_features:
            return self.lemma_features[word_msd[0]]
        return '-'

def is_root_id(id_):
    """Return True when ``id_`` consists of exactly three dot-separated parts."""
    # three parts <=> exactly two separators
    return id_.count('.') == 2


def load_files(args):
    """Lazily load every input file named in ``args.input``.

    Yields the result of ``load_tei_file`` for each file, in order.  When
    ``args.count_files`` is set, a " :: <index> / <total>" suffix (index
    starting at 0) is passed along for the loader's log message.
    """
    paths = args.input
    id_check_disabled = args.skip_id_check
    translate_msd = not args.no_msd_translate

    for position, path in enumerate(paths):
        status = " :: {} / {}".format(position, len(paths)) if args.count_files else ""
        yield load_tei_file(path, id_check_disabled, translate_msd, args.pc_tag, status)


def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
    """Parse one XML corpus file into a list of linked ``Word`` objects.

    The first default ``xmlns`` declaration and every `` xml:`` attribute
    prefix are stripped from the text before parsing, so elements and
    attributes can be addressed by their plain names.

    :param filename: path of the XML file
    :param skip_id_check: when false, a link whose source id has exactly
        three dot-separated parts aborts the program
    :param do_msd_translate: forwarded to ``Word``
    :param pc_tag: tag name of separator elements
    :param status: suffix appended to the "LOADING FILE" log line
    :returns: list of all words and separators found in the file
    """
    logging.info("LOADING FILE: {}{}".format(filename, status))

    # Decode explicitly instead of relying on the platform's locale
    # encoding.  NOTE(review): assumes the corpus files are UTF-8, the XML
    # default -- confirm for your data.
    with open(filename, 'r', encoding='utf-8') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w, do_msd_translate)
    for pc in et.iter(pc_tag):
        words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)

    for link in et.iter("link"):
        if 'dep' in link.keys():
            # form with explicit afun/from/dep attributes
            ana = link.get('afun')
            lfrom = link.get('from')
            dest = link.get('dep')
        else:
            # form with ana="syn:<label>" and target="#from #dest"
            ana = link.get('ana')
            # links that are not syntactic (or carry no label at all) are
            # ignored; a missing attribute used to crash on None[:4]
            if ana is None or not ana.startswith('syn:'):
                continue
            ana = ana[4:]
            lfrom, dest = link.get('target').replace('#', '').split()

        if lfrom not in words:
            # strange errors, just skip...
            continue

        if not skip_id_check and is_root_id(lfrom):
            logging.error("NOO: {}".format(lfrom))
            sys.exit(1)

        if dest not in words:
            logging.error("Unknown id: {}".format(dest))
            sys.exit(1)

        words[lfrom].add_link(ana, words[dest])

    return list(words.values())

class Writer:
    """Write matched structures as comma-separated rows.

    Two layouts exist: the grouped output (one row per collocation, with
    representative forms and frequency) and the "all" output (one row per
    match, with token id, word form, lemma and MSD).
    """

    @staticmethod
    def make_output_writer(args):
        """Create the grouped-output writer from parsed CLI arguments."""
        return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed)

    @staticmethod
    def make_all_writer(args):
        """Create the unsorted, single-file "all" writer from CLI arguments."""
        return Writer(True, args.all, False, -1, False)

    def __init__(self, all, filename, multiple_output, sort_by, sort_reversed):
        """
        :param all: true for the per-match layout, false for the grouped one
        :param filename: output path, or ``None`` for standard output
        :param multiple_output: write one file per structure
            (``<filename>.<structure id>``)
        :param sort_by: column index to sort rows by; negative disables sorting
        :param sort_reversed: sort in descending order
        """
        self.all = all
        self.output_file = filename
        self.multiple_output = multiple_output

        self.sort_by = sort_by
        self.sort_order = sort_reversed

    def header(self):
        """Return the list of column names for the current layout."""
        cols = ["Lemma"]
        if self.all:
            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
        else:
            cols.extend(["Representative_form", "RF_scenario"])

        assert(len(cols) == self.length())
        # repeat the per-component columns for every possible component
        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
        cols = ["Structure_ID"] + cols + ["Colocation_ID"]

        if not self.all:
            cols += ["Joint_representative_form", "Frequency"]

        return cols

    def length(self):
        """Return the number of columns written per component."""
        return 4 if self.all else 3

    def from_word(self, word, representation):
        """Return the per-component cells for ``word``.

        A missing word yields empty cells.  In the grouped layout a missing
        ``representation`` falls back to the lemma and is flagged as such.
        """
        if word is None:
            return [""] * self.length()
        if self.all:
            return [word.id, word.text, word.lemma, word.msd]
        if representation is None:
            return [word.lemma, word.lemma, "lemma_fallback"]
        return [word.lemma, representation, "ok"]

    def sorted_rows(self, rows):
        """Return ``rows`` sorted by the configured column.

        The column is compared numerically when every cell parses as an
        integer, otherwise as case-insensitive text.  ``rows`` is returned
        unchanged when sorting is disabled, there are fewer than two rows,
        or the column does not exist.
        """
        if self.sort_by < 0 or len(rows) < 2:
            return rows

        if len(rows[0]) <= self.sort_by:
            # report the requested column, not the number of columns
            logging.warning("Cannot sort by column #{}: Not enough columns!".format(self.sort_by))
            return rows

        # Decide on all rows, not just the first: a single non-numeric cell
        # further down would otherwise raise ValueError while sorting.
        try:
            for row in rows:
                int(row[self.sort_by])
        except ValueError:
            key = lambda row: row[self.sort_by].lower()
        else:
            key = lambda row: int(row[self.sort_by])

        return sorted(rows, key=key, reverse=self.sort_order)

    def write_header(self, file_handler):
        """Write the header line to ``file_handler``."""
        file_handler.write(", ".join(self.header()) + "\n")

    def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
        """Write all rows of one structure to ``file_handler``.

        :param structure_id: id of the structure being written
        :param components: the structure's components; only their count
            and order are used, to number the columns "1", "2", ...
        :param colocation_ids: source of matches via ``get_matches_for``
        """
        rows = []

        for cid, m, freq, rprsnt in colocation_ids.get_matches_for(structure_id, not self.all):
            to_write = []
            representation = ""

            for idx, _comp in enumerate(components):
                idx = str(idx + 1)
                word = m[idx] if idx in m else None
                rep = rprsnt[idx] if idx in rprsnt else None
                to_write.extend(self.from_word(word, rep))
                # second-to-last cell: the representative form in the
                # grouped layout (the joint form is unused otherwise)
                representation += " " + to_write[-2]

            # make them equal size
            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write)))
            to_write = [structure_id] + to_write + [cid]

            if not self.all:
                representation = re.sub(' +', ' ', representation)
                to_write.append(representation.strip())
                to_write.append(str(freq))

            rows.append(to_write)

        if rows:
            rows = self.sorted_rows(rows)
            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
            file_handler.flush()

    def write_out(self, structures, colocation_ids):
        """Write header and rows for every structure.

        Depending on ``multiple_output`` everything goes to one file (or
        standard output), or each structure gets its own file.  Files are
        closed even when writing fails; standard output is never closed.
        """
        def fp_close(fp_):
            if fp_ is not sys.stdout:
                fp_.close()

        def fp_open(snum=None):
            if self.output_file is None:
                return sys.stdout
            if snum is None:
                return open(self.output_file, "w")
            return open("{}.{}".format(self.output_file, snum), "w")

        if self.multiple_output:
            for s in structures:
                fp = fp_open(s.id)
                try:
                    self.write_header(fp)
                    self.write_out_worker(fp, s.id, s.components, colocation_ids)
                finally:
                    fp_close(fp)
        else:
            fp = fp_open()
            try:
                self.write_header(fp)
                for s in structures:
                    self.write_out_worker(fp, s.id, s.components, colocation_ids)
            finally:
                fp_close(fp)

class StructureMatch:
    """All matches that share one collocation id within a structure.

    ``len()`` of an instance is the number of collected matches, so an
    instance without matches is falsy.
    """

    def __init__(self, match_id, structure_id):
        self.match_id, self.structure_id = match_id, structure_id

        # filled through append()
        self.matches = []
        # starts empty; populated from outside this class
        self.representations = {}

    def append(self, match):
        """Add one more match to this collocation."""
        self.matches.append(match)

    def __len__(self):
        """Return how many matches have been collected."""
        collected = self.matches
        return len(collected)

class ColocationIds:
    """Registry of collocations, keyed by collocation id tuple.

    Every key maps to a ``StructureMatch`` that gathers the individual
    matches found for that collocation.
    """

    def __init__(self, min_frequency=None):
        """
        :param min_frequency: minimal frequency setting.  When omitted, it
            is taken from the ``min_freq`` attribute of the module-level
            ``args`` namespace set up by the script entry point, or 0 if
            no such namespace exists (e.g. when imported as a module).
        """
        self.data = {}
        if min_frequency is None:
            min_frequency = getattr(globals().get('args'), 'min_freq', 0)
        self.min_frequency = min_frequency

    def _add_match(self, key, sid, match):
        """Store ``match`` under ``key``, creating the entry on first use.

        New entries receive a sequential, 1-based match id (as a string).
        """
        if key not in self.data:
            self.data[key] = StructureMatch(str(len(self.data) + 1), sid)
        self.data[key].append(match)

    def get(self, key, n):
        """Return the ``n``-th match recorded for collocation ``key``."""
        # the entry itself is not subscriptable; index its list of matches
        return self.data[key].matches[n]

    def add_matches(self, matches):
        """Register matches given as ``{structure id: [(match, key), ...]}``."""
        for sid, nms in matches.items():
            for match, key in nms:
                self._add_match(key, sid, match)

    def get_matches_for(self, structure_id, group):
        """Yield ``(match id, words, frequency, representations)`` tuples.

        Only collocations of ``structure_id`` are considered.  With
        ``group`` true a single tuple (the first match) is produced per
        collocation; otherwise one per match.
        """
        for sm in self.data.values():
            if sm.structure_id != structure_id:
                continue

            for words in sm.matches:
                yield (sm.match_id, words, len(sm), sm.representations)
                if group:
                    break

    def set_representations(self, structures, word_renderer):
        """Compute representations for every collocation (with progress bar)."""
        components_dict = {structure.id: structure for structure in structures}
        for sm in tqdm(self.data.values()):
            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)


def match_file(words, structures):
    """Match every structure against every word of one file.

    :returns: ``{structure id: [(match, collocation id), ...]}`` where the
        collocation id is the tuple ``(structure id, (index, lemma), ...)``
        with the pairs ordered by index
    """
    matches = {}
    for s in structures:
        matches[s.id] = []

    for _position, structure in tqdm(list(enumerate(structures))):
        found_for_structure = matches[structure.id]

        for word in words:
            found = structure.match(word)
            logging.debug("  GOT: {}".format(len(found)))

            for match in found:
                lemma_pairs = sorted(
                    ((cidx, cword.lemma) for cidx, cword in match.items()),
                    key=lambda pair: pair[0],
                )
                colocation_id = (structure.id, *lemma_pairs)
                found_for_structure.append((match, colocation_id))

    return matches


def main(input_file, structures_file, args):
    """Run the extraction: load structures, match the inputs, write results.

    :param input_file: unused; the input files are read from ``args.input``
    :param structures_file: path passed to ``load_structures``
    :param args: parsed command-line namespace

    With ``args.parallel`` set, every input file is processed by a child
    process running this same script with ``--match-to-file``; the pickled
    results are then merged here.  With ``args.match_to_file`` set (child
    mode), the result for the first input file is pickled and the function
    returns without writing any output.
    """
    structures, lemma_msds = load_structures(structures_file)

    colocation_ids = ColocationIds()
    word_renderer = WordMsdRenderer(lemma_msds)

    # keeps the closing debug output well-defined even without input files
    matches = {}

    if args.parallel:
        num_parallel = int(args.parallel)

        # Build the child command line on a copy so that sys.argv itself
        # stays untouched.  Drop "--parallel X" / "--parallel=X" ...
        cmd = []
        skip_value = False
        for token in sys.argv:
            if skip_value:
                skip_value = False
            elif token == '--parallel':
                skip_value = True
            elif not token.startswith('--parallel='):
                cmd.append(token)

        # ... and the input files; each child gets exactly one of them.
        for inpt in args.input:
            if inpt in cmd:
                cmd.remove(inpt)

        # make temporary directory to hold temporary files
        with tempfile.TemporaryDirectory() as tmpdirname:
            def result_path(n):
                return "{}/{}.p".format(tmpdirname, n)

            def func(n):
                cmdn = [sys.executable] + cmd + [args.input[n], "--match-to-file", result_path(n)]
                subprocess.check_call(cmdn)
                return n

            # threads only wait for the subprocesses, which do the real work
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # map() yields results in input order as they become ready
                for id_input in executor.map(func, range(len(args.input))):
                    # pickle is acceptable here: the file was just written
                    # by our own child process into a private directory
                    with open(result_path(id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    colocation_ids.add_matches(matches)
                    word_renderer.add_words(words)

    else:
        for words in load_files(args):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                return

            colocation_ids.add_matches(matches)
            word_renderer.add_words(words)

    # get word renders for lemma/msd
    word_renderer.generate_renders()

    if args.output:
        # figure out representations!
        colocation_ids.set_representations(structures, word_renderer)
        Writer.make_output_writer(args).write_out(structures, colocation_ids)
    if args.all:
        Writer.make_all_writer(args).write_out(structures, colocation_ids)

    # NOTE: these figures cover only the file processed last
    logging.debug([(k, len(v)) for k, v in matches.items()])
    logging.debug(sum(len(v) for _, v in matches.items()))

if __name__ == '__main__':
    # Script entry point: parse the command line, configure logging and run
    # main(), logging the elapsed wall-clock time at the end.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.',
    )

    # positional arguments
    parser.add_argument(
        'structures',
        help='Structures definitions in xml file',
    )
    parser.add_argument(
        'input',
        nargs='+',
        help='input xml file in `ssj500k form`, can list more than one',
    )

    # output destinations
    parser.add_argument(
        '--output',
        help='Output file (if none given, then output to stdout)',
    )
    parser.add_argument(
        '--all',
        help='Additional output file, writes more data',
    )

    # processing switches
    parser.add_argument(
        '--no-msd-translate',
        action='store_true',
        help='MSDs are translated from slovene to english by default',
    )
    parser.add_argument(
        '--skip-id-check',
        action='store_true',
        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
    )
    parser.add_argument(
        '--min_freq',
        type=int,
        nargs='?',
        default=0,
        const=1,
        help='Minimal frequency in output',
    )
    parser.add_argument(
        '--verbose',
        nargs='?',
        choices=["warning", "info", "debug"],
        default="info",
        const="info",
        help='Enable verbose output to stderr',
    )
    parser.add_argument(
        '--count-files',
        action='store_true',
        help="Count files: more verbose output",
    )
    parser.add_argument(
        '--multiple-output',
        action='store_true',
        help='Generate one output for each syntactic structure',
    )

    # row ordering
    parser.add_argument(
        '--sort-by',
        type=int,
        default=-1,
        help="Sort by a this column (index)",
    )
    parser.add_argument(
        '--sort-reversed',
        action='store_true',
        help="Sort in reversed ored",
    )

    # input format and parallel execution
    parser.add_argument(
        '--pc-tag',
        default="pc",
        help='Tag for separators, usually pc or c',
    )
    parser.add_argument(
        '--parallel',
        help='Run in multiple processes, should speed things up',
    )
    parser.add_argument(
        '--match-to-file',
        help='Do not use!',
    )

    # `args` stays a module-level name: ColocationIds reads it as a global
    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args.input, args.structures, args)
    logging.info("TIME: {}".format(time.time() - start))