accommodating for #773
This commit is contained in:
		
							parent
							
								
									106db9394e
								
							
						
					
					
						commit
						cddeb9c4e4
					
				
							
								
								
									
										619
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										619
									
								
								wani.py
									
									
									
									
									
								
							@ -3,12 +3,14 @@ import re
 | 
			
		||||
from enum import Enum
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
import sys
 | 
			
		||||
import logging
 | 
			
		||||
 | 
			
		||||
from msd_translate import MSD_TRANSLATE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
STAVKI = sys.argv[1]
 | 
			
		||||
STRUKTURE = sys.argv[2] # "Kolokacije_strukture_09_new-system.xml"
 | 
			
		||||
STRUKTURE = sys.argv[2]
 | 
			
		||||
FILE_OUT = sys.argv[3]
 | 
			
		||||
 | 
			
		||||
CODES = {
 | 
			
		||||
    "Noun": "N",
 | 
			
		||||
@ -98,39 +100,94 @@ TAGSET = {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CATEGORY_BASES = {
 | 
			
		||||
    "N": ['.', '.', '.', '.', '.?'],
 | 
			
		||||
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
 | 
			
		||||
    "A": ['.', '.', '.', '.', '.', '.?'],
 | 
			
		||||
    "R": ['.', '.?'],
 | 
			
		||||
    "P": ['. ', '.', '.', '.', '.', '.', '.?', '.?'],
 | 
			
		||||
    "M": ['.', '.', '.', '.?', '.?', '.?'],
 | 
			
		||||
    "S": ['.'],
 | 
			
		||||
    "C": ['.'],
 | 
			
		||||
    "N": ['.'] * 5,
 | 
			
		||||
    "V": ['.'] * 7,
 | 
			
		||||
    "A": ['.'] * 6,
 | 
			
		||||
    "R": ['.'] * 2,
 | 
			
		||||
    "P": ['.'] * 6,
 | 
			
		||||
    "M": ['.'] * 6,
 | 
			
		||||
    "S": ['.'] * 1,
 | 
			
		||||
    "C": ['.'] * 1,
 | 
			
		||||
    "Q": [],
 | 
			
		||||
    "I": [],
 | 
			
		||||
    "Y": [],
 | 
			
		||||
    "X": ['.?']
 | 
			
		||||
    "X": ['.'] * 1
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RestrictionType(Enum):
    """Kind of check a restriction performs.

    ``Morphology`` and ``Lexis`` correspond to restriction tags of type
    "morphology" and "lexis"; ``MatchAll`` is used when no restriction tag
    is given.
    """

    # Values are 0, 1, 2 in declaration order.
    Morphology, Lexis, MatchAll = range(3)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ComponentLevel(Enum):
 | 
			
		||||
class Rendition(Enum):
    """How a matched word is turned into output text.

    ``Lemma`` selects the word's lemma, ``WordForm`` its surface text and
    ``Unknown`` means no rendition has been configured.
    """

    # Values are 0, 1, 2 in declaration order.
    Lemma, WordForm, Unknown = range(3)
 | 
			
		||||
 | 
			
		||||
class ComponentRendition:
    """Stores a component's rendition mode and renders words with it."""

    def __init__(self, rendition=Rendition.Unknown):
        self.rendition = rendition
        self.word_form = {}

    def render(self, word):
        """Return the output text for *word* under the configured rendition.

        Returns ``word.lemma`` for ``Rendition.Lemma``, ``word.text`` for
        ``Rendition.WordForm`` and ``None`` for ``Rendition.Unknown``.

        Raises:
            RuntimeError: if ``self.rendition`` is none of the above.
        """
        mode = self.rendition
        if mode == Rendition.Lemma:
            return word.lemma
        if mode == Rendition.WordForm:
            return word.text
        if mode == Rendition.Unknown:
            return None
        raise RuntimeError("Unknown rendition: {}".format(mode))

    def __str__(self):
        # Printed form is simply that of the underlying rendition value.
        return str(self.rendition)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# dont know...
 | 
			
		||||
class StructureSelection(Enum):
    """Selection mode of a structure.

    ``Frequency`` and ``All`` are what a representation feature with
    ``selection="frequency"`` / ``selection="all"`` is parsed into.
    """

    # Values are 0 and 1 in declaration order.
    All, Frequency = range(2)
 | 
			
		||||
 | 
			
		||||
class ComponentRepresentation:
    """Factory for the objects described by a representation feature."""

    @staticmethod
    def new(s):
        """Build the representation object described by the attribute dict *s*.

        Declared as a static method so that it works both as
        ``ComponentRepresentation.new(s)`` and when called on an instance.

        Args:
            s: mapping of feature attributes. Either a ``'rendition'`` key
                or a ``'selection'`` key (plus ``'value'`` for selections
                other than "frequency" / "all") is expected.

        Returns:
            A ``ComponentRendition`` for ``rendition`` "lemma" / "word_form",
            a ``StructureSelection`` member for ``selection`` "frequency" /
            "all", or a one-item dict ``{selection: value}`` for any other
            selection.

        Raises:
            NotImplementedError: for an unknown rendition, or when *s* has
                neither a ``'rendition'`` nor a ``'selection'`` key.
        """
        if 'rendition' in s:
            rendition = s['rendition']
            if rendition == "lemma":
                return ComponentRendition(Rendition.Lemma)
            if rendition == "word_form":
                return ComponentRendition(Rendition.WordForm)
            raise NotImplementedError("Rendition: {}".format(s))

        if 'selection' in s:
            selection = s['selection']
            if selection == "frequency":
                return StructureSelection.Frequency
            if selection == "all":
                return StructureSelection.All
            # Any other selection is an attribute/value filter on the word.
            return {selection: s['value']}

        raise NotImplementedError("Representation: {}".format(s))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ComponentStatus(Enum):
    """Whether a component may, must or must not be present in a match."""

    # Values are 0, 1, 2 in declaration order.
    Optional, Required, Forbidden = range(3)

    def __str__(self):
        """Return the one-character marker: "?" optional, "!" required, "X" forbidden."""
        # Built locally: a class-level dict inside an Enum would become a member.
        markers = {"Optional": "?", "Required": "!"}
        return markers.get(self.name, "X")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_level(restriction):
 | 
			
		||||
    for feature in restriction:
 | 
			
		||||
        if "level" in feature.keys():
 | 
			
		||||
            lvl = feature.get("level")
 | 
			
		||||
            if lvl == "lemma":
 | 
			
		||||
                return ComponentLevel.Lemma
 | 
			
		||||
            elif lvl == "word_form":
 | 
			
		||||
                return ComponentLevel.WordForm
 | 
			
		||||
        else:
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
@ -140,28 +197,44 @@ def get_level(restriction):
 | 
			
		||||
def build_morphology_regex(restriction):
 | 
			
		||||
    restr_dict = {}
 | 
			
		||||
    for feature in restriction:
 | 
			
		||||
        restr_dict.update(feature.items())
 | 
			
		||||
        feature_dict = dict(feature.items())
 | 
			
		||||
 | 
			
		||||
        match_type = True
 | 
			
		||||
        if "filter" in feature_dict:
 | 
			
		||||
            assert(feature_dict['filter'] == "negative")
 | 
			
		||||
            match_type = False
 | 
			
		||||
            del feature_dict['filter']
 | 
			
		||||
 | 
			
		||||
        assert(len(feature_dict) == 1)
 | 
			
		||||
        key, value = next(iter(feature_dict.items()))
 | 
			
		||||
        restr_dict[key] = (value, match_type)
 | 
			
		||||
 | 
			
		||||
    assert('POS' in restr_dict)
 | 
			
		||||
    category = restr_dict['POS'].capitalize()
 | 
			
		||||
    category = restr_dict['POS'][0].capitalize()
 | 
			
		||||
    cat_code = CODES[category]
 | 
			
		||||
    rgx = [cat_code] + CATEGORY_BASES[cat_code]
 | 
			
		||||
 | 
			
		||||
    del restr_dict['POS']
 | 
			
		||||
    del restr_dict['level']
 | 
			
		||||
 | 
			
		||||
    for attribute, value in restr_dict.items():
 | 
			
		||||
    for attribute, (value, typ) in restr_dict.items():
 | 
			
		||||
        index = TAGSET[cat_code].index(attribute.lower())
 | 
			
		||||
        assert(index >= 0)
 | 
			
		||||
 | 
			
		||||
        if '|' in value:
 | 
			
		||||
            match = '[' + "".join(CODES[val] for val in value.split('|')) + ']'
 | 
			
		||||
            match = "".join(CODES[val] for val in value.split('|'))
 | 
			
		||||
        else:
 | 
			
		||||
            match = CODES[value]
 | 
			
		||||
 | 
			
		||||
        match = "[{}{}]".format("" if typ else "^", match)
 | 
			
		||||
        rgx[index + 1] = match
 | 
			
		||||
 | 
			
		||||
    return re.compile("".join(rgx))
 | 
			
		||||
    def matcher(text):
 | 
			
		||||
        for c, r in zip(text, rgx):
 | 
			
		||||
            if not re.match(r, c):
 | 
			
		||||
                return False
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    return " ".join(rgx), matcher
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def build_lexis_regex(restriction):
 | 
			
		||||
@ -169,18 +242,27 @@ def build_lexis_regex(restriction):
 | 
			
		||||
    for feature in restriction:
 | 
			
		||||
        restr_dict.update(feature.items())
 | 
			
		||||
 | 
			
		||||
    return re.compile(restr_dict['lemma'])
 | 
			
		||||
    assert("lemma" in restr_dict)
 | 
			
		||||
    match_list = restr_dict['lemma'].split('|')
 | 
			
		||||
 | 
			
		||||
    return match_list, lambda text: text in match_list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Restriction:
 | 
			
		||||
    def __init__(self, restriction_tag):
 | 
			
		||||
        if restriction_tag is None:
 | 
			
		||||
            self.type = RestrictionType.MatchAll
 | 
			
		||||
            self.matcher = None
 | 
			
		||||
            self.present = None
 | 
			
		||||
            return
 | 
			
		||||
        
 | 
			
		||||
        restriction_type = restriction_tag.get('type')
 | 
			
		||||
        if restriction_type == "morphology":
 | 
			
		||||
            self.type = RestrictionType.Morphology
 | 
			
		||||
            self.matcher = build_morphology_regex(list(restriction_tag))
 | 
			
		||||
            self.present, self.matcher = build_morphology_regex(list(restriction_tag))
 | 
			
		||||
        elif restriction_type == "lexis":
 | 
			
		||||
            self.type = RestrictionType.Lexis
 | 
			
		||||
            self.matcher = build_lexis_regex(list(restriction_tag))
 | 
			
		||||
            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
 | 
			
		||||
        else:
 | 
			
		||||
            raise NotImplementedError()
 | 
			
		||||
 | 
			
		||||
@ -189,128 +271,235 @@ class Restriction:
 | 
			
		||||
            match_to = word.msd
 | 
			
		||||
        elif self.type == RestrictionType.Lexis:
 | 
			
		||||
            match_to = word.lemma
 | 
			
		||||
        elif self.type == RestrictionType.MatchAll:
 | 
			
		||||
            return True
 | 
			
		||||
        else:
 | 
			
		||||
            raise RuntimeError("Unreachable!")
 | 
			
		||||
 | 
			
		||||
        return self.matcher.match(match_to)
 | 
			
		||||
        return self.matcher(match_to)
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        return "({:s} {})".format(str(self.type).split('.')[1], self.matcher)
 | 
			
		||||
        return "({:s} {})".format(str(self.type).split('.')[1], self.present)
 | 
			
		||||
 | 
			
		||||
    def __repr__(self):
 | 
			
		||||
        return str(self)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Component:
 | 
			
		||||
    def __init__(self, name, idx):
 | 
			
		||||
        assert(idx is not None)
 | 
			
		||||
    def __init__(self, info):
 | 
			
		||||
        idx = info['cid']
 | 
			
		||||
        name = info['name'] if 'name' in info else None
 | 
			
		||||
 | 
			
		||||
        self.name = name if name is not None else ""  # for printing...
 | 
			
		||||
        if 'status' not in info:
 | 
			
		||||
            status = ComponentStatus.Required
 | 
			
		||||
        elif info['status'] == 'forbidden':
 | 
			
		||||
            status = ComponentStatus.Forbidden
 | 
			
		||||
        elif info['status'] == 'obligatory':
 | 
			
		||||
            status = ComponentStatus.Required
 | 
			
		||||
        elif info['status'] == 'optional':
 | 
			
		||||
            status = ComponentStatus.Optional
 | 
			
		||||
        else:
 | 
			
		||||
            raise NotImplementedError("strange status: {}".format(info['status']))
 | 
			
		||||
 | 
			
		||||
        self.status = status
 | 
			
		||||
        self.name = name
 | 
			
		||||
        self.idx = idx
 | 
			
		||||
        self.restriction = None
 | 
			
		||||
        self.next_element = []
 | 
			
		||||
        self.level = None
 | 
			
		||||
        self.rendition = ComponentRendition()
 | 
			
		||||
        self.selection = {}
 | 
			
		||||
 | 
			
		||||
        self.iter_ctr = 0
 | 
			
		||||
 | 
			
		||||
    def word_to_str(self, word):
 | 
			
		||||
        if self.level == ComponentLevel.Lemma:
 | 
			
		||||
            return word.lemma, word.msd
 | 
			
		||||
        elif self.level == ComponentLevel.WordForm:
 | 
			
		||||
            return word.text, word.msd
 | 
			
		||||
        else:
 | 
			
		||||
            raise RuntimeError("Unreachable")
 | 
			
		||||
 | 
			
		||||
    def __iter__(self):
 | 
			
		||||
        self.iter_ctr = 0
 | 
			
		||||
        return self
 | 
			
		||||
 | 
			
		||||
    def __next__(self):
 | 
			
		||||
        if self.iter_ctr < len(self.next_element):
 | 
			
		||||
            to_ret = self.next_element[self.iter_ctr]
 | 
			
		||||
            self.iter_ctr += 1
 | 
			
		||||
            return to_ret
 | 
			
		||||
        else:
 | 
			
		||||
            raise StopIteration
 | 
			
		||||
    def render_word(self, word):
 | 
			
		||||
        return self.rendition.render(word)
 | 
			
		||||
 | 
			
		||||
    def add_next(self, next_component, link_label):
 | 
			
		||||
        self.next_element.append((next_component, link_label))
 | 
			
		||||
 | 
			
		||||
    def set_restriction(self, restrictions_tag):
 | 
			
		||||
        if restrictions_tag.tag == "restriction":
 | 
			
		||||
        if restrictions_tag is None:
 | 
			
		||||
            self.restriction = Restriction(None)
 | 
			
		||||
 | 
			
		||||
        elif restrictions_tag.tag == "restriction":
 | 
			
		||||
            self.restriction = Restriction(restrictions_tag)
 | 
			
		||||
            self.level = get_level(restrictions_tag)
 | 
			
		||||
 | 
			
		||||
        elif restrictions_tag.tag == "restriction_or":
 | 
			
		||||
            self.restriction = [Restriction(el) for el in restrictions_tag]
 | 
			
		||||
            self.level = get_level(restrictions_tag[0])
 | 
			
		||||
 | 
			
		||||
            # same level for every restriction for now and only or available
 | 
			
		||||
            levels = [get_level(el) for el in restrictions_tag]
 | 
			
		||||
            assert(len(set(levels)) == 1)
 | 
			
		||||
 | 
			
		||||
        else:
 | 
			
		||||
            raise RuntimeError("Unreachable")
 | 
			
		||||
 | 
			
		||||
    def find_next(self, deps, comps, restrs):
 | 
			
		||||
    def set_representation(self, representation):
 | 
			
		||||
        cr = None
 | 
			
		||||
        if representation is not None:
 | 
			
		||||
            self.representation = []
 | 
			
		||||
 | 
			
		||||
            for feature in representation:
 | 
			
		||||
                f = ComponentRepresentation.new(dict(feature.attrib))
 | 
			
		||||
 | 
			
		||||
                if type(f) is StructureSelection:
 | 
			
		||||
                    assert(cr is None)
 | 
			
		||||
                    cr = f
 | 
			
		||||
                elif type(f) is ComponentRendition:
 | 
			
		||||
                    self.rendition = f
 | 
			
		||||
                elif type(f) is dict:
 | 
			
		||||
                    self.selection.update(f)
 | 
			
		||||
                else:
 | 
			
		||||
                    raise RuntimeError("Unreachable: {}".format(f))
 | 
			
		||||
 | 
			
		||||
        return cr
 | 
			
		||||
 | 
			
		||||
    def find_next(self, deps, comps, restrs, reprs):
 | 
			
		||||
        representation = StructureSelection.All
 | 
			
		||||
 | 
			
		||||
        to_ret = []
 | 
			
		||||
        for d in deps:
 | 
			
		||||
            if d[0] == self.idx:
 | 
			
		||||
                _, idx, dep_label = d
 | 
			
		||||
 | 
			
		||||
                next_component = Component(comps[idx], idx)
 | 
			
		||||
                next_component = Component(comps[idx])
 | 
			
		||||
                next_component.set_restriction(restrs[idx])
 | 
			
		||||
                r1 = next_component.set_representation(reprs[idx])
 | 
			
		||||
                to_ret.append(next_component)
 | 
			
		||||
 | 
			
		||||
                self.add_next(next_component, dep_label)
 | 
			
		||||
                next_component.find_next(deps, comps, restrs)
 | 
			
		||||
                others, r2 = next_component.find_next(deps, comps, restrs, reprs)
 | 
			
		||||
                to_ret.extend(others)
 | 
			
		||||
 | 
			
		||||
                if StructureSelection.Frequency in (r1, r2):
 | 
			
		||||
                    representation = StructureSelection.Frequency
 | 
			
		||||
 | 
			
		||||
        return to_ret, representation
 | 
			
		||||
 | 
			
		||||
    def name_str(self):
 | 
			
		||||
        return "_" if self.name is None else self.name
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        el = "({:10} {})".format(self.name, str(self.restriction))
 | 
			
		||||
        for next, link in self:
 | 
			
		||||
            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(next))
 | 
			
		||||
        n = self.name_str()
 | 
			
		||||
        return "{:s}) {:7s}:{} [{}] :{}".format(
 | 
			
		||||
                self.idx, n, self.status, self.restriction, self.rendition)
 | 
			
		||||
 | 
			
		||||
    def tree(self):
 | 
			
		||||
        el = []
 | 
			
		||||
        for next, link in self.next_element:
 | 
			
		||||
            el.append("{:3} -- {:5} --> {:3}".format(self.idx, link, next.idx))
 | 
			
		||||
            el.extend(next.tree())
 | 
			
		||||
        return el
 | 
			
		||||
 | 
			
		||||
    def __repr__(self):
 | 
			
		||||
        return str(self)
 | 
			
		||||
 | 
			
		||||
    def match(self, word):
 | 
			
		||||
        m1 = self._match_self(word)
 | 
			
		||||
        if m1 is None:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        mn = self._match_next(word)
 | 
			
		||||
        if mn is None:
 | 
			
		||||
            return None
 | 
			
		||||
        
 | 
			
		||||
        to_ret = [m1]
 | 
			
		||||
        for cmatch in mn:
 | 
			
		||||
            # if good match but nothing to add, just continue
 | 
			
		||||
            if len(cmatch) == 0:
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            # if more than one match found for particular component
 | 
			
		||||
            elif len(cmatch) > 1:
 | 
			
		||||
                logging.debug("MULTIPLE: {}, {}".format(self.idx, cmatch))
 | 
			
		||||
                # if more than one match in multiple components, NOPE!
 | 
			
		||||
                if len(to_ret) > 1:
 | 
			
		||||
                    logging.warning("Strange multiple match: {}".format(
 | 
			
		||||
                        str([w.id for w in cmatch[0].values()])))
 | 
			
		||||
 | 
			
		||||
                    for tr in to_ret:
 | 
			
		||||
                        tr.update(cmatch[0])
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                # yeah, so we have found more than one match, =>
 | 
			
		||||
                # more than one element in to_ret
 | 
			
		||||
                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
 | 
			
		||||
 | 
			
		||||
            else:
 | 
			
		||||
                for tr in to_ret:
 | 
			
		||||
                    tr.update(cmatch[0])
 | 
			
		||||
 | 
			
		||||
        logging.debug("MA: {}".format(str(to_ret)))
 | 
			
		||||
        return to_ret
 | 
			
		||||
 | 
			
		||||
    def _match_self(self, word):
 | 
			
		||||
        matched = None
 | 
			
		||||
 | 
			
		||||
        # matching
 | 
			
		||||
        if type(self.restriction) is list:
 | 
			
		||||
            for restr in self.restriction:
 | 
			
		||||
                matched = restr.match(word)
 | 
			
		||||
                if matched is not None:
 | 
			
		||||
                if matched: # match either
 | 
			
		||||
                    break
 | 
			
		||||
        else:
 | 
			
		||||
            matched = self.restriction.match(word)
 | 
			
		||||
 | 
			
		||||
        # recurse to next
 | 
			
		||||
        if matched:
 | 
			
		||||
            to_ret = [self.word_to_str(word)]
 | 
			
		||||
        logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))
 | 
			
		||||
 | 
			
		||||
        # check with status
 | 
			
		||||
        # if self.status is ComponentStatus.Optional:
 | 
			
		||||
        #     if not matched:
 | 
			
		||||
        #         # nothing to add, but still good...
 | 
			
		||||
        #         return {}
 | 
			
		||||
        # elif self.status is ComponentStatus.Forbidden:
 | 
			
		||||
        #     # forbiddent is handled at return stage in _match_next
 | 
			
		||||
        #     # just process normally...
 | 
			
		||||
        #     pass
 | 
			
		||||
 | 
			
		||||
        # recurse to next
 | 
			
		||||
        if not matched:
 | 
			
		||||
            return None
 | 
			
		||||
        else:
 | 
			
		||||
            return {self.idx: word}
 | 
			
		||||
 | 
			
		||||
    def _match_next(self, word):
 | 
			
		||||
        # matches for every component in links from this component
 | 
			
		||||
        to_ret = []
 | 
			
		||||
 | 
			
		||||
            for next, link in self:
 | 
			
		||||
        # need to get all links that match
 | 
			
		||||
        for next, link in self.next_element:
 | 
			
		||||
            logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx))
 | 
			
		||||
            to_ret.append([])
 | 
			
		||||
 | 
			
		||||
            # good flag
 | 
			
		||||
            good = next.status != ComponentStatus.Required
 | 
			
		||||
            for next_word in word.get_links(link):
 | 
			
		||||
                logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
 | 
			
		||||
                match = next.match(next_word)
 | 
			
		||||
                    # if matches, return
 | 
			
		||||
 | 
			
		||||
                if match is not None:
 | 
			
		||||
                        to_ret.extend(match)
 | 
			
		||||
                    # special treatement for forbidden
 | 
			
		||||
                    if next.status == ComponentStatus.Forbidden:
 | 
			
		||||
                        good = False
 | 
			
		||||
                        break
 | 
			
		||||
 | 
			
		||||
                # if none matched, nothing found!
 | 
			
		||||
                    else:
 | 
			
		||||
                        assert(type(match) is list)
 | 
			
		||||
                        to_ret[-1].extend(match)
 | 
			
		||||
                        good = True
 | 
			
		||||
 | 
			
		||||
            # if none matched, nothing found!
 | 
			
		||||
            if not good:
 | 
			
		||||
                logging.debug("BAD")
 | 
			
		||||
                return None
 | 
			
		||||
 | 
			
		||||
        return to_ret
 | 
			
		||||
 | 
			
		||||
        # return None...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SyntacticStructure:
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.root_component = Component("", 'root')
 | 
			
		||||
        self.id = None
 | 
			
		||||
        self.lbs = None
 | 
			
		||||
        self.agreements = []
 | 
			
		||||
        self.components = []
 | 
			
		||||
        self.selection = StructureSelection.All
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def from_xml(xml):
 | 
			
		||||
@ -318,26 +507,144 @@ class SyntacticStructure:
 | 
			
		||||
        st.id = xml.get('id')
 | 
			
		||||
        st.lbs = xml.get('LBS')
 | 
			
		||||
        
 | 
			
		||||
        components, system = list(xml)
 | 
			
		||||
        dependencies, restrictions = list(system)
 | 
			
		||||
        if float(st.id.replace('-','.')) >= 17:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        assert(len(list(xml)) == 1)
 | 
			
		||||
        system = next(iter(xml))
 | 
			
		||||
 | 
			
		||||
        assert(system.get('type') == 'JOS')
 | 
			
		||||
        components, dependencies, definitions = list(system)
 | 
			
		||||
 | 
			
		||||
        deps = [ (dep.get('from'), dep.get('to'), dep.get('label')) for dep in dependencies ]
 | 
			
		||||
        comps = { comp.get('cid'): comp.get('name') for comp in components }
 | 
			
		||||
        restrs = { r.get('cid'): next(iter(r)) for r in restrictions }
 | 
			
		||||
        comps = { comp.get('cid'): dict(comp.items()) for comp in components }
 | 
			
		||||
 | 
			
		||||
        st.root_component.find_next(deps, comps, restrs)
 | 
			
		||||
        st.root_component = list(st.root_component)[0][0]  # get first next
 | 
			
		||||
        restrs, forms = {}, {}
 | 
			
		||||
 | 
			
		||||
        for comp in definitions:
 | 
			
		||||
            n = comp.get('cid')
 | 
			
		||||
            restrs[n] = None
 | 
			
		||||
            forms[n] = None
 | 
			
		||||
 | 
			
		||||
            for el in comp:
 | 
			
		||||
                if el.tag.startswith("restriction"):
 | 
			
		||||
                    assert(restrs[n] is None)
 | 
			
		||||
                    restrs[n] = el
 | 
			
		||||
                elif el.tag.startswith("representation"):
 | 
			
		||||
                    st.add_representation(n, el, forms)
 | 
			
		||||
                else:
 | 
			
		||||
                    raise NotImplementedError("definition??")
 | 
			
		||||
 | 
			
		||||
        fake_root_component = Component({'cid': '#', 'type': 'other'})
 | 
			
		||||
        st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms)
 | 
			
		||||
        return st
 | 
			
		||||
 | 
			
		||||
    def add_representation(self, n, el, forms):
 | 
			
		||||
        if el.tag == "representation":
 | 
			
		||||
            els = [el]
 | 
			
		||||
        elif el.tag == "representation_and":
 | 
			
		||||
            els = list(el)
 | 
			
		||||
        else:
 | 
			
		||||
            raise NotImplementedError("repr what?: {}".format(el.tag))
 | 
			
		||||
        
 | 
			
		||||
        for el in els:
 | 
			
		||||
            if el.get('basic') == 'form':
 | 
			
		||||
                assert(forms[n] is None)
 | 
			
		||||
                forms[n] = el
 | 
			
		||||
            elif el.get('basic') == "agreement":
 | 
			
		||||
                self.add_agreement(n, el)
 | 
			
		||||
            else:
 | 
			
		||||
                raise NotImplementedError("representation?: {}".format(el.tag))
 | 
			
		||||
 | 
			
		||||
    def add_agreement(self, n, el):
 | 
			
		||||
        assert(el.get('head')[:4] == 'cid_')
 | 
			
		||||
 | 
			
		||||
        n1 = n
 | 
			
		||||
        n2 = el.get('head')[4:]
 | 
			
		||||
        agreement_str = next(iter(el)).get('agreement')
 | 
			
		||||
 | 
			
		||||
        self.agreements.append({
 | 
			
		||||
            'n1': n1,
 | 
			
		||||
            'n2': n2,
 | 
			
		||||
            'match': agreement_str.split('|')})
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        arrow = "root       -- modra      --> "
 | 
			
		||||
        return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component))
 | 
			
		||||
        comp_str = "\n".join(str(comp) for comp in self.components)
 | 
			
		||||
 | 
			
		||||
        agrs = "\n".join("({} -[{}]- {}) ".format(
 | 
			
		||||
            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
 | 
			
		||||
 | 
			
		||||
        links_str = "\n".join(self.components[0].tree())
 | 
			
		||||
 | 
			
		||||
        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
 | 
			
		||||
                self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
 | 
			
		||||
 | 
			
		||||
    def get_component(self, idx):
 | 
			
		||||
        for c in self.components:
 | 
			
		||||
            if c.idx == idx:
 | 
			
		||||
                return c
 | 
			
		||||
        raise RuntimeError("Unknown component id: {}".format(idx))
 | 
			
		||||
 | 
			
		||||
    def check_agreements(self, match):
        """Return True when every recorded agreement holds for *match*.

        *match* maps component ids to matched words. For each agreement the
        two words' MSD codes are compared on every listed attribute; a code
        of ``'-'`` on either side, or an MSD too short to carry the
        attribute, counts as agreeing.
        """
        def attribute_code(word, attribute):
            # TAGSET is keyed by the first MSD character and lists the
            # attributes that follow it, hence the + 1 offset below.
            position = TAGSET[word.msd[0]].index(attribute)
            assert(position >= 0)
            if position + 1 >= len(word.msd):
                # MSD does not reach this attribute (e.g. an infinitive)
                return None
            return word.msd[position + 1]

        for agreement in self.agreements:
            first = match[agreement['n1']]
            second = match[agreement['n2']]

            for attribute in agreement['match']:
                code1 = attribute_code(first, attribute)
                if code1 is None:
                    continue
                code2 = attribute_code(second, attribute)
                if code2 is None:
                    continue

                # a real mismatch needs two specified, differing codes
                if code1 != code2 and code1 != '-' and code2 != '-':
                    return False

        return True
 | 
			
		||||
 | 
			
		||||
    def check_form(self, match):
        """Return True when every matched word satisfies its component's selection.

        *match* maps component ids to matched words. For each attribute in
        the component's ``selection`` dict, the word's MSD code is compared
        with ``CODES[value]``; a code of ``'-'`` on either side is accepted.
        """
        for component_id, word in match.items():
            selection = self.get_component(component_id).selection
            for attribute, wanted in selection.items():
                # TAGSET positions do not count the leading MSD character
                position = TAGSET[word.msd[0]].index(attribute.lower())
                actual = word.msd[position + 1]
                expected = CODES[wanted]

                if actual == expected or '-' in (actual, expected):
                    continue
                return False

        return True
 | 
			
		||||
 | 
			
		||||
    def match(self, word):
 | 
			
		||||
        return self.root_component.match(word)
 | 
			
		||||
        matches = self.components[0].match(word)
 | 
			
		||||
        if matches is None:
 | 
			
		||||
            return []
 | 
			
		||||
 | 
			
		||||
        to_ret = []
 | 
			
		||||
        for m in matches:
 | 
			
		||||
            if not self.check_agreements(m):
 | 
			
		||||
                bad = "Agreement"
 | 
			
		||||
            elif not self.check_form(m):
 | 
			
		||||
                bad = "Form"
 | 
			
		||||
            else:
 | 
			
		||||
                bad = "OK"
 | 
			
		||||
 | 
			
		||||
            to_ret.append((m, bad))
 | 
			
		||||
 | 
			
		||||
        return to_ret
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def build_structures(filename):
 | 
			
		||||
@ -345,14 +652,27 @@ def build_structures(filename):
 | 
			
		||||
    with open(filename, 'r') as fp:
 | 
			
		||||
        et = ElementTree.XML(fp.read())
 | 
			
		||||
        for structure in et.iter('syntactic_structure'):
 | 
			
		||||
            structures.append(SyntacticStructure.from_xml(structure))
 | 
			
		||||
            to_append = SyntacticStructure.from_xml(structure)
 | 
			
		||||
            if to_append is None:
 | 
			
		||||
                continue
 | 
			
		||||
            structures.append(to_append)
 | 
			
		||||
    return structures
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_msd(comp):
    """Return the MSD string stored in the attributes of *comp*.

    Args:
        comp: object whose ``items()`` yields attribute (name, value) pairs.

    Returns:
        The ``msd`` attribute if present; otherwise the ``ana`` attribute
        with its first four characters removed (presumably a prefix such
        as ``msd:`` -- confirm against the corpus format).

    Raises:
        NotImplementedError: if neither attribute is present.
    """
    d = dict(comp.items())
    if 'msd' in d:
        return d['msd']
    if 'ana' in d:
        return d['ana'][4:]

    # logging.error() takes no ``file`` keyword; passing one raised
    # TypeError and hid the NotImplementedError below.
    logging.error("%s", d)
    raise NotImplementedError("MSD?")
 | 
			
		||||
 | 
			
		||||
class Word:
 | 
			
		||||
    def __init__(self, xml):
 | 
			
		||||
        self.lemma = xml.get('lemma')
 | 
			
		||||
        self.msd = MSD_TRANSLATE[xml.get('msd')]
 | 
			
		||||
        self.msd = MSD_TRANSLATE[get_msd(xml)]
 | 
			
		||||
        self.id = xml.get('id')
 | 
			
		||||
        self.text = xml.text
 | 
			
		||||
        self.links = defaultdict(list)
 | 
			
		||||
@ -370,6 +690,10 @@ class Word:
 | 
			
		||||
        return self.links[link]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_root_id(id_):
    """Tell whether ``id_`` is made of exactly three dot-separated fields.

    Args:
        id_: identifier string such as ``"a.b.c"``.

    Returns:
        True when ``id_`` contains exactly two dots, False otherwise.
    """
    # Two separators <=> three fields, empty fields included.
    dot_count = id_.count('.')
    return dot_count == 2
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def load_corpus(filename):
 | 
			
		||||
    with open(filename, 'r') as fp:
 | 
			
		||||
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
 | 
			
		||||
@ -381,20 +705,38 @@ def load_corpus(filename):
 | 
			
		||||
    for w in et.iter("w"):
 | 
			
		||||
        words[w.get('id')] = Word(w)
 | 
			
		||||
 | 
			
		||||
    pcs = set()
 | 
			
		||||
    for pc in et.iter("pc"):
 | 
			
		||||
        pcs.add(pc.get('id'))
 | 
			
		||||
 | 
			
		||||
    for l in et.iter("link"):
 | 
			
		||||
        assert('dep' in l.keys() and 'from' in l.keys() and 'afun' in l.keys())
 | 
			
		||||
 | 
			
		||||
        if 'dep' in l.keys():
 | 
			
		||||
            ana = l.get('afun')
 | 
			
		||||
            lfrom = l.get('from')
 | 
			
		||||
        if lfrom in words:
 | 
			
		||||
            assert(not lfrom.endswith('.0'))
 | 
			
		||||
            next_word_id = l.get('dep')
 | 
			
		||||
            if next_word_id in words:
 | 
			
		||||
                next_word = words[next_word_id]
 | 
			
		||||
                words[l.get('from')].add_link(l.get('afun'), next_word)
 | 
			
		||||
            dest = l.get('dep')
 | 
			
		||||
        else:
 | 
			
		||||
            ana = l.get('ana')
 | 
			
		||||
            if ana[:4] != 'syn:': # dont bother...
 | 
			
		||||
                continue
 | 
			
		||||
            ana = ana[4:]
 | 
			
		||||
            lfrom, dest = l.get('target').replace('#', '').split()
 | 
			
		||||
 | 
			
		||||
        # catch modra links from root
 | 
			
		||||
        elif lfrom[-1] == '0' and l.get('afun') == 'modra':
 | 
			
		||||
            root_words.add(l.get('dep'))
 | 
			
		||||
        if lfrom in words:
 | 
			
		||||
            if is_root_id(lfrom):
 | 
			
		||||
                logging.error("NOO: ", lfrom, file=sys.stderr)
 | 
			
		||||
                sys.exit(1)
 | 
			
		||||
 | 
			
		||||
            if dest in words:
 | 
			
		||||
                next_word = words[dest]
 | 
			
		||||
                words[lfrom].add_link(ana, next_word)
 | 
			
		||||
 | 
			
		||||
        # catch links from root
 | 
			
		||||
        elif is_root_id(lfrom):
 | 
			
		||||
            root_words.add(dest)
 | 
			
		||||
 | 
			
		||||
        # catch links from <pc> :S
 | 
			
		||||
        elif lfrom in pcs:
 | 
			
		||||
            logging.warning(str(("link from <pc>: ", lfrom)))
 | 
			
		||||
 | 
			
		||||
        else:
 | 
			
		||||
            # strange errors, just skip...
 | 
			
		||||
@ -408,8 +750,6 @@ def load_corpus(filename):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main():
 | 
			
		||||
    words = load_corpus(STAVKI)
 | 
			
		||||
 | 
			
		||||
    import time
 | 
			
		||||
    t = time.time()
 | 
			
		||||
 | 
			
		||||
@ -417,21 +757,86 @@ def main():
 | 
			
		||||
    for s in structures:
 | 
			
		||||
        print(s)
 | 
			
		||||
 | 
			
		||||
    num_matches = 0
 | 
			
		||||
    # words = load_corpus(STAVKI)
 | 
			
		||||
    import pickle
 | 
			
		||||
    # with open("words.p", "wb") as fp:
 | 
			
		||||
    #     pickle.dump(words, fp)
 | 
			
		||||
    with open("words.p", "rb") as fp:
 | 
			
		||||
        words = pickle.load(fp)
 | 
			
		||||
 | 
			
		||||
    print("MATCHES...")
 | 
			
		||||
    matches = {s.id: [] for s in structures}
 | 
			
		||||
 | 
			
		||||
    for idx, s in enumerate(structures):
 | 
			
		||||
        print("\r{}/{}: {:7s}".format(idx, len(structures), s.id)) #, end="")
 | 
			
		||||
        for w in words:
 | 
			
		||||
            mhere = s.match(w)
 | 
			
		||||
            logging.debug("  GOT: {}".format(len(mhere)))
 | 
			
		||||
            for match, reason in mhere: 
 | 
			
		||||
                matches[s.id].append((match, reason))
 | 
			
		||||
    print("")
 | 
			
		||||
 | 
			
		||||
    header = [
 | 
			
		||||
            "Structure_ID", "Component_ID", "Token_ID", "Word_form", 
 | 
			
		||||
            "Lemma", "Msd", "Representative_form_1", "Component_ID", 
 | 
			
		||||
            "Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", 
 | 
			
		||||
            "Collocation_ID", "Joint_representative_form"]
 | 
			
		||||
    csv = [", ".join(header)]
 | 
			
		||||
 | 
			
		||||
    colocation_ids = {}
 | 
			
		||||
 | 
			
		||||
    for s in structures:
 | 
			
		||||
            m = s.match(w)
 | 
			
		||||
            if m is not None:
 | 
			
		||||
                num_matches += 1
 | 
			
		||||
                print(s.id, m)
 | 
			
		||||
        ms = matches[s.id]
 | 
			
		||||
 | 
			
		||||
        for m, reason in ms:
 | 
			
		||||
            colocation_id = [s.id]
 | 
			
		||||
            to_print = [s.id]
 | 
			
		||||
 | 
			
		||||
            m_sorted = defaultdict(lambda: None, m.items())
 | 
			
		||||
            for idx, comp in enumerate(s.components):
 | 
			
		||||
                idx = str(idx + 1)
 | 
			
		||||
                if idx not in m_sorted:
 | 
			
		||||
                    to_print.extend([idx, "", "", "", "", ""])
 | 
			
		||||
                else:
 | 
			
		||||
                    w = m_sorted[idx]
 | 
			
		||||
                    # if comp.render_word(m_sorted[idx]) is not None:
 | 
			
		||||
                    if True:
 | 
			
		||||
                        to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""])
 | 
			
		||||
                        colocation_id.append(w.lemma)
 | 
			
		||||
 | 
			
		||||
            colocation_id = tuple(colocation_id)
 | 
			
		||||
            if colocation_id in colocation_ids:
 | 
			
		||||
                cid = colocation_ids[colocation_id]
 | 
			
		||||
            else:
 | 
			
		||||
                cid = len(colocation_ids)
 | 
			
		||||
                colocation_ids[colocation_id] = cid
 | 
			
		||||
                
 | 
			
		||||
            to_print.extend([str(cid), ""])
 | 
			
		||||
            csv.append(", ".join(to_print))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    with open(FILE_OUT, "w") as fp:
 | 
			
		||||
        print("\n".join(csv), file=fp)
 | 
			
		||||
 | 
			
		||||
        # groups = defaultdict(int)
 | 
			
		||||
        # for m, reason in ms:
 | 
			
		||||
        #     if reason != "OK":
 | 
			
		||||
        #         continue
 | 
			
		||||
        #     lemmas = [(n, w.lemma) for n, w in m.items()]
 | 
			
		||||
        #     lemmas = tuple(sorted(lemmas, key=lambda x: x[0]))
 | 
			
		||||
        #     groups[lemmas] += 1
 | 
			
		||||
 | 
			
		||||
        # print(s.id)
 | 
			
		||||
        # print(groups)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    print("")
 | 
			
		||||
    print("TIME", time.time() - t)
 | 
			
		||||
    print(num_matches)
 | 
			
		||||
 | 
			
		||||
    print([(k, len(v)) for k, v in matches.items()])
 | 
			
		||||
    print(sum(len(v) for _, v in matches.items()))
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    logging.basicConfig(level=logging.INFO)
 | 
			
		||||
    main()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# 6, 7: examples of false matches?
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user