luscenje_struktur/src/restriction.py

import re
from enum import Enum

from codes_tagset import CODES, TAGSET


class RestrictionType(Enum):
    """Kinds of restriction that can be attached to a structure component."""

    # Restriction checked against a word's ``msd`` attribute.
    Morphology = 0
    # Restriction checked against a word's ``lemma`` attribute.
    Lexis = 1
    # No restriction at all: every word is accepted.
    MatchAll = 2


def determine_ppb(rgx):
    """Return the "ppb" rank (an int from 0 to 4) for a morphology pattern.

    ``rgx`` is an indexable sequence of per-position patterns: ``rgx[0]`` is
    the category code and ``rgx[1]``, when present, is the pattern for the
    first attribute slot.

    * categories ``A``, ``N`` and ``R`` -> 0
    * category ``V`` -> 3 if the first attribute pattern contains ``a``,
      1 if it contains ``m``, otherwise 2 (also when there is no attribute
      pattern at all)
    * any other category -> 4

    NOTE(review): "ppb" presumably abbreviates "polnopomenska beseda"
    (content word), and ``a``/``m`` presumably stand for auxiliary/main verb
    types - confirm against the tagset.
    """
    category = rgx[0]

    if category != "V":
        return 0 if category in ("A", "N", "R") else 4

    # Verbs: the rank depends on the first attribute slot, if there is one.
    if len(rgx) > 1:
        first_attribute = rgx[1]
        if 'a' in first_attribute:
            return 3
        if 'm' in first_attribute:
            return 1
    return 2

class MorphologyRegex:
    """Position-wise matcher for MSD strings, built from a morphology restriction.

    ``restriction`` is an iterable of features.  Each feature must expose
    ``items()`` yielding exactly one attribute/value pair, optionally
    accompanied by a ``filter="negative"`` pair that inverts the feature.
    One feature must use the key ``POS``; its capitalized value is looked up
    in ``CODES`` to obtain the category code.  Every other attribute is
    located in ``TAGSET[category code]`` and turned into a one-character
    class (``[...]`` or ``[^...]``) at the corresponding MSD position; a
    value may list several alternatives separated by ``|``.

    Attributes:
        rgx: list of per-position pattern strings (trailing ``.`` wildcards
            removed); ``rgx[0]`` is the category code.
        re_objects: the compiled form of ``rgx``, one regex per position.
        min_msd_length: highest MSD index constrained by a positive feature;
            a candidate must be strictly longer than this to match.

    Raises:
        AssertionError: if a feature has an unexpected shape or ``POS`` is
            missing.
        KeyError: if a category or value is not present in ``CODES``.
        ValueError: if an attribute is not listed in the category's tagset.
    """

    # Attribute slots reserved after the category character.
    _ATTRIBUTE_SLOTS = 10

    def __init__(self, restriction):
        # Index 0 (the category character) is the only position that must
        # exist by default.  Starting at 0 rather than 1 lets a POS-only
        # restriction accept one-character MSDs.
        self.min_msd_length = 0

        restr_dict = {}
        for feature in restriction:
            feature_dict = dict(feature.items())

            is_positive = "filter" not in feature_dict
            if not is_positive:
                filter_value = feature_dict.pop('filter')
                assert filter_value == "negative", \
                    "unsupported filter: {!r}".format(filter_value)

            assert len(feature_dict) == 1, \
                "a feature must carry exactly one attribute"
            key, value = next(iter(feature_dict.items()))
            restr_dict[key] = (value, is_positive)

        assert 'POS' in restr_dict, "morphology restriction needs a POS feature"
        pos_value, _ = restr_dict.pop('POS')
        cat_code = CODES[pos_value.capitalize()]
        rgx = [cat_code] + ['.'] * self._ATTRIBUTE_SLOTS

        for attribute, (value, is_positive) in restr_dict.items():
            # ``.index`` raises ValueError for an unknown attribute, so no
            # separate range check is needed.
            slot = TAGSET[cat_code].index(attribute.lower()) + 1

            # A single value is just the one-element case of the '|' list.
            allowed = "".join(CODES[val] for val in value.split('|'))
            rgx[slot] = "[{}{}]".format("" if is_positive else "^", allowed)

            # Only positive features force the position to be present;
            # a negative feature is satisfied by an MSD that is too short.
            if is_positive:
                self.min_msd_length = max(slot, self.min_msd_length)

        # Drop trailing wildcards: they constrain nothing.
        while rgx and rgx[-1] == '.':
            rgx.pop()

        self.re_objects = [re.compile(r) for r in rgx]
        self.rgx = rgx

    def __call__(self, text):
        """Return True if the MSD string ``text`` satisfies the restriction."""
        if len(text) <= self.min_msd_length:
            return False

        # zip() stops at the shorter sequence, so positions beyond the end
        # of ``text`` (only negative features can be there) are not checked.
        return all(r.match(c) for c, r in zip(text, self.re_objects))


class LexisRegex:
    """Matcher that accepts a lemma if it is one of a fixed set of alternatives.

    ``restriction`` is an iterable of features, each exposing ``items()``
    that yields attribute/value pairs.  The pairs of all features are merged
    (later features override earlier ones) and the merged ``lemma`` value is
    split on ``|`` into ``match_list``.

    Raises:
        AssertionError: if no feature provides a ``lemma`` attribute.
    """

    def __init__(self, restriction):
        merged = {
            name: value
            for feature in restriction
            for name, value in feature.items()
        }

        assert "lemma" in merged
        alternatives = merged['lemma']
        self.match_list = alternatives.split('|')

    def __call__(self, text):
        """Return True if ``text`` equals one of the accepted lemmas."""
        accepted = self.match_list
        return text in accepted

class Restriction:
    """A single restriction on a word: by MSD, by lemma, or none at all.

    ``restriction_tag`` is either ``None`` (accept every word) or an object
    that answers ``get('type')`` and iterates over its features.
    NOTE(review): presumably an XML element whose children are the features -
    confirm against the caller.

    Attributes:
        type: the ``RestrictionType`` of this restriction.
        matcher: callable applied to the word's ``msd`` or ``lemma``;
            ``None`` for match-all restrictions.
        ppb: int rank from 0 to 4; stays 4 unless the restriction is
            morphological, in which case ``determine_ppb`` decides.
        present: set to ``None`` on match-all restrictions only.

    Raises:
        NotImplementedError: if the tag's ``type`` is neither
            ``"morphology"`` nor ``"lexis"``.
    """

    def __init__(self, restriction_tag):
        self.ppb = 4  # "polnopomenska beseda" rank (0-4)

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
        elif kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
            # Only morphological restrictions refine the default rank.
            self.ppb = determine_ppb(self.matcher.rgx)
        else:
            raise NotImplementedError()

    def match(self, word):
        """Return whether ``word`` satisfies this restriction.

        Morphology restrictions test ``word.msd``, lexis restrictions test
        ``word.lemma``, and match-all restrictions accept everything.
        """
        if self.type == RestrictionType.MatchAll:
            return True
        if self.type == RestrictionType.Morphology:
            return self.matcher(word.msd)
        if self.type == RestrictionType.Lexis:
            return self.matcher(word.lemma)

        raise RuntimeError("Unreachable!")
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`import re`
			`from enum import Enum`

			`from codes_tagset import CODES, TAGSET`


			`class RestrictionType(Enum):`
			`Morphology = 0`
			`Lexis = 1`
			`MatchAll = 2`


			`def determine_ppb(rgx):`
			`if rgx[0] in ("A", "N", "R"):`
			`return 0`
			`elif rgx[0] == "V":`
			`if len(rgx) == 1:`
			`return 2`
			`elif 'a' in rgx[1]:`
			`return 3`
			`elif 'm' in rgx[1]:`
			`return 1`
			`else:`
			`return 2`
			`else:`
			`return 4`

			`class MorphologyRegex:`
			`def __init__(self, restriction):`
			`self.min_msd_length = 1`

			`restr_dict = {}`
			`for feature in restriction:`
			`feature_dict = dict(feature.items())`

			`match_type = True`
			`if "filter" in feature_dict:`
			`assert feature_dict['filter'] == "negative"`
			`match_type = False`
			`del feature_dict['filter']`

			`assert len(feature_dict) == 1`
			`key, value = next(iter(feature_dict.items()))`
			`restr_dict[key] = (value, match_type)`

			`assert 'POS' in restr_dict`
			`category = restr_dict['POS'][0].capitalize()`
			`cat_code = CODES[category]`
			`rgx = [cat_code] + ['.'] * 10`

			`del restr_dict['POS']`

			`for attribute, (value, typ) in restr_dict.items():`
			`index = TAGSET[cat_code].index(attribute.lower())`
			`assert index >= 0`

			`if '\|' in value:`
			`match = "".join(CODES[val] for val in value.split('\|'))`
			`else:`
			`match = CODES[value]`

			`match = "[{}{}]".format("" if typ else "^", match)`
			`rgx[index + 1] = match`

			`if typ:`
			`self.min_msd_length = max(index + 1, self.min_msd_length)`

			`# strip rgx`
			`for i in reversed(range(len(rgx))):`
			`if rgx[i] == '.':`
			`rgx = rgx[:-1]`
			`else:`
			`break`

			`self.re_objects = [re.compile(r) for r in rgx]`
			`self.rgx = rgx`

			`def __call__(self, text):`
			`if len(text) <= self.min_msd_length:`
			`return False`

			`for c, r in zip(text, self.re_objects):`
			`if not r.match(c):`
			`return False`
			`return True`


			`class LexisRegex:`
			`def __init__(self, restriction):`
			`restr_dict = {}`
			`for feature in restriction:`
			`restr_dict.update(feature.items())`

			`assert "lemma" in restr_dict`
			`self.match_list = restr_dict['lemma'].split('\|')`

			`def __call__(self, text):`
			`return text in self.match_list`

			`class Restriction:`
			`def __init__(self, restriction_tag):`
			`self.ppb = 4 # polnopomenska beseda (0-4)`

			`if restriction_tag is None:`
			`self.type = RestrictionType.MatchAll`
			`self.matcher = None`
			`self.present = None`
			`return`

			`restriction_type = restriction_tag.get('type')`
			`if restriction_type == "morphology":`
			`self.type = RestrictionType.Morphology`
			`self.matcher = MorphologyRegex(list(restriction_tag))`
			`self.ppb = determine_ppb(self.matcher.rgx)`

			`elif restriction_type == "lexis":`
			`self.type = RestrictionType.Lexis`
			`self.matcher = LexisRegex(list(restriction_tag))`
			`else:`
			`raise NotImplementedError()`

			`def match(self, word):`
			`if self.type == RestrictionType.Morphology:`
			`match_to = word.msd`
			`elif self.type == RestrictionType.Lexis:`
			`match_to = word.lemma`
			`elif self.type == RestrictionType.MatchAll:`
			`return True`
			`else:`
			`raise RuntimeError("Unreachable!")`

			`return self.matcher(match_to)`