import re
from enum import Enum

from codes_tagset import CODES, TAGSET


class RestrictionType(Enum):
    """Kinds of restriction a ``Restriction`` can apply to a word."""

    Morphology = 0  # matched against the word's ``msd`` attribute
    Lexis = 1  # matched against the word's ``lemma`` attribute
    MatchAll = 2  # no restriction tag given; every word matches


def determine_ppb(rgx):
    """Return the ``ppb`` rank (0-4) for a morphology pattern list.

    ``rgx`` is the per-position pattern list built by ``MorphologyRegex``:
    item 0 is the category code, item 1 (if present) is the pattern for the
    first attribute slot.

    Returns:
        0 when the category code is "A", "N" or "R";
        for category "V": 3 if the second pattern contains 'a', 1 if it
        contains 'm', otherwise 2 (also when there is no second pattern);
        4 for every other category code.

    NOTE(review): the codes look like MSD part-of-speech letters
    (adjective/noun/adverb/verb) -- confirm against ``codes_tagset``.
    """
    category = rgx[0]
    if category in ("A", "N", "R"):
        return 0
    if category != "V":
        return 4

    # Verb: the rank depends on the pattern in the first attribute slot.
    if len(rgx) == 1:
        return 2
    first_slot = rgx[1]
    if 'a' in first_slot:
        return 3
    if 'm' in first_slot:
        return 1
    return 2


class MorphologyRegex:
    """Position-by-position matcher for morphosyntactic tag strings.

    Attributes set by the constructor:
        rgx: list of single-position patterns; item 0 is the category code,
            the following items are character classes or '.' wildcards,
            with trailing wildcards removed.
        re_objects: ``rgx`` compiled with ``re.compile``, item by item.
        min_msd_length: highest pattern position required by a positive
            (non-negated) feature, never lower than 1.
    """

    def __init__(self, restriction):
        """Build the pattern list from an iterable of feature elements.

        Every feature must expose ``items()`` yielding (name, value) pairs
        and carry exactly one attribute, optionally accompanied by
        ``filter="negative"`` which turns the feature into an exclusion.
        A ``POS`` feature is mandatory. Alternative values are separated
        by '|'.

        Raises:
            AssertionError: if a ``filter`` is not "negative", a feature
                does not have exactly one attribute, or ``POS`` is missing.
            KeyError / ValueError: if a value is unknown to ``CODES`` or an
                attribute is not listed in ``TAGSET`` for the category.
        """
        self.min_msd_length = 1

        # attribute name -> (raw value, True for positive / False for negated)
        features = {}
        for feature in restriction:
            attrs = dict(feature.items())
            positive = "filter" not in attrs
            if not positive:
                # Pop outside the assert so the key is removed even when
                # assertions are disabled.
                filter_value = attrs.pop('filter')
                assert filter_value == "negative"
            assert len(attrs) == 1
            name, raw_value = next(iter(attrs.items()))
            features[name] = (raw_value, positive)

        assert 'POS' in features
        pos_value, _ = features.pop('POS')
        cat_code = CODES[pos_value.capitalize()]

        # Category code followed by ten wildcard attribute slots.
        pattern = [cat_code] + 10 * ['.']

        for attribute, (raw_value, positive) in features.items():
            slot = TAGSET[cat_code].index(attribute.lower()) + 1
            assert slot >= 1

            if '|' in raw_value:
                codes = "".join(CODES[part] for part in raw_value.split('|'))
            else:
                codes = CODES[raw_value]

            negation = "" if positive else "^"
            pattern[slot] = "[{}{}]".format(negation, codes)

            # Only positive features force the tag to reach this position.
            if positive and slot > self.min_msd_length:
                self.min_msd_length = slot

        # Drop trailing wildcards; they do not constrain anything.
        while pattern and pattern[-1] == '.':
            pattern.pop()

        self.re_objects = list(map(re.compile, pattern))
        self.rgx = pattern

    def __call__(self, text):
        """Return True if ``text`` satisfies every position pattern.

        Text whose length does not exceed ``min_msd_length`` is rejected
        outright. Positions beyond the shorter of ``text`` and the pattern
        list are not checked.

        NOTE(review): because ``min_msd_length`` is at least 1, a
        one-character ``text`` never matches -- confirm this is intended.
        """
        if len(text) <= self.min_msd_length:
            return False
        return all(
            compiled.match(char)
            for char, compiled in zip(text, self.re_objects)
        )


class LexisRegex:
    """Matcher that accepts a text only if it is one of the listed lemmas."""

    def __init__(self, restriction):
        """Collect the allowed lemmas from an iterable of feature elements.

        The attributes of all features are merged (later features override
        earlier ones); the merged ``lemma`` value is split on '|' into
        ``self.match_list``.

        Raises:
            AssertionError: if no feature provides a ``lemma`` attribute.
        """
        merged = {}
        for feature in restriction:
            for name, value in feature.items():
                merged[name] = value

        assert "lemma" in merged
        self.match_list = merged['lemma'].split('|')

    def __call__(self, text):
        """Return True if ``text`` equals one of the allowed lemmas."""
        return text in self.match_list


class Restriction:
    """A single word restriction built from an optional restriction tag.

    Attributes:
        type: the ``RestrictionType`` selected from the tag.
        matcher: ``MorphologyRegex``, ``LexisRegex`` or None (match-all).
        ppb: rank 0-4; 4 unless a morphology restriction computes another.
        present: set to None only when no restriction tag is given.
    """

    def __init__(self, restriction_tag):
        """Create the restriction described by ``restriction_tag``.

        ``restriction_tag`` is None (match everything) or an element that
        exposes ``get('type')`` and iterates over its feature children.

        Raises:
            NotImplementedError: if the tag's ``type`` is neither
                "morphology" nor "lexis".
        """
        # Content-word rank (Slovene: "polnopomenska beseda"), 0-4.
        self.ppb = 4

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
            self.ppb = determine_ppb(self.matcher.rgx)
            return
        if kind == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
            return
        raise NotImplementedError()

    def match(self, word):
        """Return True if ``word`` satisfies this restriction.

        Morphology restrictions test ``word.msd``, lexis restrictions test
        ``word.lemma``, and match-all restrictions accept every word.

        Raises:
            RuntimeError: if ``self.type`` is not a known restriction type.
        """
        if self.type == RestrictionType.MatchAll:
            return True

        if self.type == RestrictionType.Morphology:
            target = word.msd
        elif self.type == RestrictionType.Lexis:
            target = word.lemma
        else:
            raise RuntimeError("Unreachable!")
        return self.matcher(target)