import re
from enum import Enum

from codes_tagset import CODES, TAGSET


class RestrictionType(Enum):
    """Kinds of restriction a `Restriction` can represent."""

    Morphology = 0
    Lexis = 1
    MatchAll = 2


def determine_ppb(rgxs):
    """Derive the `ppb` rank (an int in 0-4) from morphology patterns.

    `rgxs` is a list of per-category patterns as built by
    `MorphologyRegex`; each pattern is a list of single-position regex
    strings whose first element is the category code.

    Anything other than exactly one pattern yields 0. Otherwise the rank
    is chosen from the category code and, for "V", from the second
    position of the pattern.
    """
    if len(rgxs) != 1:
        return 0

    pattern = rgxs[0]
    category_code = pattern[0]

    if category_code in ("A", "N", "R"):
        return 0
    if category_code != "V":
        return 4

    # Category "V": the rank depends on the second position, if any.
    # NOTE(review): 'a' / 'm' presumably stand for verb types
    # (auxiliary / main) -- confirm against the tagset.
    if len(pattern) == 1:
        return 2
    second_position = pattern[1]
    if 'a' in second_position:
        return 3
    if 'm' in second_position:
        return 1
    return 2


class MorphologyRegex:
    """Position-by-position matcher for MSD (morphosyntactic) tag strings.

    For every category listed under the 'POS' feature a pattern is built:
    a list of one-character regexes, the first being the category code and
    the rest either '.' or a character class derived from the remaining
    features. Trailing '.' positions are removed.

    Attributes:
        rgxs: list of patterns (lists of regex strings), one per category.
        re_objects: the same patterns, compiled.
        min_msd_lengths: per pattern, the length a tag must exceed to be
            considered at all.
    """

    def __init__(self, restriction):
        """Build the patterns from an iterable of feature objects.

        Each feature must expose `.items()` and carry exactly one
        attribute/value pair, optionally accompanied by
        `filter="negative"`, which turns the value into an excluded set.
        A 'POS' feature is mandatory; both it and the other values may
        list alternatives separated by '|'.
        """
        features = {}
        for feature in restriction:
            attrs = dict(feature.items())

            positive = "filter" not in attrs
            if not positive:
                assert attrs['filter'] == "negative"
                del attrs['filter']

            assert len(attrs) == 1
            name, value = next(iter(attrs.items()))
            features[name] = (value, positive)

        assert 'POS' in features

        # 'POS' may name several word categories at once.
        pos_value = features['POS'][0]
        categories = pos_value.split('|') if '|' in pos_value else [pos_value]
        del features['POS']

        self.rgxs = []
        self.re_objects = []
        self.min_msd_lengths = []

        for category in categories:
            cat_code = CODES[category.capitalize()]
            pattern = [cat_code] + ['.'] * 10
            required_length = 1

            for attribute, (value, positive) in features.items():
                attr_name = attribute.lower()
                # Features that this category does not have are ignored.
                if attr_name not in TAGSET[cat_code]:
                    continue
                index = TAGSET[cat_code].index(attr_name)
                assert index >= 0
                position = index + 1  # position 0 holds the category code

                if '|' in value:
                    codes = "".join(CODES[val] for val in value.split('|'))
                else:
                    codes = CODES[value]

                negation = "" if positive else "^"
                pattern[position] = "[{}{}]".format(negation, codes)

                # Only positive features force the tag to reach this
                # position; negative ones do not raise the minimum.
                if positive:
                    required_length = max(position, required_length)

            # Drop unconstrained trailing positions.
            while pattern and pattern[-1] == '.':
                pattern.pop()

            self.re_objects.append([re.compile(part) for part in pattern])
            self.rgxs.append(pattern)
            self.min_msd_lengths.append(required_length)

    def __call__(self, text):
        """Return True if `text` matches the pattern of any category.

        A pattern is skipped when `len(text)` does not exceed its minimum
        length; otherwise characters and position regexes are compared
        pairwise, up to the shorter of the two.

        NOTE(review): the minimum length is at least 1, so a tag
        consisting of a single character never matches -- confirm this
        is intended.
        """
        for min_length, compiled in zip(self.min_msd_lengths, self.re_objects):
            if len(text) <= min_length:
                continue
            if all(regex.match(char) for char, regex in zip(text, compiled)):
                return True
        return False


class LexisRegex:
    """Matcher that accepts a lemma if it is one of the listed ones."""

    def __init__(self, restriction):
        """Collect the '|'-separated 'lemma' alternatives.

        `restriction` is an iterable of feature objects exposing
        `.items()`; their pairs are merged, later features overriding
        earlier ones, and the merged result must contain 'lemma'.
        """
        merged = {
            key: value
            for feature in restriction
            for key, value in feature.items()
        }
        assert "lemma" in merged
        self.match_list = merged['lemma'].split('|')

    def __call__(self, text):
        """Return True if `text` equals one of the accepted lemmas."""
        return text in self.match_list


class Restriction:
    """A single restriction on a word: morphological, lexical or none."""

    def __init__(self, restriction_tag):
        """Create the restriction described by `restriction_tag`.

        `None` produces a restriction that matches every word. Otherwise
        the tag's 'type' attribute selects the matcher, which is built
        from the tag's children (`list(restriction_tag)`).
        NOTE(review): the tag looks like an XML element -- confirm
        against the caller.

        Raises:
            NotImplementedError: for any type other than "morphology"
                or "lexis".
        """
        # "polnopomenska beseda" (Slovene, roughly "content word"), 0-4
        self.ppb = 4

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
            self.ppb = determine_ppb(self.matcher.rgxs)
        elif kind == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        """Return True if `word` satisfies this restriction.

        Morphology restrictions are checked against `word.msd`, lexis
        restrictions against `word.lemma`; a match-all restriction
        accepts any word without looking at it.
        """
        if self.type == RestrictionType.MatchAll:
            return True

        if self.type == RestrictionType.Morphology:
            target = word.msd
        elif self.type == RestrictionType.Lexis:
            target = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(target)