import re from enum import Enum from luscenje_struktur.codes_tagset import CODES, TAGSET, CODES_UD class RestrictionType(Enum): Morphology = 0 Lexis = 1 MatchAll = 2 Space = 3 MorphologyUD = 4 def determine_ppb_ud(rgxs): if len(rgxs) != 1: return 0 rgx = rgxs[0] if rgx in ("ADJ", "NOUN", "ADV"): return 0 elif rgx == "AUX": return 3 elif rgx == "VERB": return 2 else: return 4 def determine_ppb(rgxs): if len(rgxs) != 1: return 0 rgx = rgxs[0] if rgx[0] in ("A", "N", "R"): return 0 elif rgx[0] == "V": if len(rgx) == 1: return 2 elif 'a' in rgx[1]: return 3 elif 'm' in rgx[1]: return 1 else: return 2 else: return 4 class MorphologyRegex: def __init__(self, restriction): # self.min_msd_length = 1 restr_dict = {} for feature in restriction: feature_dict = dict(feature.items()) match_type = True if "filter" in feature_dict: assert feature_dict['filter'] == "negative" match_type = False del feature_dict['filter'] assert len(feature_dict) == 1 key, value = next(iter(feature_dict.items())) restr_dict[key] = (value, match_type) assert 'POS' in restr_dict # handle multiple word types if '|' in restr_dict['POS'][0]: categories = restr_dict['POS'][0].split('|') else: categories = [restr_dict['POS'][0]] self.rgxs = [] self.re_objects = [] self.min_msd_lengths = [] del restr_dict['POS'] for category in categories: min_msd_length = 1 category = category.capitalize() cat_code = CODES[category] rgx = [cat_code] + ['.'] * 10 for attribute, (value, typ) in restr_dict.items(): if attribute.lower() not in TAGSET[cat_code]: continue index = TAGSET[cat_code].index(attribute.lower()) assert index >= 0 if '|' in value: match = "".join(CODES[val] for val in value.split('|')) else: match = CODES[value] match = "[{}{}]".format("" if typ else "^", match) rgx[index + 1] = match if typ: min_msd_length = max(index + 1, min_msd_length) # strip rgx for i in reversed(range(len(rgx))): if rgx[i] == '.': rgx = rgx[:-1] else: break self.re_objects.append([re.compile(r) for r in rgx]) self.rgxs.append(rgx) self.min_msd_lengths.append(min_msd_length) def __call__(self, text): for i, re_object in enumerate(self.re_objects): if len(text) < self.min_msd_lengths[i]: continue match = True for c, r in zip(text, re_object): if not r.match(c): match = False break if match: return True return False class MorphologyUDRegex: def __init__(self, restriction): # self.min_msd_length = 1 restr_dict = {} for feature in restriction: feature_dict = dict(feature.items()) match_type = True # if "filter" in feature_dict: # assert feature_dict['filter'] == "negative" # match_type = False # del feature_dict['filter'] assert len(feature_dict) == 1 key, value = next(iter(feature_dict.items())) restr_dict[key] = (value, match_type) assert 'POS' in restr_dict # handle multiple word types if '|' in restr_dict['POS'][0]: categories = restr_dict['POS'][0].split('|') else: categories = [restr_dict['POS'][0]] self.rgxs = [] self.re_objects = [] self.min_msd_lengths = [] del restr_dict['POS'] for category in categories: min_msd_length = 1 category = category.upper() assert category in CODES_UD cat_code = category rgx = category # for attribute, (value, typ) in restr_dict.items(): # if attribute.lower() not in TAGSET[cat_code]: # continue # index = TAGSET[cat_code].index(attribute.lower()) # assert index >= 0 # # if '|' in value: # match = "".join(CODES[val] for val in value.split('|')) # else: # match = CODES[value] # # match = "[{}{}]".format("" if typ else "^", match) # rgx[index + 1] = match # # if typ: # min_msd_length = max(index + 1, min_msd_length) # strip rgx # for i in reversed(range(len(rgx))): # if rgx[i] == '.': # rgx = rgx[:-1] # else: # break # self.re_objects.append([re.compile(r) for r in rgx]) self.rgxs.append(rgx) self.min_msd_lengths.append(min_msd_length) def __call__(self, text): assert len(self.rgxs) == 1 return self.rgxs[0] == text class LexisRegex: def __init__(self, restriction): restr_dict = {} for feature in restriction: restr_dict.update(feature.items()) assert "lemma" in restr_dict self.match_list = restr_dict['lemma'].split('|') def __call__(self, text): return text in self.match_list class SpaceRegex: def __init__(self, restriction): restr_dict = {} for feature in restriction: restr_dict.update(feature.items()) assert "contact" in restr_dict self.space = restr_dict['contact'].split('|') for el in self.space: if el not in ['both', 'right', 'left', 'neither']: raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).') def __call__(self, word): match = False if 'neither' in self.space: match = match or (word.previous_glue != '' and word.glue != '') if 'left' in self.space: match = match or (word.previous_glue == '' and word.glue != '') if 'right' in self.space: match = match or (word.previous_glue != '' and word.glue == '') if 'both' in self.space: match = match or (word.previous_glue == '' and word.glue == '') return match class Restriction: def __init__(self, restriction_tag, system_type='JOS'): self.ppb = 4 # polnopomenska beseda (0-4) if restriction_tag is None: self.type = RestrictionType.MatchAll self.matcher = None self.present = None return restriction_type = restriction_tag.get('type') if restriction_type == "morphology": if system_type == 'JOS': self.type = RestrictionType.Morphology self.matcher = MorphologyRegex(list(restriction_tag)) self.ppb = determine_ppb(self.matcher.rgxs) # UD system is handled based on deprel elif system_type == 'UD': self.type = RestrictionType.MorphologyUD self.matcher = MorphologyUDRegex(list(restriction_tag)) # self.ppb = determine_ppb_ud(self.matcher.rgxs) elif restriction_type == "lexis": self.type = RestrictionType.Lexis self.matcher = LexisRegex(list(restriction_tag)) elif restriction_type == "space": self.type = RestrictionType.Space self.matcher = SpaceRegex(list(restriction_tag)) else: raise NotImplementedError() def match(self, word): if self.type == RestrictionType.Morphology or self.type == RestrictionType.MorphologyUD: match_to = word.msd elif self.type == RestrictionType.Lexis: match_to = word.lemma elif self.type == RestrictionType.MatchAll: return True elif self.type == RestrictionType.Space: match_to = word else: raise RuntimeError("Unreachable!") return self.matcher(match_to)