From 3be4118dc0b88072753005d98398f79b036f8e1f Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Tue, 11 Jun 2019 10:02:24 +0200 Subject: [PATCH] Refactoring lexis/morphology matchers, now "picklable". --- wani.py | 99 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/wani.py b/wani.py index dfdcd5e..5717210 100644 --- a/wani.py +++ b/wani.py @@ -442,66 +442,70 @@ def determine_ppb(rgx): else: return 4 -def build_morphology_regex(restriction): - restr_dict = {} - for feature in restriction: - feature_dict = dict(feature.items()) +class MorphologyRegex: + def __init__(self, restriction): + self.min_msd_length = 1 - match_type = True - if "filter" in feature_dict: - assert feature_dict['filter'] == "negative" - match_type = False - del feature_dict['filter'] + restr_dict = {} + for feature in restriction: + feature_dict = dict(feature.items()) - assert len(feature_dict) == 1 - key, value = next(iter(feature_dict.items())) - restr_dict[key] = (value, match_type) + match_type = True + if "filter" in feature_dict: + assert feature_dict['filter'] == "negative" + match_type = False + del feature_dict['filter'] - assert 'POS' in restr_dict - category = restr_dict['POS'][0].capitalize() - cat_code = CODES[category] - rgx = [cat_code] + CATEGORY_BASES[cat_code] + assert len(feature_dict) == 1 + key, value = next(iter(feature_dict.items())) + restr_dict[key] = (value, match_type) - del restr_dict['POS'] - min_msd_length = 1 + assert 'POS' in restr_dict + category = restr_dict['POS'][0].capitalize() + cat_code = CODES[category] + rgx = [cat_code] + CATEGORY_BASES[cat_code] - for attribute, (value, typ) in restr_dict.items(): - index = TAGSET[cat_code].index(attribute.lower()) - assert index >= 0 + del restr_dict['POS'] - if '|' in value: - match = "".join(CODES[val] for val in value.split('|')) - else: - match = CODES[value] + for attribute, (value, typ) in restr_dict.items(): + index = 
TAGSET[cat_code].index(attribute.lower()) + assert index >= 0 - match = "[{}{}]".format("" if typ else "^", match) - rgx[index + 1] = match + if '|' in value: + match = "".join(CODES[val] for val in value.split('|')) + else: + match = CODES[value] - if typ: - min_msd_length = max(index + 1, min_msd_length) + match = "[{}{}]".format("" if typ else "^", match) + rgx[index + 1] = match - re_objects = [re.compile(r) for r in rgx] - def matcher(text): - if len(text) <= min_msd_length: + if typ: + self.min_msd_length = max(index + 1, self.min_msd_length) + + self.re_objects = [re.compile(r) for r in rgx] + self.rgx = rgx + + def __call__(self, text): + if len(text) <= self.min_msd_length: return False - for c, r in zip(text, re_objects): + for c, r in zip(text, self.re_objects): if not r.match(c): return False return True - return rgx, matcher +class LexisRegex: + def __init__(self, restriction): + restr_dict = {} + for feature in restriction: + restr_dict.update(feature.items()) -def build_lexis_regex(restriction): - restr_dict = {} - for feature in restriction: - restr_dict.update(feature.items()) - - assert "lemma" in restr_dict - match_list = restr_dict['lemma'].split('|') - - return match_list, lambda text: text in match_list + assert "lemma" in restr_dict + self.match_list = restr_dict['lemma'].split('|') + + def __call__(self, text): + return text in self.match_list class Restriction: @@ -517,13 +521,12 @@ class Restriction: restriction_type = restriction_tag.get('type') if restriction_type == "morphology": self.type = RestrictionType.Morphology - present, self.matcher = build_morphology_regex(list(restriction_tag)) - self.present = " ".join(present) - self.ppb = determine_ppb(present) + self.matcher = MorphologyRegex(list(restriction_tag)) + self.ppb = determine_ppb(self.matcher.rgx) elif restriction_type == "lexis": self.type = RestrictionType.Lexis - self.present, self.matcher = build_lexis_regex(list(restriction_tag)) + self.matcher = 
LexisRegex(list(restriction_tag)) else: raise NotImplementedError() @@ -802,7 +805,7 @@ def get_lemma_features(et): result = {} for pos in lf.iter('POS'): - rgx_list, _ = build_morphology_regex(pos) + rgx_list = MorphologyRegex(pos).rgx rgx_str = "" for position in rgx_list: if position == ".":