Refactoring lexis/morphology matchers, now "picklable".

This commit is contained in:
Ozbolt Menegatti 2019-06-11 10:02:24 +02:00
parent ad0f9b0956
commit 3be4118dc0

97
wani.py
View File

@ -442,66 +442,70 @@ def determine_ppb(rgx):
else:
return 4
def build_morphology_regex(restriction):
restr_dict = {}
for feature in restriction:
feature_dict = dict(feature.items())
class MorphologyRegex:
def __init__(self, restriction):
self.min_msd_length = 1
match_type = True
if "filter" in feature_dict:
assert feature_dict['filter'] == "negative"
match_type = False
del feature_dict['filter']
restr_dict = {}
for feature in restriction:
feature_dict = dict(feature.items())
assert len(feature_dict) == 1
key, value = next(iter(feature_dict.items()))
restr_dict[key] = (value, match_type)
match_type = True
if "filter" in feature_dict:
assert feature_dict['filter'] == "negative"
match_type = False
del feature_dict['filter']
assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category]
rgx = [cat_code] + CATEGORY_BASES[cat_code]
assert len(feature_dict) == 1
key, value = next(iter(feature_dict.items()))
restr_dict[key] = (value, match_type)
del restr_dict['POS']
min_msd_length = 1
assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category]
rgx = [cat_code] + CATEGORY_BASES[cat_code]
for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
del restr_dict['POS']
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
if typ:
min_msd_length = max(index + 1, min_msd_length)
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
re_objects = [re.compile(r) for r in rgx]
def matcher(text):
if len(text) <= min_msd_length:
if typ:
self.min_msd_length = max(index + 1, self.min_msd_length)
self.re_objects = [re.compile(r) for r in rgx]
self.rgx = rgx
def __call__(self, text):
if len(text) <= self.min_msd_length:
return False
for c, r in zip(text, re_objects):
for c, r in zip(text, self.re_objects):
if not r.match(c):
return False
return True
return rgx, matcher
class LexisRegex:
def __init__(self, restriction):
restr_dict = {}
for feature in restriction:
restr_dict.update(feature.items())
def build_lexis_regex(restriction):
restr_dict = {}
for feature in restriction:
restr_dict.update(feature.items())
assert "lemma" in restr_dict
self.match_list = restr_dict['lemma'].split('|')
assert "lemma" in restr_dict
match_list = restr_dict['lemma'].split('|')
return match_list, lambda text: text in match_list
def __call__(self, text):
return text in self.match_list
class Restriction:
@ -517,13 +521,12 @@ class Restriction:
restriction_type = restriction_tag.get('type')
if restriction_type == "morphology":
self.type = RestrictionType.Morphology
present, self.matcher = build_morphology_regex(list(restriction_tag))
self.present = " ".join(present)
self.ppb = determine_ppb(present)
self.matcher = MorphologyRegex(list(restriction_tag))
self.ppb = determine_ppb(self.matcher.rgx)
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
self.matcher = LexisRegex(list(restriction_tag))
else:
raise NotImplementedError()
@ -802,7 +805,7 @@ def get_lemma_features(et):
result = {}
for pos in lf.iter('POS'):
rgx_list, _ = build_morphology_regex(pos)
rgx_list = MorphologyRegex(pos).rgx
rgx_str = ""
for position in rgx_list:
if position == ".":