Refactoring lexis/morphology matchers into callable classes, now "picklable".
This commit is contained in:
parent
ad0f9b0956
commit
3be4118dc0
99
wani.py
99
wani.py
|
@ -442,66 +442,70 @@ def determine_ppb(rgx):
|
||||||
else:
|
else:
|
||||||
return 4
|
return 4
|
||||||
|
|
||||||
def build_morphology_regex(restriction):
|
class MorphologyRegex:
|
||||||
restr_dict = {}
|
def __init__(self, restriction):
|
||||||
for feature in restriction:
|
self.min_msd_length = 1
|
||||||
feature_dict = dict(feature.items())
|
|
||||||
|
|
||||||
match_type = True
|
restr_dict = {}
|
||||||
if "filter" in feature_dict:
|
for feature in restriction:
|
||||||
assert feature_dict['filter'] == "negative"
|
feature_dict = dict(feature.items())
|
||||||
match_type = False
|
|
||||||
del feature_dict['filter']
|
|
||||||
|
|
||||||
assert len(feature_dict) == 1
|
match_type = True
|
||||||
key, value = next(iter(feature_dict.items()))
|
if "filter" in feature_dict:
|
||||||
restr_dict[key] = (value, match_type)
|
assert feature_dict['filter'] == "negative"
|
||||||
|
match_type = False
|
||||||
|
del feature_dict['filter']
|
||||||
|
|
||||||
assert 'POS' in restr_dict
|
assert len(feature_dict) == 1
|
||||||
category = restr_dict['POS'][0].capitalize()
|
key, value = next(iter(feature_dict.items()))
|
||||||
cat_code = CODES[category]
|
restr_dict[key] = (value, match_type)
|
||||||
rgx = [cat_code] + CATEGORY_BASES[cat_code]
|
|
||||||
|
|
||||||
del restr_dict['POS']
|
assert 'POS' in restr_dict
|
||||||
min_msd_length = 1
|
category = restr_dict['POS'][0].capitalize()
|
||||||
|
cat_code = CODES[category]
|
||||||
|
rgx = [cat_code] + CATEGORY_BASES[cat_code]
|
||||||
|
|
||||||
for attribute, (value, typ) in restr_dict.items():
|
del restr_dict['POS']
|
||||||
index = TAGSET[cat_code].index(attribute.lower())
|
|
||||||
assert index >= 0
|
|
||||||
|
|
||||||
if '|' in value:
|
for attribute, (value, typ) in restr_dict.items():
|
||||||
match = "".join(CODES[val] for val in value.split('|'))
|
index = TAGSET[cat_code].index(attribute.lower())
|
||||||
else:
|
assert index >= 0
|
||||||
match = CODES[value]
|
|
||||||
|
|
||||||
match = "[{}{}]".format("" if typ else "^", match)
|
if '|' in value:
|
||||||
rgx[index + 1] = match
|
match = "".join(CODES[val] for val in value.split('|'))
|
||||||
|
else:
|
||||||
|
match = CODES[value]
|
||||||
|
|
||||||
if typ:
|
match = "[{}{}]".format("" if typ else "^", match)
|
||||||
min_msd_length = max(index + 1, min_msd_length)
|
rgx[index + 1] = match
|
||||||
|
|
||||||
re_objects = [re.compile(r) for r in rgx]
|
if typ:
|
||||||
def matcher(text):
|
self.min_msd_length = max(index + 1, self.min_msd_length)
|
||||||
if len(text) <= min_msd_length:
|
|
||||||
|
self.re_objects = [re.compile(r) for r in rgx]
|
||||||
|
self.rgx = rgx
|
||||||
|
|
||||||
|
def __call__(self, text):
|
||||||
|
if len(text) <= self.min_msd_length:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for c, r in zip(text, re_objects):
|
for c, r in zip(text, self.re_objects):
|
||||||
if not r.match(c):
|
if not r.match(c):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return rgx, matcher
|
|
||||||
|
|
||||||
|
class LexisRegex:
|
||||||
|
def __init__(self, restriction):
|
||||||
|
restr_dict = {}
|
||||||
|
for feature in restriction:
|
||||||
|
restr_dict.update(feature.items())
|
||||||
|
|
||||||
def build_lexis_regex(restriction):
|
assert "lemma" in restr_dict
|
||||||
restr_dict = {}
|
self.match_list = restr_dict['lemma'].split('|')
|
||||||
for feature in restriction:
|
|
||||||
restr_dict.update(feature.items())
|
def __call__(self, text):
|
||||||
|
return text in self.match_list
|
||||||
assert "lemma" in restr_dict
|
|
||||||
match_list = restr_dict['lemma'].split('|')
|
|
||||||
|
|
||||||
return match_list, lambda text: text in match_list
|
|
||||||
|
|
||||||
|
|
||||||
class Restriction:
|
class Restriction:
|
||||||
|
@ -517,13 +521,12 @@ class Restriction:
|
||||||
restriction_type = restriction_tag.get('type')
|
restriction_type = restriction_tag.get('type')
|
||||||
if restriction_type == "morphology":
|
if restriction_type == "morphology":
|
||||||
self.type = RestrictionType.Morphology
|
self.type = RestrictionType.Morphology
|
||||||
present, self.matcher = build_morphology_regex(list(restriction_tag))
|
self.matcher = MorphologyRegex(list(restriction_tag))
|
||||||
self.present = " ".join(present)
|
self.ppb = determine_ppb(self.matcher.rgx)
|
||||||
self.ppb = determine_ppb(present)
|
|
||||||
|
|
||||||
elif restriction_type == "lexis":
|
elif restriction_type == "lexis":
|
||||||
self.type = RestrictionType.Lexis
|
self.type = RestrictionType.Lexis
|
||||||
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
|
self.matcher = LexisRegex(list(restriction_tag))
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -802,7 +805,7 @@ def get_lemma_features(et):
|
||||||
|
|
||||||
result = {}
|
result = {}
|
||||||
for pos in lf.iter('POS'):
|
for pos in lf.iter('POS'):
|
||||||
rgx_list, _ = build_morphology_regex(pos)
|
rgx_list = MorphologyRegex(pos).rgx
|
||||||
rgx_str = ""
|
rgx_str = ""
|
||||||
for position in rgx_list:
|
for position in rgx_list:
|
||||||
if position == ".":
|
if position == ".":
|
||||||
|
|
Loading…
Reference in New Issue
Block a user