Simplifying and also improving the speed (fewer regex comparisons!)

This commit is contained in:
Ozbolt Menegatti 2019-06-15 13:10:23 +02:00
parent 09bdd0fe3f
commit 43c6c9151b

28
wani.py
View File

@ -109,21 +109,6 @@ TAGSET = {
"X": ['type'] "X": ['type']
} }
CATEGORY_BASES = {
"N": ['.'] * 5,
"V": ['.'] * 7,
"A": ['.'] * 6,
"R": ['.'] * 2,
"P": ['.'] * 6,
"M": ['.'] * 6,
"S": ['.'] * 1,
"C": ['.'] * 1,
"Q": [],
"I": [],
"Y": [],
"X": ['.'] * 1
}
class ComponentType(Enum): class ComponentType(Enum):
Other = 0 Other = 0
Core = 2 Core = 2
@ -431,7 +416,9 @@ def determine_ppb(rgx):
if rgx[0] in ("A", "N", "R"): if rgx[0] in ("A", "N", "R"):
return 0 return 0
elif rgx[0] == "V": elif rgx[0] == "V":
if 'a' in rgx[1]: if len(rgx) == 1:
return 2
elif 'a' in rgx[1]:
return 3 return 3
elif 'm' in rgx[1]: elif 'm' in rgx[1]:
return 1 return 1
@ -461,7 +448,7 @@ class MorphologyRegex:
assert 'POS' in restr_dict assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize() category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category] cat_code = CODES[category]
rgx = [cat_code] + CATEGORY_BASES[cat_code] rgx = [cat_code] + ['.'] * 10
del restr_dict['POS'] del restr_dict['POS']
@ -480,6 +467,13 @@ class MorphologyRegex:
if typ: if typ:
self.min_msd_length = max(index + 1, self.min_msd_length) self.min_msd_length = max(index + 1, self.min_msd_length)
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
self.re_objects = [re.compile(r) for r in rgx] self.re_objects = [re.compile(r) for r in rgx]
self.rgx = rgx self.rgx = rgx