HUGE refactor, creating lots of modules, no code changes though!
This commit is contained in:
133
src/restriction.py
Normal file
133
src/restriction.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import re
|
||||
from enum import Enum
|
||||
|
||||
from codes_tagset import CODES, TAGSET
|
||||
|
||||
|
||||
class RestrictionType(Enum):
    """Kinds of restriction that ``Restriction`` can represent.

    ``Restriction.match`` dispatches on these members: ``Morphology`` is
    checked against ``word.msd``, ``Lexis`` against ``word.lemma``, and
    ``MatchAll`` accepts every word.
    """

    # Explicit integer values; members are listed in ascending order.
    Morphology = 0
    Lexis = 1
    MatchAll = 2
|
||||
|
||||
|
||||
def determine_ppb(rgx):
    """Return the "ppb" rank (an integer 0-4) for a morphology pattern.

    ``rgx`` is an indexable pattern whose first item is the category code
    and whose optional second item is the pattern for the first attribute
    (``Restriction`` passes ``MorphologyRegex.rgx`` here).

    Ranking:
      * category "A", "N" or "R"            -> 0
      * category "V", second item has 'a'   -> 3
      * category "V", second item has 'm'   -> 1
      * category "V", anything else         -> 2
      * any other category                  -> 4

    NOTE(review): the letters presumably follow the MSD tagset
    (adjective/noun/adverb/verb, verb type auxiliary/main) - confirm
    against ``codes_tagset``.
    """
    category = rgx[0]

    if category in ("A", "N", "R"):
        return 0
    if category != "V":
        return 4

    # Verbs: the second position (when present) refines the rank.
    # 'a' is tested before 'm', so a pattern containing both yields 3.
    if len(rgx) > 1:
        verb_type = rgx[1]
        if 'a' in verb_type:
            return 3
        if 'm' in verb_type:
            return 1
    return 2
|
||||
|
||||
class MorphologyRegex:
    """Position-by-position matcher for MSD tag strings.

    The pattern is a list of single-character regexes: slot 0 is the
    category code and slot ``i + 1`` constrains the attribute found at
    index ``i`` of ``TAGSET[cat_code]``. Unconstrained slots are ``'.'``
    and trailing unconstrained slots are dropped.

    Attributes:
        rgx: list of pattern strings, one per MSD position.
        re_objects: ``rgx`` compiled with ``re.compile``, same order.
        min_msd_length: minimum number of characters an MSD must have to
            be able to satisfy every *positive* constraint (always at
            least 1, for the category character).
    """

    def __init__(self, restriction):
        """Build the pattern from an iterable of feature elements.

        Each feature must provide ``items()`` yielding (name, value) pairs.
        After removing an optional ``filter="negative"`` pair, exactly one
        pair must remain. A feature named ``POS`` is mandatory; its value,
        capitalized, is looked up in ``CODES`` to obtain the category code.
        Values of other features may list alternatives separated by ``|``.

        Raises:
            AssertionError: malformed feature or missing ``POS``.
            KeyError: a value or category is not present in ``CODES`` /
                ``TAGSET``.
            ValueError: an attribute name is not listed in
                ``TAGSET[cat_code]``.
            IndexError: an attribute sits beyond the 10 attribute slots
                reserved in the pattern.
        """
        restr_dict = {}
        for feature in restriction:
            feature_dict = dict(feature.items())

            # An optional filter="negative" turns the feature into an
            # exclusion ("must NOT have one of these values").
            positive = True
            if "filter" in feature_dict:
                assert feature_dict['filter'] == "negative", \
                    "only filter=\"negative\" is supported"
                positive = False
                del feature_dict['filter']

            assert len(feature_dict) == 1, \
                "a feature must carry exactly one attribute/value pair"
            key, value = next(iter(feature_dict.items()))
            restr_dict[key] = (value, positive)

        assert 'POS' in restr_dict, "morphology restriction requires POS"
        category = restr_dict.pop('POS')[0].capitalize()
        cat_code = CODES[category]

        # Slot 0 is the category; 10 wildcard slots follow for attributes.
        rgx = [cat_code] + ['.'] * 10

        # The category character itself is always required.
        self.min_msd_length = 1

        for attribute, (value, positive) in restr_dict.items():
            # .index() raises ValueError for an unknown attribute, so no
            # separate range check on the result is needed.
            position = TAGSET[cat_code].index(attribute.lower()) + 1

            # split('|') also covers the single-value case.
            codes = "".join(CODES[val] for val in value.split('|'))
            rgx[position] = "[{}{}]".format("" if positive else "^", codes)

            # Only positive constraints need the character to be present;
            # a negative constraint is satisfied by a shorter MSD.
            if positive:
                self.min_msd_length = max(position + 1, self.min_msd_length)

        # Drop trailing unconstrained slots.
        while rgx and rgx[-1] == '.':
            rgx.pop()

        self.re_objects = [re.compile(r) for r in rgx]
        self.rgx = rgx

    def __call__(self, text):
        """Return True if the MSD string ``text`` satisfies the pattern.

        ``text`` is rejected outright when it is shorter than
        ``min_msd_length``. Otherwise each character is matched against
        the pattern for its position; positions beyond the shorter of
        ``text`` and the pattern are ignored.
        """
        # Strictly "shorter than": an MSD of exactly the minimum length
        # (e.g. a one-letter tag for a POS-only restriction) must match.
        if len(text) < self.min_msd_length:
            return False

        for char, pattern in zip(text, self.re_objects):
            if not pattern.match(char):
                return False
        return True
|
||||
|
||||
|
||||
class LexisRegex:
    """Matcher that accepts a text equal to one of a fixed list of lemmas.

    Attributes:
        match_list: the accepted lemmas, in the order they were listed.
    """

    def __init__(self, restriction):
        """Collect the feature pairs and extract the ``lemma`` alternatives.

        Each feature must provide ``items()`` yielding (name, value) pairs.
        All pairs are merged into one mapping (a later feature overrides an
        earlier one with the same name); the ``lemma`` value is then split
        on ``|``.

        Raises:
            AssertionError: no feature named ``lemma`` was supplied.
        """
        merged = dict(
            pair
            for feature in restriction
            for pair in feature.items()
        )

        assert "lemma" in merged
        self.match_list = merged['lemma'].split('|')

    def __call__(self, text):
        """Return True when ``text`` is one of the accepted lemmas."""
        return text in self.match_list
|
||||
|
||||
class Restriction:
    """A single restriction that a word either satisfies or not.

    Attributes:
        type: the ``RestrictionType`` member selected in ``__init__``.
        matcher: a ``MorphologyRegex`` / ``LexisRegex`` instance, or None
            for ``MatchAll``.
        ppb: integer rank 0-4; stays 4 unless the restriction is
            morphological, in which case ``determine_ppb`` computes it.
        present: set to None, and only when ``restriction_tag`` is None.
    """

    def __init__(self, restriction_tag):
        """Build the restriction from ``restriction_tag``.

        ``None`` produces a match-everything restriction. Otherwise the
        tag must provide ``get('type')`` returning "morphology" or "lexis",
        and iterating it must yield the feature elements handed to the
        matcher.

        Raises:
            NotImplementedError: the tag's type is neither "morphology"
                nor "lexis".
        """
        # Rank on a 0-4 scale (Slovene "polnopomenska beseda", roughly
        # "content word"); 4 is the default.
        self.ppb = 4

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')

        if kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
            # Only morphological restrictions refine the rank.
            self.ppb = determine_ppb(self.matcher.rgx)
            return

        if kind == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
            return

        raise NotImplementedError()

    def match(self, word):
        """Return whether ``word`` satisfies this restriction.

        Morphology restrictions test ``word.msd``, lexis restrictions test
        ``word.lemma``, and ``MatchAll`` accepts every word.

        Raises:
            RuntimeError: ``self.type`` is not a known ``RestrictionType``.
        """
        kind = self.type

        if kind == RestrictionType.MatchAll:
            return True
        if kind == RestrictionType.Morphology:
            return self.matcher(word.msd)
        if kind == RestrictionType.Lexis:
            return self.matcher(word.lemma)

        raise RuntimeError("Unreachable!")
|
||||
|
||||
Reference in New Issue
Block a user