You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
luscenje_struktur/src/restriction.py

167 lines
4.6 KiB

import re
from enum import Enum
from codes_tagset import CODES, TAGSET
class RestrictionType(Enum):
    """Kinds of restriction a ``Restriction`` can apply to a word."""

    # Matched against the word's ``msd`` attribute.
    Morphology = 0
    # Matched against the word's ``lemma`` attribute.
    Lexis = 1
    # No restriction tag was given: every word matches.
    MatchAll = 2
def determine_ppb(rgxs):
    """Derive the ``ppb`` score (0-4) from a list of morphology patterns.

    Args:
        rgxs: Sequence of patterns in the shape of ``MorphologyRegex.rgxs``:
            each pattern is a sequence whose first item is the category code
            and whose second item (if present) is the pattern for the first
            attribute of that category.

    Returns:
        0 when ``rgxs`` does not hold exactly one pattern, or when the
        category code is "A", "N" or "R";
        for category "V": 3 if the second item contains 'a', else 1 if it
        contains 'm', else 2 (also 2 when there is no second item);
        4 for every other category.
    """
    # Only a single, unambiguous category is scored; anything else is 0.
    if len(rgxs) != 1:
        return 0

    pattern = rgxs[0]
    category = pattern[0]

    if category in ("A", "N", "R"):
        return 0
    if category != "V":
        return 4

    # Category "V": the score depends on the first attribute's pattern.
    if len(pattern) == 1:
        return 2

    first_attribute = pattern[1]
    # NOTE(review): these are substring checks, so a negated class such as
    # "[^a]" also counts as containing 'a' -- confirm this is intended.
    if 'a' in first_attribute:
        return 3
    if 'm' in first_attribute:
        return 1
    return 2
class MorphologyRegex:
    """Callable matcher that tests an MSD string against a morphology restriction.

    For every word category listed in the restriction's ``POS`` feature one
    pattern is built: a list of single-character regexes, where item 0 is the
    category code and item ``i`` (``i >= 1``) constrains the MSD character at
    position ``i``. Unrestricted positions hold ``'.'``.

    Attributes:
        rgxs: One pattern (list of regex source strings) per category.
        re_objects: The same patterns with every item compiled by ``re``.
        min_msd_lengths: Per pattern, the highest position carrying a
            positive (non-negated) restriction, or 1 if there is none.
    """

    def __init__(self, restriction):
        """Build the patterns from the restriction's features.

        Args:
            restriction: Iterable of features; each must expose ``.items()``
                yielding ``(name, value)`` pairs (presumably XML feature
                elements -- verify against the caller). A feature holds
                exactly one attribute, optionally accompanied by
                ``filter="negative"`` to negate it. One feature must be
                named ``POS``; ``|`` separates alternatives in any value.

        Raises:
            AssertionError: If a ``filter`` is not ``"negative"``, a feature
                does not hold exactly one attribute, or ``POS`` is missing.
            KeyError: If a category or value is not present in ``CODES``.
        """
        restr_dict = {}
        for feature in restriction:
            feature_dict = dict(feature.items())

            # filter="negative" means the value must NOT occur at that position.
            match_type = True
            if "filter" in feature_dict:
                assert feature_dict['filter'] == "negative"
                match_type = False
                del feature_dict['filter']

            assert len(feature_dict) == 1
            key, value = next(iter(feature_dict.items()))
            restr_dict[key] = (value, match_type)

        assert 'POS' in restr_dict

        # Several word categories may be given as "a|b"; str.split also
        # yields a one-item list when there is no '|'.
        categories = restr_dict.pop('POS')[0].split('|')

        self.rgxs = []
        self.re_objects = []
        self.min_msd_lengths = []

        for category in categories:
            cat_code = CODES[category.capitalize()]
            rgx, min_msd_length = self._build_pattern(cat_code, restr_dict)
            self.re_objects.append([re.compile(r) for r in rgx])
            self.rgxs.append(rgx)
            self.min_msd_lengths.append(min_msd_length)

    @staticmethod
    def _build_pattern(cat_code, restr_dict):
        """Return ``(pattern, min_msd_length)`` for one category code."""
        # position in the MSD string -> character-class regex
        slots = {}
        min_msd_length = 1

        for attribute, (value, typ) in restr_dict.items():
            attribute = attribute.lower()
            # Attributes this category does not have are silently ignored.
            if attribute not in TAGSET[cat_code]:
                continue

            position = TAGSET[cat_code].index(attribute) + 1
            codes = "".join(CODES[val] for val in value.split('|'))
            slots[position] = "[{}{}]".format("" if typ else "^", codes)

            # Only positive restrictions force the MSD to reach the position.
            if typ:
                min_msd_length = max(position, min_msd_length)

        # Size the pattern by the highest restricted position instead of a
        # fixed number of slots: no trailing '.' entries are produced and
        # tagsets of any length are supported.
        last_position = max(slots, default=0)
        rgx = [cat_code]
        rgx.extend(slots.get(i, '.') for i in range(1, last_position + 1))
        return rgx, min_msd_length

    def __call__(self, text):
        """Return True if ``text`` matches the pattern of any category.

        A pattern is skipped unless ``text`` is strictly longer than that
        pattern's ``min_msd_lengths`` entry. Characters are then compared
        position by position; positions beyond the shorter of ``text`` and
        the pattern are not checked.
        """
        # NOTE(review): the minimum is never below 1, so a one-character
        # ``text`` never matches, even for a POS-only restriction -- confirm
        # that this is intended.
        for re_object, min_length in zip(self.re_objects, self.min_msd_lengths):
            if len(text) <= min_length:
                continue
            if all(r.match(c) for c, r in zip(text, re_object)):
                return True
        return False
class LexisRegex:
    """Callable matcher that tests a lemma against a list of allowed lemmas.

    Attributes:
        match_list: The allowed lemmas, taken from the ``lemma`` attribute of
            the restriction and split on ``|``.
    """

    def __init__(self, restriction):
        """Collect the restriction's attributes and read the lemma list.

        Args:
            restriction: Iterable of features; each must expose ``.items()``
                yielding ``(name, value)`` pairs. When the same name occurs
                more than once, the last value wins.

        Raises:
            AssertionError: If no feature carries a ``lemma`` attribute.
        """
        merged = {}
        for feature in restriction:
            for name, value in feature.items():
                merged[name] = value

        assert "lemma" in merged
        self.match_list = merged['lemma'].split('|')

    def __call__(self, text):
        """Return True if ``text`` is exactly one of the allowed lemmas."""
        return text in self.match_list
class Restriction:
    """A single restriction on a word: morphological, lexical, or none.

    Attributes:
        ppb: Score 0-4 ("ppb" abbreviates the Slovenian "polnopomenska
            beseda", i.e. content word). Defaults to 4 and is recomputed
            with ``determine_ppb`` for morphology restrictions only.
        type: The ``RestrictionType`` of this restriction.
        matcher: ``MorphologyRegex`` or ``LexisRegex`` instance, or ``None``
            for a match-all restriction.
        present: Set to ``None``, and only when no restriction tag is given.
    """

    def __init__(self, restriction_tag):
        """Build the restriction from its tag.

        Args:
            restriction_tag: ``None`` for a match-all restriction; otherwise
                an object exposing ``.get('type')`` and iteration over its
                features (presumably an XML element -- verify against the
                caller).

        Raises:
            NotImplementedError: If the tag's type is neither
                ``"morphology"`` nor ``"lexis"``.
        """
        # ppb = "polnopomenska beseda" (content word), scored 0-4
        self.ppb = 4

        # No tag at all: accept every word.
        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind not in ("morphology", "lexis"):
            raise NotImplementedError()

        if kind == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
            self.ppb = determine_ppb(self.matcher.rgxs)
        else:
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))

    def match(self, word):
        """Return whether ``word`` satisfies this restriction.

        Morphology restrictions are checked against ``word.msd``, lexis
        restrictions against ``word.lemma``; a match-all restriction accepts
        any word without inspecting it.

        Raises:
            RuntimeError: If ``self.type`` is not a known restriction type.
        """
        if self.type == RestrictionType.MatchAll:
            return True
        if self.type == RestrictionType.Morphology:
            return self.matcher(word.msd)
        if self.type == RestrictionType.Lexis:
            return self.matcher(word.lemma)
        raise RuntimeError("Unreachable!")