Refactoring lexis/morphology matchers, now "picklable".
This commit is contained in:
		
							parent
							
								
									ad0f9b0956
								
							
						
					
					
						commit
						3be4118dc0
					
				
							
								
								
									
										99
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										99
									
								
								wani.py
									
									
									
									
									
								
							@ -442,66 +442,70 @@ def determine_ppb(rgx):
 | 
			
		||||
    else:
 | 
			
		||||
        return 4
 | 
			
		||||
 | 
			
		||||
def build_morphology_regex(restriction):
 | 
			
		||||
    restr_dict = {}
 | 
			
		||||
    for feature in restriction:
 | 
			
		||||
        feature_dict = dict(feature.items())
 | 
			
		||||
class MorphologyRegex:
 | 
			
		||||
    def __init__(self, restriction):
 | 
			
		||||
        self.min_msd_length = 1
 | 
			
		||||
 | 
			
		||||
        match_type = True
 | 
			
		||||
        if "filter" in feature_dict:
 | 
			
		||||
            assert feature_dict['filter'] == "negative"
 | 
			
		||||
            match_type = False
 | 
			
		||||
            del feature_dict['filter']
 | 
			
		||||
        restr_dict = {}
 | 
			
		||||
        for feature in restriction:
 | 
			
		||||
            feature_dict = dict(feature.items())
 | 
			
		||||
 | 
			
		||||
        assert len(feature_dict) == 1
 | 
			
		||||
        key, value = next(iter(feature_dict.items()))
 | 
			
		||||
        restr_dict[key] = (value, match_type)
 | 
			
		||||
            match_type = True
 | 
			
		||||
            if "filter" in feature_dict:
 | 
			
		||||
                assert feature_dict['filter'] == "negative"
 | 
			
		||||
                match_type = False
 | 
			
		||||
                del feature_dict['filter']
 | 
			
		||||
 | 
			
		||||
    assert 'POS' in restr_dict
 | 
			
		||||
    category = restr_dict['POS'][0].capitalize()
 | 
			
		||||
    cat_code = CODES[category]
 | 
			
		||||
    rgx = [cat_code] + CATEGORY_BASES[cat_code]
 | 
			
		||||
            assert len(feature_dict) == 1
 | 
			
		||||
            key, value = next(iter(feature_dict.items()))
 | 
			
		||||
            restr_dict[key] = (value, match_type)
 | 
			
		||||
 | 
			
		||||
    del restr_dict['POS']
 | 
			
		||||
    min_msd_length = 1
 | 
			
		||||
        assert 'POS' in restr_dict
 | 
			
		||||
        category = restr_dict['POS'][0].capitalize()
 | 
			
		||||
        cat_code = CODES[category]
 | 
			
		||||
        rgx = [cat_code] + CATEGORY_BASES[cat_code]
 | 
			
		||||
 | 
			
		||||
    for attribute, (value, typ) in restr_dict.items():
 | 
			
		||||
        index = TAGSET[cat_code].index(attribute.lower())
 | 
			
		||||
        assert index >= 0
 | 
			
		||||
        del restr_dict['POS']
 | 
			
		||||
 | 
			
		||||
        if '|' in value:
 | 
			
		||||
            match = "".join(CODES[val] for val in value.split('|'))
 | 
			
		||||
        else:
 | 
			
		||||
            match = CODES[value]
 | 
			
		||||
        for attribute, (value, typ) in restr_dict.items():
 | 
			
		||||
            index = TAGSET[cat_code].index(attribute.lower())
 | 
			
		||||
            assert index >= 0
 | 
			
		||||
 | 
			
		||||
        match = "[{}{}]".format("" if typ else "^", match)
 | 
			
		||||
        rgx[index + 1] = match
 | 
			
		||||
            if '|' in value:
 | 
			
		||||
                match = "".join(CODES[val] for val in value.split('|'))
 | 
			
		||||
            else:
 | 
			
		||||
                match = CODES[value]
 | 
			
		||||
 | 
			
		||||
        if typ:
 | 
			
		||||
            min_msd_length = max(index + 1, min_msd_length)
 | 
			
		||||
            match = "[{}{}]".format("" if typ else "^", match)
 | 
			
		||||
            rgx[index + 1] = match
 | 
			
		||||
 | 
			
		||||
    re_objects = [re.compile(r) for r in rgx]
 | 
			
		||||
    def matcher(text):
 | 
			
		||||
        if len(text) <= min_msd_length:
 | 
			
		||||
            if typ:
 | 
			
		||||
                self.min_msd_length = max(index + 1, self.min_msd_length)
 | 
			
		||||
 | 
			
		||||
        self.re_objects = [re.compile(r) for r in rgx]
 | 
			
		||||
        self.rgx = rgx
 | 
			
		||||
    
 | 
			
		||||
    def __call__(self, text):
 | 
			
		||||
        if len(text) <= self.min_msd_length:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        for c, r in zip(text, re_objects):
 | 
			
		||||
        for c, r in zip(text, self.re_objects):
 | 
			
		||||
            if not r.match(c):
 | 
			
		||||
                return False
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    return rgx, matcher
 | 
			
		||||
 | 
			
		||||
class LexisRegex:
 | 
			
		||||
    def __init__(self, restriction):
 | 
			
		||||
        restr_dict = {}
 | 
			
		||||
        for feature in restriction:
 | 
			
		||||
            restr_dict.update(feature.items())
 | 
			
		||||
 | 
			
		||||
def build_lexis_regex(restriction):
 | 
			
		||||
    restr_dict = {}
 | 
			
		||||
    for feature in restriction:
 | 
			
		||||
        restr_dict.update(feature.items())
 | 
			
		||||
 | 
			
		||||
    assert "lemma" in restr_dict
 | 
			
		||||
    match_list = restr_dict['lemma'].split('|')
 | 
			
		||||
 | 
			
		||||
    return match_list, lambda text: text in match_list
 | 
			
		||||
        assert "lemma" in restr_dict
 | 
			
		||||
        self.match_list = restr_dict['lemma'].split('|')
 | 
			
		||||
    
 | 
			
		||||
    def __call__(self, text):
 | 
			
		||||
        return text in self.match_list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Restriction:
 | 
			
		||||
@ -517,13 +521,12 @@ class Restriction:
 | 
			
		||||
        restriction_type = restriction_tag.get('type')
 | 
			
		||||
        if restriction_type == "morphology":
 | 
			
		||||
            self.type = RestrictionType.Morphology
 | 
			
		||||
            present, self.matcher = build_morphology_regex(list(restriction_tag))
 | 
			
		||||
            self.present = " ".join(present)
 | 
			
		||||
            self.ppb = determine_ppb(present)
 | 
			
		||||
            self.matcher = MorphologyRegex(list(restriction_tag))
 | 
			
		||||
            self.ppb = determine_ppb(self.matcher.rgx)
 | 
			
		||||
 | 
			
		||||
        elif restriction_type == "lexis":
 | 
			
		||||
            self.type = RestrictionType.Lexis
 | 
			
		||||
            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
 | 
			
		||||
            self.matcher = LexisRegex(list(restriction_tag))
 | 
			
		||||
        else:
 | 
			
		||||
            raise NotImplementedError()
 | 
			
		||||
 | 
			
		||||
@ -802,7 +805,7 @@ def get_lemma_features(et):
 | 
			
		||||
 | 
			
		||||
    result = {}
 | 
			
		||||
    for pos in lf.iter('POS'):
 | 
			
		||||
        rgx_list, _ = build_morphology_regex(pos)
 | 
			
		||||
        rgx_list = MorphologyRegex(pos).rgx
 | 
			
		||||
        rgx_str = ""
 | 
			
		||||
        for position in rgx_list:
 | 
			
		||||
            if position == ".":
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user