From a13559941e0e2e152f53b8524b7e4c52fb0db3ca Mon Sep 17 00:00:00 2001
From: Luka
Date: Mon, 11 Nov 2019 14:52:35 +0100
Subject: [PATCH] Added root filtering.

Add an optional 'root_whitelist' setting that limits the nodes at which
a permanent query tree is activated. The value is a '|'-separated list
of options; each option is a '&'-separated list of key=value pairs.

Tree nodes now keep the complete feats string in `feats` and the
per-feature values in `feats_detailed`.
---
 Tree.py                 | 65 +++++++++++++++++++++++++++++++----------
 dependency-parsetree.py | 62 ++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 39 deletions(-)

diff --git a/Tree.py b/Tree.py
index e4a6529..0daec39 100644
--- a/Tree.py
+++ b/Tree.py
@@ -7,9 +7,9 @@ from Value import Value
 
 
 class Tree(object):
-    def __init__(self, form, lemma, upos, xpos, deprel, feats, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_complete_dict, head):
-        # if not hasattr(self, 'feats'):
-        #     self.feats = {}
+    def __init__(self, form, lemma, upos, xpos, deprel, feats, feats_detailed, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, head):
+        if not hasattr(self, 'feats_detailed'):
+            self.feats_detailed = {}
 
         # form_unicode = str(form).encode("utf-8")
         if form not in form_dict:
@@ -27,15 +27,15 @@ class Tree(object):
         if deprel not in deprel_dict:
             deprel_dict[deprel] = Value(deprel)
         self.deprel = deprel_dict[deprel]
-        if feats not in feats_complete_dict:
-            feats_complete_dict[feats] = Value(feats)
-        self.feats_complete = feats_complete_dict[feats]
-        # for feat in feats.keys():
-        #     if next(iter(feats[feat])) not in feats_dict[feat]:
-        #         feats_dict[feat][next(iter(feats[feat]))] = Value(next(iter(feats[feat])))
-        #     if not feat in self.feats:
-        #         self.feats[feat] = {}
-        #     self.feats[feat][next(iter(feats[feat]))] = feats_dict[feat][next(iter(feats[feat]))]
+        if feats not in feats_dict:
+            feats_dict[feats] = Value(feats)
+        self.feats = feats_dict[feats]
+        for feat in feats_detailed.keys():
+            if next(iter(feats_detailed[feat])) not in feats_detailed_dict.setdefault(feat, {}):
+                feats_detailed_dict[feat][next(iter(feats_detailed[feat]))] = Value(next(iter(feats_detailed[feat])))
+            if not feat in self.feats_detailed:
+                self.feats_detailed[feat] = {}
+            self.feats_detailed[feat][next(iter(feats_detailed[feat]))] = feats_detailed_dict[feat][next(iter(feats_detailed[feat]))]
 
         # self.position = position
         self.parent = head
@@ -62,8 +62,41 @@ class Tree(object):
 
     #     return True
 
 
-    def fits_temporary_requirements(self, filters):
+    def fits_permanent_requirements(self, filters):
+        """Check this node against the 'root_whitelist' filter.
+
+        A node passes when the whitelist is empty or when every attribute of
+        at least one option matches; keys outside the main attributes are
+        looked up in self.feats_detailed.
+        """
+        main_attributes = ['deprel', 'feats', 'form', 'lemma', 'upos', 'xpos']
+        if not filters['root_whitelist']:
+            return True
+
+        for option in filters['root_whitelist']:
+            filter_passed = True
+
+            # any key outside main_attributes must match a detailed feature
+            for key in option.keys():
+                if key not in main_attributes:
+                    if key not in self.feats_detailed or option[key] != list(self.feats_detailed[key].items())[0][1].get_value():
+                        filter_passed = False
+
+            filter_passed = filter_passed and \
+                ('deprel' not in option or option['deprel'] == self.deprel.get_value()) and \
+                ('feats' not in option or option['feats'] == self.feats.get_value()) and \
+                ('form' not in option or option['form'] == self.form.get_value()) and \
+                ('lemma' not in option or option['lemma'] == self.lemma.get_value()) and \
+                ('upos' not in option or option['upos'] == self.upos.get_value()) and \
+                ('xpos' not in option or option['xpos'] == self.xpos.get_value())
+
+            if filter_passed:
+                return True
+
+        return False
+
+    def fits_temporary_requirements(self, filters):
         return not filters['label_whitelist'] or self.deprel.get_value() in filters['label_whitelist']
 
     def fits_static_requirements(self, query_tree, filters):
@@ -72,7 +105,7 @@ class Tree(object):
                ('upos' not in query_tree or query_tree['upos'] == self.upos.get_value()) and \
                ('xpos' not in query_tree or query_tree['xpos'] == self.xpos.get_value()) and \
                ('deprel' not in query_tree or query_tree['deprel'] == self.deprel.get_value()) and \
-               ('feats' not in query_tree or query_tree['feats'] == self.feats_complete.get_value()) and \
+               ('feats' not in query_tree or query_tree['feats'] == self.feats.get_value()) and \
                (not filters['complete_tree_type'] or (len(self.children) == 0 and 'children' not in query_tree) or ('children' in query_tree and len(self.children) == len(query_tree['children'])))
                # self.fits_static_requirements_feats(query_tree)
 
@@ -302,7 +335,7 @@ class Tree(object):
 
         active_permanent_query_trees = []
         for permanent_query_tree in permanent_query_trees:
-            if self.fits_static_requirements(permanent_query_tree, filters):
+            if self.fits_static_requirements(permanent_query_tree, filters) and self.fits_permanent_requirements(filters):
                 active_permanent_query_trees.append(permanent_query_tree)
                 if 'children' in permanent_query_tree:
                     all_query_indices.append((permanent_query_tree['children'], True))
@@ -617,4 +650,4 @@ def create_output_string_xpos(tree):
     return tree.xpos.get_value()
 
 def create_output_string_feats(tree):
-    return tree.feats_complete.get_value()
+    return tree.feats.get_value()
diff --git a/dependency-parsetree.py b/dependency-parsetree.py
index b854b52..48689bc 100644
--- a/dependency-parsetree.py
+++ b/dependency-parsetree.py
@@ -11,21 +11,21 @@ import pyconll
 from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
 
 # for separate searches of feats
-# feats_list = [
-#     # lexical features
-#     'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
-#
-#     # Inflectional features (nominal)
-#     'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
-#
-#     # Inflectional features (verbal)
-#     'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
-#
-#     # Other
-#     'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
-# ]
-#
-# feats_dict = {key: {} for key in feats_list}
+feats_detailed_list = [
+    # lexical features
+    'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
+
+    # Inflectional features (nominal)
+    'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
+
+    # Inflectional features (verbal)
+    'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
+
+    # Other
+    'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
+]
+
+feats_detailed_dict = {key: {} for key in feats_detailed_list}
 
 
 def decode_query(orig_query, dependency_type):
@@ -65,10 +65,10 @@ def decode_query(orig_query, dependency_type):
                 elif orig_query_split[0] == 'feats':
                     decoded_query['feats'] = orig_query_split[1]
                     # return decoded_query
-                # elif orig_query_split[0] in feats_list:
-                #     decoded_query['feats'] = {}
-                #     decoded_query['feats'][orig_query_split[0]] = orig_query_split[1]
-                #     return decoded_query
+                elif orig_query_split[0] in feats_detailed_list:
+                    decoded_query['feats_detailed'] = {}
+                    decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1]
+                    return decoded_query
                 elif not new_query:
                     raise Exception('Not supported yet!')
                 else:
@@ -122,7 +122,7 @@ def create_trees(config):
 
         train = pyconll.load_from_file(input_path)
 
-        form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_complete_dict = {}, {}, {}, {}, {}, {}
+        form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict = {}, {}, {}, {}, {}, {}
 
         all_trees = []
 
@@ -136,9 +136,9 @@ def create_trees(config):
                 #     token_feats += k + next(iter(v)) + '|'
                 # token_feats = token_feats[:-1]
                 # TODO check if 5th place is always there for feats
-                token_feats = token._fields[5]
-                node = Tree(token.form, token.lemma, token.upos, token.xpos, token.deprel, token_feats, form_dict,
-                            lemma_dict, upos_dict, xpos_dict, deprel_dict, None, feats_complete_dict, token.head)
+                feats = token._fields[5]
+                node = Tree(token.form, token.lemma, token.upos, token.xpos, token.deprel, feats, token.feats, form_dict,
+                            lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, token.head)
                 token_nodes.append(node)
                 if token.deprel == 'root':
                     root = node
@@ -276,6 +276,22 @@ def main():
     else:
         filters['label_whitelist'] = []
 
+    if config.has_option('settings', 'root_whitelist'):
+        # each '|'-separated option is a '&'-joined list of key=value pairs
+        filters['root_whitelist'] = []
+
+        for option in config.get('settings', 'root_whitelist').split('|'):
+            attribute_dict = {}
+            for attribute in option.split('&'):
+                value = attribute.split('=', 1)
+                # assert value[0] in ['deprel', 'lemma', 'upos', 'xpos', 'form',
+                #                     'feats'], '"root_whitelist" is not set up correctly'
+                attribute_dict[value[0]] = value[1]
+            filters['root_whitelist'].append(attribute_dict)
+        # filters['root_whitelist'] = [{'upos': 'NOUN', 'Case': 'Nom'}, {'upos': 'ADJ', 'Degree': 'Sup'}]
+    else:
+        filters['root_whitelist'] = []
+
     filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
 
     for tree in all_trees[2:]: