# File listing metadata: 399 lines, 19 KiB
import sys
from copy import copy
from ResultNode import ResultNode
from ResultTree import ResultTree
from Value import Value
from generic import generate_key
class Tree(object):
def __init__(self, index, form, lemma, upos, xpos, deprel, feats, feats_detailed, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, head):
    """Create a tree node whose attribute values are shared through lookup tables.

    Each raw attribute value is looked up in the matching ``*_dict`` table;
    when it is absent a new ``Value`` wrapping it is created and stored
    there, so nodes with equal raw values reference the same object.

    :param index: stored unchanged as ``self.index``.
    :param form: raw value looked up in / added to ``form_dict``.
    :param lemma: raw value looked up in / added to ``lemma_dict``.
    :param upos: raw value looked up in / added to ``upos_dict``.
    :param xpos: raw value looked up in / added to ``xpos_dict``.
    :param deprel: raw value looked up in / added to ``deprel_dict``.
    :param feats: raw value looked up in / added to ``feats_dict``.
    :param feats_detailed: mapping ``feature name -> iterable of values``;
        only the first value of each iterable is used.
    :param form_dict: shared table for ``form`` (mutated in place).
    :param lemma_dict: shared table for ``lemma`` (mutated in place).
    :param upos_dict: shared table for ``upos`` (mutated in place).
    :param xpos_dict: shared table for ``xpos`` (mutated in place).
    :param deprel_dict: shared table for ``deprel`` (mutated in place).
    :param feats_dict: shared table for ``feats`` (mutated in place).
    :param feats_detailed_dict: shared two-level table
        ``feature name -> {value -> Value}`` (mutated in place).
    :param head: stored unchanged as ``self.parent``.
    """
    # The original guard only looked at 'feats'; also cover the case where
    # 'feats_detailed' itself is missing so the loop below never hits an
    # AttributeError.
    if not hasattr(self, 'feats') or not hasattr(self, 'feats_detailed'):
        self.feats_detailed = {}

    # (attribute name, raw value, shared table) in the original lookup order.
    interned_attributes = (
        ('form', form, form_dict),
        ('lemma', lemma, lemma_dict),
        ('upos', upos, upos_dict),
        ('xpos', xpos, xpos_dict),
        ('deprel', deprel, deprel_dict),
        ('feats', feats, feats_dict),
    )
    for attribute, raw_value, table in interned_attributes:
        if raw_value not in table:
            table[raw_value] = Value(raw_value)
        setattr(self, attribute, table[raw_value])

    for feat, feat_values in feats_detailed.items():
        # Only the first listed value of a feature is considered.
        feat_value = next(iter(feat_values))
        shared_values = feats_detailed_dict.setdefault(feat, {})
        if feat_value not in shared_values:
            shared_values[feat_value] = Value(feat_value)
        self.feats_detailed.setdefault(feat, {})[feat_value] = shared_values[feat_value]

    self.parent = head
    self.children = []
    self.children_split = -1

    self.index = index

    # for caching answers to questions
    self.cache = {}
# NOTE(review): no body is visible for this method in this view of the file.
# Presumably it appends `child` to `self.children` (the list created in
# __init__) — confirm against the full source.
def add_child(self, child):
def set_parent(self, parent):
    """Replace this node's parent reference.

    :param parent: value stored as ``self.parent``.
    """
    self.parent = parent
def fits_static_requirements_feats(self, query_tree):
    """Check the per-feature constraints of a query against this node.

    :param query_tree: mapping describing the query; only its optional
        ``'feats_detailed'`` entry (``feature name -> required value``) is
        read here.
    :return: ``True`` when the query has no ``'feats_detailed'`` entry or
        when every listed feature is present on this node and the first
        value stored for it equals the required value; ``False`` otherwise.
    """
    if 'feats_detailed' not in query_tree:
        return True

    for feat, required_value in query_tree['feats_detailed'].items():
        node_values = self.feats_detailed.get(feat)
        # A missing (or empty) feature entry can never satisfy the query.
        if not node_values:
            return False
        if required_value != next(iter(node_values.values())).get_value():
            return False

    return True
def fits_permanent_requirements(self, filters):
    """Check this node against the root whitelist.

    :param filters: mapping; only ``filters['root_whitelist']`` is read.
        It is a sequence of options, each a mapping
        ``attribute name -> required value``.
    :return: ``True`` when the whitelist is empty or when at least one
        option is fully satisfied by this node; ``False`` otherwise.

    Keys ``'deprel'``, ``'feats'``, ``'form'``, ``'lemma'`` and ``'upos'``
    are compared with the node attribute of the same name. Every other key
    is treated as a detailed-feature name and compared with the first value
    stored for it in ``self.feats_detailed``.
    """
    root_whitelist = filters['root_whitelist']
    if not root_whitelist:
        return True

    main_attributes = ('deprel', 'feats', 'form', 'lemma', 'upos')

    for option in root_whitelist:
        option_matches = True
        for key, required_value in option.items():
            if key in main_attributes:
                actual_value = getattr(self, key).get_value()
            else:
                node_values = self.feats_detailed.get(key)
                # Unknown (or empty) feature: this option cannot match.
                if not node_values:
                    option_matches = False
                    break
                actual_value = next(iter(node_values.values())).get_value()

            if required_value != actual_value:
                option_matches = False
                break

        if option_matches:
            return True

    return False
def fits_temporary_requirements(self, filters):
    """Check this node's dependency label against the label whitelist.

    :param filters: mapping; only ``filters['label_whitelist']`` is read.
    :return: ``True`` when the whitelist is empty or contains the value of
        ``self.deprel``; ``False`` otherwise.
    """
    label_whitelist = filters['label_whitelist']
    if not label_whitelist:
        return True
    return self.deprel.get_value() in label_whitelist
# Check whether this node satisfies the per-node constraints of `query_tree`.
# Each of 'form', 'lemma', 'upos', 'xpos', 'deprel' and 'feats' is compared
# with the node's own value only when that key is present in `query_tree`.
# When filters['complete_tree_type'] is truthy the node must additionally
# either have no children while the query has no 'children' key, or have
# exactly as many children as the query lists.
# NOTE(review): the expression ends with a dangling `and \` — its final
# operand is not visible in this view. Presumably it is a call to
# self.fits_static_requirements_feats(query_tree); confirm against the full source.
def fits_static_requirements(self, query_tree, filters):
return ('form' not in query_tree or query_tree['form'] == self.form.get_value()) and \
('lemma' not in query_tree or query_tree['lemma'] == self.lemma.get_value()) and \
('upos' not in query_tree or query_tree['upos'] == self.upos.get_value()) and \
('xpos' not in query_tree or query_tree['xpos'] == self.xpos.get_value()) and \
('deprel' not in query_tree or query_tree['deprel'] == self.deprel.get_value()) and \
('feats' not in query_tree or query_tree['feats'] == self.feats.get_value()) and \
(not filters['complete_tree_type'] or (len(self.children) == 0 and 'children' not in query_tree) or ('children' in query_tree and len(self.children) == len(query_tree['children']))) and \
# Generator that walks over `children` and, for each child, yields the triple
# (child, child_queries, child_queries_metadata). The value passed back into
# the generator (the result of the `yield` expression) becomes
# `partial_results` and is consulted on the next iteration. After the last
# child a final (None, None, None) is yielded.
# `all_query_indices` is indexed as all_query_indices[result_index][0] and is
# iterated as (group, is_permanent) pairs.
def generate_children_queries(self, all_query_indices, children):
partial_results = {}
# list of triples (index of query in group, group of query, is permanent)
child_queries_metadata = []
for child_index, child in enumerate(children):
new_queries = []
# add continuation queries to children: a query part advances to the next
# part only if the previous part produced a non-empty partial result and
# the group still has a further part
for result_part_index, result_index, is_permanent in child_queries_metadata:
if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0:
if len(all_query_indices[result_index][0]) > result_part_index + 1:
new_queries.append((result_part_index + 1, result_index, is_permanent))
child_queries_metadata = new_queries
# add new queries to children (each group is started from its part 0)
for result_index, (group, is_permanent) in enumerate(all_query_indices):
# check if node has enough children for query to be possible
if len(children) - len(group) >= child_index:
child_queries_metadata.append((0, result_index, is_permanent))
child_queries = []
# NOTE(review): no body is visible for the loop below in this view;
# presumably it fills `child_queries` from `all_query_indices` — confirm.
for result_part_index, result_index, _ in child_queries_metadata:
partial_results = yield child, child_queries, child_queries_metadata
yield None, None, None
# NOTE(review): no body is visible for this method in this view of the file.
# Presumably it merges `new_subtree` into `old_subtree` — confirm against the
# full source.
def add_subtrees(self, old_subtree, new_subtree):
# Ask every child for answers to the children-queries collected by the caller
# and combine them per parent query.
# Returns the pair (partial_answers, complete_answers): `partial_answers` has
# one slot per permanent + temporary query, `complete_answers` one slot per
# permanent query.
# NOTE(review): several statements are not visible in this view — the `try:`
# that belongs to `except ValueError:`, the code that fills
# `child_queries_flatten_dedup` / `child_queries_flatten_dedup_indices`, and
# the bodies of the two `for i ...` loops inside the child loop. Confirm
# against the full source before relying on the details below.
def get_all_query_indices(self, temporary_query_nb, permanent_query_nb, permanent_query_trees, all_query_indices, children, create_output_string, filters):
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
complete_answers = [[] for i in range(permanent_query_nb)]
# first element of every entry in all_query_indices (the list of child query parts)
# TODO try to erase!!!
child_queries = [all_query_indice[0] for all_query_indice in all_query_indices]
# number of parts per query; used at the end to slice the flat answer list
answers_lengths = [len(query) for query in child_queries]
child_queries_flatten = [query_part for query in child_queries for query_part in query]
all_new_partial_answers = [[] for query_part in child_queries_flatten]
# deduplicated query parts, so each distinct part is sent to a child only once
child_queries_flatten_dedup = []
child_queries_flatten_dedup_indices = []
for query_part in child_queries_flatten:
index = child_queries_flatten_dedup.index(query_part)
except ValueError:
index = len(child_queries_flatten_dedup)
# ask children all queries/partial queries
for child in children:
# obtain children results
new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup,
create_output_string, filters)
assert len(new_partial_answers_dedup) == len(child_queries_flatten_dedup)
# duplicate results again on correct places
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
for i in range(len(new_complete_answers)):
# TODO add order rearrangement (TO KEY)
# merge answers in appropriate way
i = 0
# iterate over all answers per queries
for answer_i, answer_length in enumerate(answers_lengths):
# iterate over answers of query: take the `answer_length` consecutive
# entries that belong to this query and combine them
partial_answers[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
i += answer_length
return partial_answers, complete_answers
# Handle the query at position `i_query`, where positions below
# len(active_permanent_query_trees) address permanent queries and the rest
# address temporary ones. A ResultNode is built for this node; a list holding
# a childless ResultTree is appended to `merged_partial_subtrees`, and
# `i_answer` is advanced when the query has a 'children' entry. The (possibly
# advanced) `i_answer` is returned.
# NOTE(review): the two `self.create_output_children(...))` lines have one
# closing parenthesis too many and the `else` branches are not visible, so
# the enclosing call and the exact branch structure cannot be read from this
# view — presumably the call result is appended to `merged_partial_subtrees`;
# confirm against the full source.
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees,
create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
node = ResultNode(self, self.index, create_output_string)
if i_query < len(active_permanent_query_trees):
if 'children' in active_permanent_query_trees[i_query]:
self.create_output_children(partial_subtrees[i_answer], [ResultTree(node, [], filters)], filters))
i_answer += 1
merged_partial_subtrees.append([ResultTree(node, [], filters)])
if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]:
self.create_output_children(partial_subtrees[i_answer], [ResultTree(node, [], filters)], filters))
i_answer += 1
merged_partial_subtrees.append([ResultTree(node, [], filters)])
return i_answer
def get_unigrams(self, create_output_strings, filters):
    """Collect one unigram key for this node and for every descendant.

    :param create_output_strings: passed through to ``generate_key``.
    :param filters: not used here; only forwarded to the children.
    :return: list starting with the element at index 1 of
        ``generate_key(self, create_output_strings, print_lemma=False)``,
        followed by the unigrams of each child in ``self.children`` order.
    """
    unigrams = [generate_key(self, create_output_strings, print_lemma=False)[1]]
    for child in self.children:
        unigrams.extend(child.get_unigrams(create_output_strings, filters))
    return unigrams
# Evaluate the permanent and temporary query trees on this node and return
# the pair (partial_answers, complete_answers); `partial_answers` has one
# slot per entry of `temporary_query_trees`.
# NOTE(review): this view of the method is incomplete — the two `:param`
# lines below look like the remains of a docstring whose delimiters are
# missing, nothing visibly appends to `active_permanent_query_trees`,
# `active_temporary_query_trees` or `successful_temporary_queries`, the call
# to `get_all_query_indices` passes fewer arguments than its signature in
# this file lists, and `add_subtree` is computed but not used. Confirm
# against the full source.
def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters):
:param permanent_query_trees:
:param temporary_query_trees:
# list of all children queries grouped by parent queries;
# each entry is (children of the query, is_permanent)
all_query_indices = []
active_permanent_query_trees = []
for permanent_query_tree in permanent_query_trees:
if self.fits_static_requirements(permanent_query_tree, filters) and self.fits_permanent_requirements(filters):
if 'children' in permanent_query_tree:
all_query_indices.append((permanent_query_tree['children'], True))
# r_all_query_indices.append((permanent_query_tree['r_children'], True))
active_temporary_query_trees = []
successful_temporary_queries = []
for i, temporary_query_tree in enumerate(temporary_query_trees):
if self.fits_static_requirements(temporary_query_tree, filters) and self.fits_temporary_requirements(filters):
if 'children' in temporary_query_tree:
all_query_indices.append((temporary_query_tree['children'], False))
partial_subtrees, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
all_query_indices, self.children,
create_output_string, filters)
merged_partial_answers = []
i_question = 0
# i_answer is necessary, because some queries may be answered at the beginning and were not passed to children.
# i_answer is used to point where we are inside answers
i_answer = 0
# go over all permanent and temporary query trees
while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees):
# permanent query trees always have left and right child
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees,
create_output_string, merged_partial_answers, i_question, i_answer, filters)
i_question += 1
for i in range(len(active_permanent_query_trees)):
# erase first and last brackets when adding new query result
add_subtree = [subtree.finalize_result() for subtree in merged_partial_answers[i]]
# answers to valid queries: map each successful temporary query back to
# its original position in temporary_query_trees
partial_answers = [[] for i in range(len(temporary_query_trees))]
for inside_i, outside_i in enumerate(successful_temporary_queries):
partial_answers[outside_i] = merged_partial_answers[
len(active_permanent_query_trees) + inside_i]
return partial_answers, complete_answers
def create_children_groups(left_parts, right_parts):
    """Combine two lists of groups into every left-then-right concatenation.

    :param left_parts: sequence of groups (each a list).
    :param right_parts: sequence of groups (each an iterable).
    :return: ``right_parts`` itself when ``left_parts`` is empty,
        ``left_parts`` itself when ``right_parts`` is empty; otherwise a new
        list holding, for every pair ``(left, right)`` in input order, a
        shallow copy of ``left`` extended by ``right``. The input groups are
        not modified.

    NOTE(review): this function takes no ``self`` yet is invoked as
    ``self.create_children_groups(...)`` elsewhere in this file, so it
    presumably carries a ``@staticmethod`` decorator that is not visible in
    this view — confirm.
    """
    if not left_parts:
        return right_parts

    if not right_parts:
        return left_parts

    all_children_group_possibilities = []
    for left_part in left_parts:
        for right_part in right_parts:
            # Copy first so the caller's left group is left untouched; the
            # combined group was previously built and then thrown away.
            new_part = copy(left_part)
            new_part.extend(right_part)
            all_children_group_possibilities.append(new_part)
    return all_children_group_possibilities
def merge_answer(answer1, answer2, base_answer_i, answer_j):
    """Pair up entries of two answers that sit at different positions.

    :param answer1: sequence of results (each supporting ``+``).
    :param answer2: sequence of results (each supporting ``+``).
    :param base_answer_i: not used by this function.
    :param answer_j: not used by this function.
    :return: ``(merged_results, merged_indices)`` — for every pair of
        positions ``(i, j)`` with ``i != j``, ``merged_results`` receives
        ``answer1[i] + answer2[j]`` and ``merged_indices`` receives
        ``[i, j]``, in matching order.

    NOTE(review): this function takes no ``self``; if it lives inside the
    class it presumably carries a ``@staticmethod`` decorator that is not
    visible in this view — confirm.
    """
    merged_results = []
    merged_indices = []
    for answer1p_i, old_result in enumerate(answer1):
        for answer2p_i, new_result in enumerate(answer2):
            if answer1p_i == answer2p_i:
                continue
            # TODO add comparison answers with different indices if equal than ignore
            merged_results.append(old_result + new_result)
            # Record where each merged result came from; the index pair was
            # previously computed but never stored.
            merged_indices.append([answer1p_i, answer2p_i])
    return merged_results, merged_indices
# Combine the answers of one child (`child`) with the partial results in
# `new_results` and return the list `results`.
# `children_groups` is built up by repeatedly calling
# self.create_children_groups with one single-element group per answer part.
# NOTE(review): this view is incomplete — there is no visible `else:` between
# the two assignments to `new_child` (presumably the sort is applied only when
# filters['node_order'] is falsy), and the copied `new_result` is neither
# combined with `children` nor appended to `results` in the visible lines.
# Confirm against the full source.
def merge_results3(self, child, new_results, filters):
if filters['node_order']:
new_child = child
new_child = sorted(child, key=lambda x: x[0].get_key())
children_groups = []
for i_answer, answer in enumerate(new_child):
children_groups = self.create_children_groups(children_groups, [[answer_part] for answer_part in answer])
results = []
for result in new_results:
for children in children_groups:
new_result = copy(result)
return results
def create_output_children(self, children, new_results, filters):
    """Merge ``new_results`` with every entry of ``children``.

    :param children: iterable whose items are each passed to
        ``self.merge_results3`` as its ``child`` argument.
    :param new_results: forwarded unchanged to ``self.merge_results3``.
    :param filters: forwarded unchanged to ``self.merge_results3``.
    :return: a single flat list with the items returned by
        ``self.merge_results3`` for each child, in ``children`` order.
    """
    return [
        merged
        for child in children
        for merged in self.merge_results3(child, new_results, filters)
    ]
# Build complete answers of `answer_length` parts out of `separated_answers`,
# which is indexed as separated_answers[answer_part_i][child_i]. Partly built
# answers are lists of `answer_length` slots (initially all None) with a
# parallel list recording which child index filled each slot. The method
# returns `l_ordered_built_trees`.
# NOTE(review): this view is incomplete — nothing visibly appends to
# `new_partly_built_trees`, `built_trees`, `built_trees_architecture_indices`,
# `unique_trees_architecture` or `l_ordered_built_trees`, and the branches
# that follow `if completed_tree_part:` and `if is_unique:` are missing.
# Confirm against the full source before relying on the comments below.
def create_answers(self, separated_answers, answer_length, filters):
partly_built_trees = [[None] * answer_length]
partly_built_trees_architecture_indices = [[None] * answer_length]
built_trees = []
built_trees_architecture_indices = []
# iterate over children first, so that new partly built trees are added only after all results of specific
# child are added
for child_i in range(len(separated_answers[0])):
new_partly_built_trees = []
new_partly_built_trees_architecture_indices = []
# iterate over answers parts
for answer_part_i in range(len(separated_answers)):
# necessary because some parts do not pass filters and are not added
if separated_answers[answer_part_i][child_i]:
for tree_part_i, tree_part in enumerate(partly_built_trees):
# only fill a slot that is still empty
if not tree_part[answer_part_i]:
new_tree_part = copy(tree_part)
new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i])
new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i]
new_tree_part_architecture_indices[answer_part_i] = child_i
# a tree part is complete once every slot holds a truthy value
completed_tree_part = True
for val_i, val in enumerate(new_tree_part):
if not val:
completed_tree_part = False
if completed_tree_part:
# pass over repetitions of same words
l_ordered_built_trees, unique_trees_architecture = [], []
if built_trees:
# sort both arrays together by architecture indices
temp_trees_index, temp_trees = (list(t) for t in zip(
*sorted(zip(built_trees_architecture_indices, built_trees))))
# order outputs and erase duplicates
for tree, tree_index in zip(temp_trees, temp_trees_index):
new_tree_index, new_tree = (list(t) for t in zip(*sorted(zip(tree_index, tree))))
# TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append!
# a tree counts as a duplicate when every part has the same length and
# the same order keys as an already accepted tree
is_unique = True
for unique_tree in unique_trees_architecture:
already_in = True
for part_i in range(len(unique_tree)):
if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].get_order_key() != new_tree[part_i][i_unique_part].get_order_key() for i_unique_part in range(len(unique_tree[part_i]))):
already_in = False
if already_in:
is_unique = False
if is_unique:
return l_ordered_built_trees