diff --git a/Tree.py b/Tree.py index 33eaf8d..0fb98bf 100644 --- a/Tree.py +++ b/Tree.py @@ -1,4 +1,5 @@ import sys +from copy import copy from pyconll.unit import Token @@ -29,10 +30,14 @@ class Tree(object): self.l_children = [] self.r_children = [] + self.index = 0 + def add_l_child(self, child): + child.index = len(self.l_children) self.l_children.append(child) def add_r_child(self, child): + child.index = len(self.l_children) + len(self.r_children) self.r_children.append(child) def set_parent(self, parent): @@ -110,7 +115,7 @@ class Tree(object): if not is_permanent: partial_subtrees[result_index].append([]) - def get_all_query_indices(self, temporary_query_trees_size, completed_subtrees_size, permanent_query_trees, l_all_query_indices, children, create_output_string): + def get_all_query_indices_old(self, temporary_query_trees_size, completed_subtrees_size, permanent_query_trees, l_all_query_indices, children, create_output_string): partial_subtrees = [[] for i in range(completed_subtrees_size + temporary_query_trees_size)] completed_subtrees = [[] for i in range(completed_subtrees_size)] @@ -135,6 +140,162 @@ class Tree(object): return partial_subtrees, completed_subtrees + def get_all_query_indices(self, temporary_query_trees_size, completed_subtrees_size, permanent_query_trees, all_query_indices, children, create_output_string): + # l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), + # len(permanent_query_trees), + # permanent_query_trees, + # l_all_query_indices, self.l_children, + # create_output_string) + partial_subtrees = [[] for i in range(completed_subtrees_size + temporary_query_trees_size)] + partial_subtrees_architectures = [[] for i in range(completed_subtrees_size + temporary_query_trees_size)] + completed_subtrees = [[] for i in range(completed_subtrees_size)] + + # list of pairs (index of query in group, group of query) + partial_results_dict = {} + + # TODO try to erase!!! + child_queries = [all_query_indice[0] for all_query_indice in all_query_indices] + + answers_lengths = [len(query) for query in child_queries] + + child_queries_flatten = [query_part for query in child_queries for query_part in query] + + all_new_partial_answers = [[] for query_part in child_queries_flatten] + all_new_partial_answers_architecture = [[] for query_part in child_queries_flatten] + + # ask children all queries/partial queries + for child in children: + # obtain children results + new_partial_answers_architecture, new_partial_answers, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries_flatten, + create_output_string) + + assert len(new_partial_answers) == len(child_queries_flatten) + for i, new_partial_subtree in enumerate(new_partial_answers): + all_new_partial_answers[i].append(new_partial_subtree) + all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i]) + + # add 6 queries from 3 split up + # self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices, + # partial_results_dict, partial_subtrees) + + for i in range(len(new_completed_subtrees)): + completed_subtrees[i].extend(new_completed_subtrees[i]) + + # merge answers in appropriate way + i = 0 + # iterate over all answers per queries + for answer_i, answer_length in enumerate(answers_lengths): + # iterate over answers of query + partial_subtrees[answer_i], partial_subtrees_architectures[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], all_new_partial_answers_architecture[i:i + answer_length], answer_length, len(self.l_children)) + # while i < answers_length: + # self.create_grouped_answers() + # i += 1 + i += answer_length + + # merged_results = [] + # for old_result in old_results: + # for new_result in new_results: + # merged_results.append(old_result + new_result) + # return merged_results + # children_queries_generator = self.generate_children_queries(all_query_indices, children) + # + # child_index = 0 + # child, child_queries, child_queries_metadata = next(children_queries_generator) + # while child: + # # obtain children results + # new_partial_subtrees, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries, create_output_string) + # + # self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices, + # partial_results_dict, partial_subtrees) + # + # for i in range(len(new_completed_subtrees)): + # completed_subtrees[i].extend(new_completed_subtrees[i]) + # child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict) + # child_index += 1 + + return partial_subtrees_architectures, partial_subtrees, completed_subtrees + + def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, l_partial_subtrees, l_partial_subtrees_architecture, + r_partial_subtrees, r_partial_subtrees_architecture, create_output_string, merged_partial_subtrees, merged_partial_subtrees_architecture, i, i_left, i_right): + # string_output = '' + if i < len(active_permanent_query_trees): + if ('l_children' in active_permanent_query_trees[i] and 'r_children' in active_permanent_query_trees[i]): + merged_partial_subtree = self.create_output_left_children(l_partial_subtrees[i_left], [create_output_string(self)]) + merged_partial_subtrees.append( + self.create_output_right_children(merged_partial_subtree, r_partial_subtrees[i_right])) + + merged_partial_subtree_architecture = self.create_output_left_children(l_partial_subtrees_architecture[i_left], [str([self.index])]) + merged_partial_subtrees_architecture.append( + self.create_output_right_children(merged_partial_subtree_architecture, l_partial_subtrees_architecture[i_right])) + # merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], + # [[create_output_string(self)]]) + # merged_partial_subtrees.append( + # self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) + # # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) + i_left += 1 + i_right += 1 + + elif 'l_children' in active_permanent_query_trees[i]: + merged_partial_subtrees.append(self.create_output_left_children(l_partial_subtrees[i_left], [create_output_string(self)])) + merged_partial_subtrees_architecture.append(self.create_output_left_children(l_partial_subtrees_architecture[i_left], [str([self.index])])) + # merged_partial_subtrees.append( + # self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) + i_left += 1 + + elif 'r_children' in active_permanent_query_trees[i]: + merged_partial_subtrees.append(self.create_output_right_children([create_output_string(self)], r_partial_subtrees[i_right])) + merged_partial_subtrees_architecture.append( + self.create_output_right_children(r_partial_subtrees_architecture[i_left], [str([self.index])])) + # merged_partial_subtrees.append( + # self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) + i_right += 1 + else: + merged_partial_subtrees.append([create_output_string(self)]) + merged_partial_subtrees_architecture.append([str([self.index])]) + # merged_partial_subtrees.append([[create_output_string(self)]]) + else: + if ('l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)] and 'r_children' in + active_temporary_query_trees[i - len(active_permanent_query_trees)]): + merged_partial_subtree = self.create_output_left_children(l_partial_subtrees[i_left], + [create_output_string(self)]) + merged_partial_subtrees.append( + self.create_output_right_children(merged_partial_subtree, r_partial_subtrees[i_right])) + + merged_partial_subtree_architecture = self.create_output_left_children( + l_partial_subtrees_architecture[i_left], [str([self.index])]) + merged_partial_subtrees_architecture.append( + self.create_output_right_children(merged_partial_subtree_architecture, + l_partial_subtrees_architecture[i_right])) + # merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]]) + # merged_partial_subtrees.append(self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) + # # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) + i_left += 1 + i_right += 1 + + elif 'l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: + merged_partial_subtrees.append( + self.create_output_left_children(l_partial_subtrees[i_left], [create_output_string(self)])) + merged_partial_subtrees_architecture.append( + self.create_output_left_children(l_partial_subtrees_architecture[i_left], [str([self.index])])) + # merged_partial_subtrees.append( + # self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) + i_left += 1 + + elif 'r_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: + merged_partial_subtrees.append( + self.create_output_right_children([create_output_string(self)], r_partial_subtrees[i_right])) + merged_partial_subtrees_architecture.append( + self.create_output_right_children(r_partial_subtrees_architecture[i_left], [str([self.index])])) + # merged_partial_subtrees.append( + # self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) + i_right += 1 + else: + merged_partial_subtrees.append([create_output_string(self)]) + merged_partial_subtrees_architecture.append([str([self.index])]) + # merged_partial_subtrees.append([[create_output_string(self)]]) + + return i_left, i_right + def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string): """ @@ -149,74 +310,105 @@ class Tree(object): active_permanent_query_trees = [] for permanent_query_tree in permanent_query_trees: if self.fits_static_requirements(permanent_query_tree): + if 'l_children' in permanent_query_tree and 'r_children' in permanent_query_tree: + permanent_query_tree['l_children'] += permanent_query_tree['r_children'] + del(permanent_query_tree['r_children']) + elif 'r_children' in permanent_query_tree: + permanent_query_tree['l_children'] = permanent_query_tree['r_children'] + del(permanent_query_tree['r_children']) active_permanent_query_trees.append(permanent_query_tree) if 'l_children' in permanent_query_tree: l_all_query_indices.append((permanent_query_tree['l_children'], True)) if 'r_children' in permanent_query_tree: - r_all_query_indices.append((permanent_query_tree['r_children'], True)) + r_all_query_indices.append((permanent_query_tree['l_children'], True)) + # r_all_query_indices.append((permanent_query_tree['r_children'], True)) active_temporary_query_trees = [] + successful_temporary_queries = [] for i, temporary_query_tree in enumerate(temporary_query_trees): if self.fits_static_requirements(temporary_query_tree): + if 'l_children' in temporary_query_tree and 'r_children' in temporary_query_tree: + temporary_query_tree['l_children'] += temporary_query_tree['r_children'] + del(temporary_query_tree['r_children']) + elif 'r_children' in temporary_query_tree: + temporary_query_tree['l_children'] = temporary_query_tree['r_children'] + del(temporary_query_tree['r_children']) active_temporary_query_trees.append(temporary_query_tree) + successful_temporary_queries.append(i) # if 'l_children' in temporary_query_tree and 'r_children' in temporary_query_tree: if 'l_children' in temporary_query_tree: l_all_query_indices.append((temporary_query_tree['l_children'], False)) if 'r_children' in temporary_query_tree: - r_all_query_indices.append((temporary_query_tree['r_children'], False)) - - l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, l_all_query_indices, self.l_children, create_output_string) - r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, r_all_query_indices, self.r_children, create_output_string) - - + # r_all_query_indices.append((temporary_query_tree['r_children'], False)) + r_all_query_indices.append((temporary_query_tree['l_children'], False)) + + # l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices_old(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, l_all_query_indices, self.l_children, create_output_string) + # r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices_old(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, r_all_query_indices, self.r_children, create_output_string) + + all_query_indices = l_all_query_indices + r_all_query_indices + + l_partial_subtrees_architecture, l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), + len(permanent_query_trees), + permanent_query_trees, + all_query_indices, self.l_children + self.r_children, + create_output_string) + # r_partial_subtrees_architecture, r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), + # len(permanent_query_trees), + # permanent_query_trees, + # r_all_query_indices, self.r_children, + # create_output_string) + r_partial_subtrees_architecture, r_partial_subtrees, r_completed_subtrees = [], [], [] merged_partial_subtrees = [] + merged_partial_subtrees_architecture = [] i = 0 i_left = 0 i_right = 0 # go over all permanent and temporary query trees while i < len(active_permanent_query_trees) + len(active_temporary_query_trees): # permanent query trees always have left and right child - if i < len(active_permanent_query_trees): - if ('l_children' in active_permanent_query_trees[i] and 'r_children' in active_permanent_query_trees[i]): - merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], - [[create_output_string(self)]]) - merged_partial_subtrees.append( - self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) - # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) - i_left += 1 - i_right += 1 - - elif 'l_children' in active_permanent_query_trees[i]: - merged_partial_subtrees.append( - self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) - i_left += 1 - - elif 'r_children' in active_permanent_query_trees[i]: - merged_partial_subtrees.append( - self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) - i_right += 1 - else: - merged_partial_subtrees.append([[create_output_string(self)]]) - else: - if ('l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)] and 'r_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]): - merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]]) - merged_partial_subtrees.append(self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) - # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) - i_left += 1 - i_right += 1 - - elif 'l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: - merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) - i_left += 1 - - elif 'r_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: - merged_partial_subtrees.append(self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) - i_right += 1 - else: - merged_partial_subtrees.append([[create_output_string(self)]]) - # if r_partial_subtrees[i]: - # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) + i_left, i_right = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, l_partial_subtrees, l_partial_subtrees_architecture, + r_partial_subtrees, r_partial_subtrees_architecture, create_output_string, merged_partial_subtrees, merged_partial_subtrees_architecture, i, i_left, i_right) + # if i < len(active_permanent_query_trees): + # if ('l_children' in active_permanent_query_trees[i] and 'r_children' in active_permanent_query_trees[i]): + # merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], + # [[create_output_string(self)]]) + # merged_partial_subtrees.append( + # self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) + # # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) + # i_left += 1 + # i_right += 1 + # + # elif 'l_children' in active_permanent_query_trees[i]: + # merged_partial_subtrees.append( + # self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) + # i_left += 1 + # + # elif 'r_children' in active_permanent_query_trees[i]: + # merged_partial_subtrees.append( + # self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) + # i_right += 1 + # else: + # merged_partial_subtrees.append([[create_output_string(self)]]) + # else: + # if ('l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)] and 'r_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]): + # merged_partial_subtree = self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]]) + # merged_partial_subtrees.append(self.merge_results(merged_partial_subtree, r_partial_subtrees[i_right])) + # # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) + # i_left += 1 + # i_right += 1 + # + # elif 'l_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: + # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i_left], [[create_output_string(self)]])) + # i_left += 1 + # + # elif 'r_children' in active_temporary_query_trees[i - len(active_permanent_query_trees)]: + # merged_partial_subtrees.append(self.merge_results([[create_output_string(self)]], r_partial_subtrees[i_right])) + # i_right += 1 + # else: + # merged_partial_subtrees.append([[create_output_string(self)]]) + # # if r_partial_subtrees[i]: + # # merged_partial_subtrees.append(self.merge_results(l_partial_subtrees[i], [[create_output_string(self)]])) i += 1 completed_subtrees = l_completed_subtrees @@ -227,7 +419,19 @@ class Tree(object): completed_subtrees[i].extend(merged_partial_subtrees[i]) for i in range(len(r_completed_subtrees)): completed_subtrees[i].extend(r_completed_subtrees[i]) - return merged_partial_subtrees[len(active_permanent_query_trees):], completed_subtrees + + # answers to valid queries + subtrees_architecture = [[] for i in range(len(temporary_query_trees))] + for inside_i, outside_i in enumerate(successful_temporary_queries): + subtrees_architecture[outside_i] = merged_partial_subtrees_architecture[len(active_permanent_query_trees) + inside_i] + + # answers to valid queries + subtrees = [[] for i in range(len(temporary_query_trees))] + for inside_i, outside_i in enumerate(successful_temporary_queries): + subtrees[outside_i] = merged_partial_subtrees[ + len(active_permanent_query_trees) + inside_i] + return subtrees_architecture, subtrees, completed_subtrees + # return merged_partial_subtrees_architecture[len(active_permanent_query_trees):], merged_partial_subtrees[len(active_permanent_query_trees):], completed_subtrees @staticmethod def merge_results(old_results, new_results): @@ -237,6 +441,143 @@ class Tree(object): merged_results.append(old_result + new_result) return merged_results + @staticmethod + def merge_answer(answer1, answer2, base_answer_i, answer_j): + merged_results = [] + merged_indices = [] + for answer1p_i, old_result in enumerate(answer1): + for answer2p_i, new_result in enumerate(answer2): + if answer1p_i != answer2p_i: + new_indices = [answer1p_i] + [answer2p_i] + sorted_indices = sorted(new_indices) + + if sorted_indices in merged_indices: + test = merged_indices.index(sorted(new_indices)) + # TODO add comparison answers with different indices if equal than ignore + merged_results.append(old_result + new_result) + merged_indices.append(new_indices) + return merged_results, merged_indices + + @staticmethod + def create_output_left_children(left_children, new_results): + merged_results = [] + for child in left_children: + for new_result in new_results: + res = '' + if type(child) == str: + res += '(' + child + ') < ' + else: + for el in sorted(child): + res += '(' + el + ') < ' + merged_results.append(res + new_result) + return merged_results + + @staticmethod + def create_output_right_children(new_results, right_children): + merged_results = [] + for child in right_children: + for new_result in new_results: + res = '' + if type(child) == str: + res += ' > (' + child + ')' + else: + for el in sorted(child): + res += ' > (' + el + ')' + merged_results.append(new_result + res) + # merged_results.append(new_result + ' > (' + child + ')') + return merged_results + + @staticmethod + def create_answers(separated_answers, separated_answers_architecture, answer_length, l_children_len): + # TODO + node_order = False + partly_built_trees = [[None] * answer_length] + partly_built_trees_architecture = [[None] * answer_length] + partly_built_trees_architecture_indices = [[None] * answer_length] + built_trees = [] + built_trees_architecture = [] + built_trees_architecture_indices = [] + # iterate over children first, so that new partly built trees are added only after all results of specific + # child are added + for child_i in range(len(separated_answers[0])): + new_partly_built_trees = [] + new_partly_built_trees_architecture = [] + new_partly_built_trees_architecture_indices = [] + # iterate over answers parts + for answer_part_i in range(len(separated_answers)): + # necessary because some parts do not pass filters and are not added + # if child_i < len(separated_answers[answer_part_i]) and separated_answers[answer_part_i][child_i]: + if separated_answers[answer_part_i][child_i]: + for tree_part_i, tree_part in enumerate(partly_built_trees): + # if tree_part[answer_part_i] equals None add new element in its place + if not tree_part[answer_part_i]: + new_tree_part = copy(tree_part) + new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i]) + new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i]) + new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i][0] + new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i][0] + new_tree_part_architecture_indices[answer_part_i] = child_i + completed_tree_part = True + for val_i, val in enumerate(new_tree_part): + if not val: + completed_tree_part = False + if completed_tree_part: + built_trees.append(new_tree_part) + built_trees_architecture.append(new_tree_part_architecture) + built_trees_architecture_indices.append(new_tree_part_architecture_indices) + else: + new_partly_built_trees.append(new_tree_part) + new_partly_built_trees_architecture.append(new_tree_part_architecture) + new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices) + + partly_built_trees.extend(new_partly_built_trees) + partly_built_trees_architecture.extend(new_partly_built_trees_architecture) + partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices) + + l_ordered_built_trees_architecture, l_ordered_built_trees, r_ordered_built_trees_architecture, r_ordered_built_trees, unique_trees_architecture = [], [], [], [], [] + + if built_trees: + # sort 3 arrays by architecture indices + temp_trees_architecture_indice, temp_trees, temp_trees_architectures = (list(t) for t in zip( + *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture)))) + + # order outputs and erase duplicates + # for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices): + for tree, tree_architecture, tree_architecture_indice in zip(temp_trees, temp_trees_architectures, temp_trees_architecture_indice): + new_tree_architecture_indice, new_tree, new_tree_architecture = (list(t) for t in zip(*sorted(zip(tree_architecture_indice, tree, tree_architecture)))) + # TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append! + is_unique = True + for unique_tree in unique_trees_architecture: + already_in = True + for part_i in range(len(unique_tree)): + if unique_tree[part_i] != new_tree_architecture[part_i]: + already_in = False + break + if already_in: + is_unique = False + break + + if is_unique: + unique_trees_architecture.append(new_tree_architecture) + if not node_order: + l_ordered_built_trees_architecture.append(new_tree_architecture) + l_ordered_built_trees.append(new_tree) + # TODO NODE ORDER = FALSE + # else: + # + # ordered_built_trees_architecture.append(tree_architecture) + # ordered_built_trees.append(tree) + # print("test") + # for answer1_i, answer1 in enumerate(separated_answers): + # for answer2_i, answer2 in enumerate(separated_answers): + # if answer1_i != answer2_i: + # res, res_i = self.merge_answer(answer1, answer2, answer1_i, answer2_i) + # print('aaa') + # + # pass + return l_ordered_built_trees, l_ordered_built_trees_architecture + + def create_output_string_form(tree): return tree.form.get_value() diff --git a/dependency-parsetree.py b/dependency-parsetree.py index 33ecb96..afa09c3 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -50,8 +50,11 @@ def decode_query(orig_query, dependency_type): return decoded_query # split over spaces if not inside braces - PATTERN = re.compile(r'''((?:[^ ()]|\([^(]*\))+)''') - all_orders = PATTERN.split(orig_query)[1::2] + # PATTERN = re.compile(r'''((?:[^ ()]|\([^.]*\))+)''') + # all_orders = PATTERN.split(orig_query) + # PATTERN = re.compile(r"(?:[^ ()]|\([^.]*\))+") + # all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", orig_query) + all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query) # all_orders = orig_query.split() @@ -137,6 +140,32 @@ def create_trees(config): return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict + +# def order_independent_queries(query_tree): +# all_children = query_tree['l_children'] + query_tree['r_children'] +# if all_children > 0: +# +# else: +# return query_tree +# pass + +def printable_answers(query): + # all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query) + all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query) + + # all_orders = orig_query.split() + node_actions = all_orders[::2] + # priority_actions = all_orders[1::2] + + if len(node_actions) > 1: + res = [] + for node_action in node_actions[:-1]: + res.extend(printable_answers(node_action[1:-1])) + res.extend([node_actions[-1]]) + return res + else: + return [query] + def main(): parser = argparse.ArgumentParser() @@ -156,22 +185,34 @@ def main(): ngrams = 0 if config.getint('settings', 'ngrams') == 2: ngrams = 2 - query_tree = [{"l_children": [{}]}, {"r_children": [{}]}] + query_tree = [{"l_children": [{}]}] + elif config.getint('settings', 'ngrams') == 3: + ngrams = 3 + query_tree = [{"l_children": [{}, {}]}, {"l_children": [{"l_children": [{}]}]}] + elif config.getint('settings', 'ngrams') == 4: + ngrams = 4 + query_tree = [{"l_children": [{}, {}, {}]}, {"l_children": [{"l_children": [{}, {}]}]}, {"l_children": [{"l_children": [{}]}, {}]}, {"l_children": [{"l_children": [{"l_children": [{}]}]}]}] + elif config.getint('settings', 'ngrams') == 5: + ngrams = 5 + query_tree = [{"l_children": [{}, {}, {}, {}]}, {"l_children": [{"l_children": [{}]}, {}, {}]}, {"l_children": [{"l_children": [{}, {}]}, {}]}, {"l_children": [{"l_children": [{}]}, {"l_children": [{}]}]}, + {"l_children": [{"l_children": [{"l_children": [{}]}]}, {}]}, {"l_children": [{"l_children": [{"l_children": [{}]}, {}]}]}, {"l_children": [{"l_children": [{"l_children": [{}, {}]}]}]}, + {"l_children": [{"l_children": [{"l_children": [{"l_children": [{}]}]}]}]}] else: query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')] + # order_independent_queries(query_tree) (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config) # set filters - assert config.get('settings', 'analyze_type') in ['deprel', 'lemma', 'upos', 'xpos', 'form'], '"analyze_type" is not set up correctly' - if config.get('settings', 'analyze_type') == 'deprel': + assert config.get('settings', 'node_type') in ['deprel', 'lemma', 'upos', 'xpos', 'form'], '"node_type" is not set up correctly' + if config.get('settings', 'node_type') == 'deprel': create_output_string_funct = create_output_string_deprel - elif config.get('settings', 'analyze_type') == 'lemma': + elif config.get('settings', 'node_type') == 'lemma': create_output_string_funct = create_output_string_lemma - elif config.get('settings', 'analyze_type') == 'upos': + elif config.get('settings', 'node_type') == 'upos': create_output_string_funct = create_output_string_upos - elif config.get('settings', 'analyze_type') == 'xpos': + elif config.get('settings', 'node_type') == 'xpos': create_output_string_funct = create_output_string_xpos else: create_output_string_funct = create_output_string_form @@ -184,21 +225,27 @@ def main(): # original # r_children = tree.r_children[:1] + tree.r_children[3:4] # tree.r_children = tree.r_children[:1] + tree.r_children[2:4] - _, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct) + _, _, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct) for query_results in subtrees: for result in query_results: - if ngrams: - result = sorted(result) - r = tuple(result) + # if ngrams: + # result = sorted(result) + # r = tuple(result) + r = result if r in result_dict: result_dict[r] += 1 else: result_dict[r] = 1 # test 1 layer queries - # tree.r_children = [] - # tree.l_children[1].l_children = [] - # _, subtrees = tree.get_subtrees([{'q1':'', "l_children": [{'a1':''}, {'a2':''}]}, {'q2':'', "l_children": [{'b1':''}]}, {'q3':'', "l_children": [{'c1':''}, {'c2':''}, {'c3':''}]}], []) + # # tree.r_children = [] + # # tree.l_children[1].l_children = [] + # # query = [{'l_children': [{}]}, {'r_children': [{}]}] + # # query = [{"l_children": [{}, {}]}, {"l_children": [{}]}, {"l_children": [{}, {}, {}]}] + # query = [{"l_children": [{'form': 'je'}, {}]}, {"l_children": [{'form': 'je'}]}, {"l_children": [{'form': 'je'}, {}, {}]}] + # # query = [{'q1':'', "l_children": [{'a1':''}, {'a2':''}]}, {'q2':'', "l_children": [{'b1':''}]}, {'q3':'', "l_children": [{'c1':''}, {'c2':''}, {'c3':''}]}] + # _, _, subtrees = tree.get_subtrees(query, [], create_output_string_funct) # # _, subtrees = tree.get_subtrees([{'q1':'', "l_children": [{'a1':''}, {'a2':''}], "r_children": []}, {'q2':'', "l_children": [{'b1':''}], "r_children": []}, {'q3':'', "l_children": [{'c1':''}, {'c2':''}, {'c3':''}], "r_children": []}], []) + # print('HERE!') # test 2 layer queries # tree.r_children = [Tree('je', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)] @@ -216,16 +263,18 @@ def main(): # header - use every second space as a split writer = csv.writer(f, delimiter='\t') if ngrams: - writer.writerow(['Word 1', 'Word 2', 'Number of occurences']) + len_words = ngrams else: - span = 2 - words = config.get('settings', 'query').split(" ") - header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences'] - writer.writerow(header) + len_words = len(config.get('settings', 'query').split(" ")) + span = 2 + header = ["Structure"] + ["Word #" + str(int(i/2 + 1)) for i in range(0, len_words * 2, span)] + ['Number of occurences'] + # header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences'] + writer.writerow(header) # body for k, v in sorted_list: - writer.writerow(list(k) + [str(v)]) + words_only = printable_answers(k) + writer.writerow([k] + words_only + [str(v)]) return