From d34f429d05dedf2decc2b012c36c5a44be779adb Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 29 Nov 2019 10:37:48 +0100 Subject: [PATCH] Adding results class to code --- Result.py | 83 +++++++++++++++++- Tree.py | 187 ++++++++++++++++++++++------------------ dependency-parsetree.py | 38 +++++--- 3 files changed, 208 insertions(+), 100 deletions(-) diff --git a/Result.py b/Result.py index eea2d2e..ee3c788 100644 --- a/Result.py +++ b/Result.py @@ -1,9 +1,86 @@ +import copy + class Result(object): - def __init__(self, string, order): + def __init__(self, string, architecture_order): self.key = string - self.key_split = [string] + self.order_key = str([architecture_order]) + self.array = [string] # order with original numbers in sentences - self.build_order = [order] + # self.order = str([architecture_order]) # order with numbers from 0 to n of n-gram self.final_order = '' + self.separators = [] + + def __repr__(self): + return self.key + + def add(self, string, architecture_order, separator, is_left): + if is_left: + self.array = [string] + self.array + # self.order = [architecture_order] + self.order + self.separators = [separator] + self.separators + self.key = string + ' ' + separator + ' ' + self.key + self.order_key = architecture_order + ' ' + separator + ' ' + self.order_key + + else: + self.array += [string] + # self.order += [architecture_order] + self.separators += [separator] + + self.key += ' ' + separator + ' ' + string + self.order_key += ' ' + separator + ' ' + architecture_order + + def add_separator(self, separator, left=True): + self_copy = copy.copy(self) + if left: + self_copy.separators += [separator] + self_copy.key += separator + self_copy.order_key += separator + else: + self_copy.separators = [separator] + self_copy.separators + self_copy.key = separator + self_copy.key + self_copy.order_key = separator + self_copy.order_key + return self_copy + + def merge_results(self, right_t, separator, left=True): + left_tree = copy.copy(self) + right_tree = copy.copy(right_t) + + if separator: + if left: + # merged_results.append(left_part + right_part + separator) + left_tree.key = left_tree.key + right_tree.key + separator + left_tree.order_key = left_tree.order_key + right_tree.order_key + separator + left_tree.array = left_tree.array + right_tree.array + # left_tree.order = str([architecture_order]) + left_tree.separators = left_tree.separators + right_tree.separators + [separator] + else: + # merged_results.append(left_part + separator + right_part) + left_tree.key = left_tree.key + separator + right_tree.key + left_tree.order_key = left_tree.order_key + separator + right_tree.order_key + left_tree.array = left_tree.array + right_tree.array + # left_tree.order = str([architecture_order]) + left_tree.separators = left_tree.separators + [separator] + right_tree.separators + else: + # merged_results.append(left_part + right_part) + left_tree.key = left_tree.key + right_tree.key + left_tree.order_key = left_tree.order_key + right_tree.order_key + left_tree.array = left_tree.array + right_tree.array + # left_tree.order = str([architecture_order]) + left_tree.separators = left_tree.separators + right_tree.separators + + return left_tree + + def put_in_bracelets(self): + result = copy.copy(self) + result.key = ('(' + result.key + ')') + result.order_key = ('(' + result.order_key + ')') + return result + + def finalize_result(self): + result = copy.copy(self) + result.key = result.key[1:-1] + # result.order_key = result.order_key[1:-1] + # TODO When tree is finalized create relative word order (alphabet)! + return result diff --git a/Tree.py b/Tree.py index 958fbd3..8a365b2 100644 --- a/Tree.py +++ b/Tree.py @@ -3,11 +3,12 @@ from copy import copy from pyconll.unit import Token +from Result import Result from Value import Value class Tree(object): - def __init__(self, form, lemma, upos, xpos, deprel, feats, feats_detailed, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, head): + def __init__(self, index, form, lemma, upos, xpos, deprel, feats, feats_detailed, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, head): if not hasattr(self, 'feats'): self.feats_detailed = {} @@ -42,13 +43,13 @@ class Tree(object): self.children = [] self.children_split = -1 - self.index = 0 + self.index = index # for caching answers to questions self.cache = {} def add_child(self, child): - child.index = len(self.children) + # child.index = len(self.children) self.children.append(child) def set_parent(self, parent): @@ -268,10 +269,11 @@ class Tree(object): # partial_results_dict, partial_subtrees) for i in range(len(new_complete_answers)): + # TODO add order rearagement (TO KEY) complete_answers[i].extend(new_complete_answers[i]) - # if create_output_string_form(self) == 'vožnji': - # print('HERE!@@!') + if create_output_string_form(self) == 'Dogodek': + print('HERE!@@!') # if create_output_string_form(self) == 'vpiti': # print('HERE!@@!') # merge answers in appropriate way @@ -317,29 +319,36 @@ class Tree(object): if i_query < len(active_permanent_query_trees): if 'children' in active_permanent_query_trees[i_query]: # if not filters['node_order'] or i_child < self.children_split: + # merged_partial_subtrees.append( + # self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + # merged_partial_subtrees_architecture.append( + # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + merged_partial_subtrees.append( - self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) - merged_partial_subtrees_architecture.append( - self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + self.create_output_children(partial_subtrees[i_answer], [Result(create_output_string(self), self.index)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) i_answer += 1 else: - merged_partial_subtrees.append([create_output_string(self)]) - merged_partial_subtrees_architecture.append([str([self.index])]) - # merged_partial_subtrees.append([[create_output_string(self)]]) + merged_partial_subtrees.append([Result(create_output_string(self), self.index)]) + # merged_partial_subtrees.append([create_output_string(self)]) + # merged_partial_subtrees_architecture.append([str([self.index])]) + else: if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]: # if not filters['node_order'] or i_child < self.children_split: + # merged_partial_subtrees.append( + # self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + # merged_partial_subtrees_architecture.append( + # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + merged_partial_subtrees.append( - self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) - merged_partial_subtrees_architecture.append( - self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + self.create_output_children(partial_subtrees[i_answer], [Result(create_output_string(self), self.index)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) i_answer += 1 else: - merged_partial_subtrees.append([create_output_string(self)]) - merged_partial_subtrees_architecture.append([str([self.index])]) - # merged_partial_subtrees.append([[create_output_string(self)]]) + merged_partial_subtrees.append([Result(create_output_string(self), self.index)]) + # merged_partial_subtrees.append([create_output_string(self)]) + # merged_partial_subtrees_architecture.append([str([self.index])]) return i_answer @@ -395,8 +404,9 @@ class Tree(object): i_question += 1 for i in range(len(active_permanent_query_trees)): + # TODO FINALIZE RESULT # erase first and last braclets when adding new query result - add_subtree = [subtree[1:-1] for subtree in merged_partial_answers[i]] + add_subtree = [subtree.finalize_result() for subtree in merged_partial_answers[i]] # if 0 < len(active_permanent_query_trees): complete_answers[i].extend(add_subtree) # completed_subtrees[i].extend(merged_partial_subtrees[i]) @@ -405,7 +415,7 @@ class Tree(object): partial_answers_architecture = [[] for i in range(len(temporary_query_trees))] partial_answers = [[] for i in range(len(temporary_query_trees))] for inside_i, outside_i in enumerate(successful_temporary_queries): - partial_answers_architecture[outside_i] = merged_partial_answers_architecture[len(active_permanent_query_trees) + inside_i] + # partial_answers_architecture[outside_i] = merged_partial_answers_architecture[len(active_permanent_query_trees) + inside_i] partial_answers[outside_i] = merged_partial_answers[ len(active_permanent_query_trees) + inside_i] @@ -416,23 +426,28 @@ class Tree(object): @staticmethod def merge_results(left_parts, right_parts, separator, left=True): if not left_parts: - # return right_parts - if left: - return [r_p + separator for r_p in right_parts] - else: - return [separator + r_p for r_p in right_parts] + # return all right_parts + return [r_p.add_separator(separator, left) for r_p in right_parts] + # if left: + # return [r_p + separator for r_p in right_parts] + # # return [r_p.add_separator(separator, left) for r_p in right_parts] + # else: + # return [separator + r_p for r_p in right_parts] + if not right_parts: - return [separator + l_p for l_p in left_parts] + return [l_p.add_separator(separator, False) for l_p in left_parts] + # return [separator + l_p for l_p in left_parts] merged_results = [] for left_part in left_parts: for right_part in right_parts: - if separator: - if left: - merged_results.append(left_part + right_part + separator) - else: - merged_results.append(left_part + separator + right_part) - else: - merged_results.append(left_part + right_part) + merged_results.append(left_part.merge_results(right_part, separator)) + # if separator: + # if left: + # merged_results.append(left_part + right_part + separator) + # else: + # merged_results.append(left_part + separator + right_part) + # else: + # merged_results.append(left_part + right_part) return merged_results @staticmethod @@ -458,50 +473,46 @@ class Tree(object): # print('HERE!@@!') # if create_output_string_form(self) == 'utišal': # print('HERE!@@!') - + # if len(new_results) > 1: + # print('HERE') merged_results = [] for i_child, child in enumerate(children): - for i_new_result, new_result in enumerate(new_results): - l_res = [] - r_res = [] - if type(child) == str: - # res += '(' + child + ') < ' - print('ERROR!?!?!') - if not filters['node_order'] or indices[i_child][i_new_result] < self.children_split: - l_res += child + ' < ' + l_res = [] + r_res = [] + if filters['node_order']: + new_child = child + else: + # a = [['tistega', 'dne'], ['sem', 'bil']] + # b = sorted(a) + # TODO CHECK IF THIS WORKS FOR CERTIAN + new_child = sorted(child, key=lambda x: x[0].key) + for i_answer, answer in enumerate(new_child): + # res += '(' + el + ') < ' + if not filters['node_order'] or indices[i_child][i_answer] < self.children_split: + if filters['dependency_type']: + separator = ' <' + deprel[i_child][i_answer] + ' ' else: - r_res += ' > ' + child + separator = ' < ' + l_res = self.merge_results(l_res, answer, separator, left=True) + # l_res += answer + separator else: - if filters['node_order']: - new_child = child + if filters['dependency_type']: + separator = ' >' + deprel[i_child][i_answer] + ' ' else: - new_child = sorted(child) - for i_answer, answer in enumerate(new_child): - # res += '(' + el + ') < ' - if not filters['node_order'] or indices[i_child][i_answer] < self.children_split: - if filters['dependency_type']: - separator = ' <' + deprel[i_child][i_answer] + ' ' - else: - separator = ' < ' - l_res = self.merge_results(l_res, answer, separator, left=True) - # l_res += answer + separator - else: - if filters['dependency_type']: - separator = ' >' + deprel[i_child][i_answer] + ' ' - else: - separator = ' > ' - r_res = self.merge_results(r_res, answer, separator, left=False) - # r_res += separator + answer - if l_res: - l_res_combined = self.merge_results(l_res, new_results, None) - if r_res: - r_res_combined = self.merge_results(l_res_combined, r_res, None) - merged_results.extend(['(' + el + ')' for el in r_res_combined]) - else: - merged_results.extend(['(' + el + ')' for el in l_res_combined]) - elif r_res: - r_res_combined = self.merge_results(new_results, r_res, None) - merged_results.extend(['(' + el + ')' for el in r_res_combined]) + separator = ' > ' + r_res = self.merge_results(r_res, answer, separator, left=False) + # r_res += separator + answer + if l_res: + l_res_combined = self.merge_results(l_res, new_results, None) + if r_res: + r_res_combined = self.merge_results(l_res_combined, r_res, None) + # merged_results.extend(['(' + el + ')' for el in r_res_combined]) + merged_results.extend([el.put_in_bracelets() for el in r_res_combined]) + else: + merged_results.extend([el.put_in_bracelets() for el in l_res_combined]) + elif r_res: + r_res_combined = self.merge_results(new_results, r_res, None) + merged_results.extend(['(' + el + ')' for el in r_res_combined]) # merged_results.append('(' + l_res + new_result + r_res + ')') @@ -553,11 +564,11 @@ class Tree(object): # TODO # node_order = False partly_built_trees = [[None] * answer_length] - partly_built_trees_architecture = [[None] * answer_length] + # partly_built_trees_architecture = [[None] * answer_length] partly_built_trees_architecture_indices = [[None] * answer_length] partly_built_trees_deprel = [[None] * answer_length] built_trees = [] - built_trees_architecture = [] + # built_trees_architecture = [] built_trees_architecture_indices = [] built_trees_deprel = [] @@ -568,7 +579,7 @@ class Tree(object): # child are added for child_i in range(len(separated_answers[0])): new_partly_built_trees = [] - new_partly_built_trees_architecture = [] + # new_partly_built_trees_architecture = [] new_partly_built_trees_architecture_indices = [] new_partly_built_trees_deprel = [] # iterate over answers parts @@ -580,11 +591,11 @@ class Tree(object): # if tree_part[answer_part_i] equals None add new element in its place if not tree_part[answer_part_i]: new_tree_part = copy(tree_part) - new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i]) + # new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i]) new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i]) new_tree_part_deprel = copy(partly_built_trees_deprel[tree_part_i]) new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i] - new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i] + # new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i] new_tree_part_architecture_indices[answer_part_i] = child_i new_tree_part_deprel[answer_part_i] = separated_answers_deprel[answer_part_i][child_i] completed_tree_part = True @@ -593,12 +604,12 @@ class Tree(object): completed_tree_part = False if completed_tree_part: built_trees.append(new_tree_part) - built_trees_architecture.append(new_tree_part_architecture) + # built_trees_architecture.append(new_tree_part_architecture) built_trees_architecture_indices.append(new_tree_part_architecture_indices) built_trees_deprel.append(new_tree_part_deprel) else: new_partly_built_trees.append(new_tree_part) - new_partly_built_trees_architecture.append(new_tree_part_architecture) + # new_partly_built_trees_architecture.append(new_tree_part_architecture) new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices) new_partly_built_trees_deprel.append(new_tree_part_deprel) else: @@ -607,7 +618,7 @@ class Tree(object): # print('HERE!!!') partly_built_trees.extend(new_partly_built_trees) - partly_built_trees_architecture.extend(new_partly_built_trees_architecture) + # partly_built_trees_architecture.extend(new_partly_built_trees_architecture) partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices) partly_built_trees_deprel.extend(new_partly_built_trees_deprel) @@ -615,19 +626,23 @@ class Tree(object): if built_trees: # sort 3 arrays by architecture indices - temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip( - *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel)))) + # temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip( + # *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel)))) + temp_trees_index, temp_trees, temp_trees_deprel = (list(t) for t in zip( + *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_deprel)))) # order outputs and erase duplicates # for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices): - for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel): - new_tree_index, new_tree, new_tree_architecture, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel)))) + # for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel): + for tree, tree_index, tree_deprel in zip(temp_trees, temp_trees_index, temp_trees_deprel): + # new_tree_index, new_tree, new_tree_architecture, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel)))) + new_tree_index, new_tree, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_deprel)))) # TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append! is_unique = True for unique_tree in unique_trees_architecture: already_in = True for part_i in range(len(unique_tree)): - if unique_tree[part_i] != new_tree_architecture[part_i]: + if unique_tree[part_i].order_key != new_tree[part_i].order_key: already_in = False break if already_in: @@ -635,9 +650,9 @@ class Tree(object): break if is_unique: - unique_trees_architecture.append(new_tree_architecture) + unique_trees_architecture.append(new_tree) # if not filters['node_order']: - l_ordered_built_trees_architecture.append(new_tree_architecture) + # l_ordered_built_trees_architecture.append(new_tree_architecture) l_ordered_built_trees.append(new_tree) l_ordered_built_trees_index.append(new_tree_index) l_ordered_built_trees_deprel.append(new_tree_deprel) diff --git a/dependency-parsetree.py b/dependency-parsetree.py index 331c0ee..aedb993 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -139,9 +139,12 @@ def create_trees(config): # for k, v in token.feats.items(): # token_feats += k + next(iter(v)) + '|' # token_feats = token_feats[:-1] + if not token.id.isdigit(): + continue + # TODO check if 5th place is always there for feats feats = token._fields[5] - node = Tree(token.form, token.lemma, token.upos, token.xpos, token.deprel, feats, token.feats, form_dict, + node = Tree(int(token.id), token.form, token.lemma, token.upos, token.xpos, token.deprel, feats, token.feats, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, token.head) token_nodes.append(node) if token.deprel == 'root': @@ -438,26 +441,39 @@ def main(): # 1.02 s (16 cores) if cpu_cores > 1: - all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_funct, filters) for tree in all_trees]) + all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_funct, filters) for tree in all_trees[5170:]]) - for subtrees in all_subtrees: + # for subtrees in all_subtrees: + for tree_i, subtrees in enumerate(all_subtrees): for query_results in subtrees: for r in query_results: + # if r == '(" < , < je < velik) < tem': + # print(tree_i) + # if r in result_dict: + # result_dict[r] += 1 + # else: + # result_dict[r] = 1 if r in result_dict: - result_dict[r] += 1 + result_dict[r]['number'] += 1 else: - result_dict[r] = 1 + result_dict[r] = {'object': r, 'number': 1} # 3.65 s (1 core) else: - for tree in all_trees: + # for tree_i, tree in enumerate(all_trees[-5:]): + for tree_i, tree in enumerate(all_trees[1:]): + # text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje. + # for tree_i, tree in enumerate(all_trees[5170:]): + # for tree in all_trees: subtrees = tree_calculations((tree, query_tree, create_output_string_funct, filters)) for query_results in subtrees: for r in query_results: + # if r == '(" < , < je < velik) < tem': + # print(tree_i) if r in result_dict: - result_dict[r] += 1 + result_dict[r]['number'] += 1 else: - result_dict[r] = 1 + result_dict[r] = {'object': r, 'number': 1} print("Execution time:") print("--- %s seconds ---" % (time.time() - start_exe_time)) @@ -481,7 +497,7 @@ def main(): # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}]}]}], []) # # _, subtrees = new_tree.get_subtrees( # # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], []) - sorted_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True) + sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True) with open(config.get('settings', 'output'), "w", newline="") as f: # header - use every second space as a split @@ -496,8 +512,8 @@ def main(): # body for k, v in sorted_list: - words_only = printable_answers(k) - writer.writerow([k] + words_only + [str(v)]) + words_only = printable_answers(k.key) + writer.writerow([k.key] + words_only + [str(v['number'])]) return "Done"