import copy
import string


class ResultNode(object):
    """Printable snapshot of a single matched tree node.

    Attributes:
        name_parts: list with one output string per formatter, as returned by
            ``generic.generate_name``.
        name: the single label built from those parts (also used as ``repr``).
        location: the ``architecture_order`` passed to the constructor;
            ``ResultTree`` compares it between nodes to decide whether a child
            is written before or after its parent.
        deprel: the value of ``node.deprel.get_value()``.
    """

    def __init__(self, node, architecture_order, create_output_strings):
        # Imported at call time: only construction needs the project's
        # ``generic`` helpers, the rest of this module is standard library.
        from generic import generate_name

        self.name_parts, self.name = generate_name(node, create_output_strings)
        self.location = architecture_order
        self.deprel = node.deprel.get_value()

    def __repr__(self):
        return self.name


class ResultTree(object):
    """A matched node plus the result trees of its matched children.

    The ``get_*`` methods linearise the tree: children whose location is
    smaller than this node's location are written before the node, the
    remaining children after it. Each ``get_*`` call recomputes its value and
    stores it on the instance (``key``, ``order_key``, ``order``, ``array``).

    Args:
        node: object exposing ``name``, ``name_parts``, ``location`` and
            ``deprel`` (for example a ``ResultNode``).
        children: list of ``ResultTree`` objects; an empty list marks a leaf.
        filters: mapping; ``filters['dependency_type']`` decides whether the
            child's ``deprel`` is written into the separators.
    """

    def __init__(self, node, children, filters):
        self.node = node
        self.children = children
        self.filters = filters
        # Values stored by the get_* methods; None until first computed.
        self.key = None
        self.order_key = None
        self.order = None
        self.array = None
        # Separators recorded by the flat merge helpers below. Shallow copies
        # share this list, so those helpers rebind it instead of mutating it.
        self.separators = []

    def __repr__(self):
        return self.get_key()

    def set_children(self, children):
        """Replace the list of child trees."""
        self.children = children

    # ------------------------------------------------------------------
    # Linearisation helpers shared by the four get_* methods.
    # ------------------------------------------------------------------
    def _linear_sequence(self):
        """Return the children in their given order with ``self`` inserted.

        ``self`` is placed right before the first child that is not located
        to its left, or at the end when every child is to the left.
        """
        sequence = []
        own_placed = False
        for child in self.children or ():
            if not own_placed and not child.node.location < self.node.location:
                sequence.append(self)
                own_placed = True
            sequence.append(child)
        if not own_placed:
            sequence.append(self)
        return sequence

    def _separator(self, child, arrow):
        """Build ``' < '`` / ``' > '``, with the child's deprel if requested."""
        if self.filters['dependency_type']:
            return ' ' + arrow + child.node.deprel + ' '
        return ' ' + arrow + ' '

    def _render(self, own_label, getter_name):
        """Render the subtree as a string; ``getter_name`` renders children."""
        if not self.children:
            # Leaves are written bare, without surrounding parentheses.
            return own_label
        pieces = []
        for item in self._linear_sequence():
            if item is self:
                pieces.append(own_label)
            elif item.node.location < self.node.location:
                pieces.append(getattr(item, getter_name)() + self._separator(item, '<'))
            else:
                pieces.append(self._separator(item, '>') + getattr(item, getter_name)())
        return '(' + ''.join(pieces) + ')'

    def _collect(self, own_item, getter_name):
        """Flatten the subtree into a list; ``getter_name`` flattens children."""
        collected = []
        for item in self._linear_sequence():
            if item is self:
                collected.append(own_item)
            else:
                collected.extend(getattr(item, getter_name)())
        return collected

    # ------------------------------------------------------------------
    # Public linearisations.
    # ------------------------------------------------------------------
    def get_key(self):
        """Return (and store in ``key``) the bracketed string of node names."""
        self.key = self._render(self.node.name, 'get_key')
        return self.key

    def get_order_key(self):
        """Return (and store in ``order_key``) the bracketed locations."""
        self.order_key = self._render(str(self.node.location), 'get_order_key')
        return self.order_key

    def get_order(self):
        """Return (and store in ``order``) the locations in written order."""
        self.order = self._collect(self.node.location, 'get_order')
        return self.order

    def get_array(self):
        """Return (and store in ``array``) the ``name_parts`` in written order."""
        self.array = self._collect(self.node.name_parts, 'get_array')
        return self.array

    # ------------------------------------------------------------------
    # Flat merge helpers (operate on the stored strings/lists).
    # ------------------------------------------------------------------
    @staticmethod
    def _snapshot(tree):
        """Shallow-copy ``tree`` and fill any stored value that is still unset.

        The flat helpers concatenate ``key``/``order_key``/``order``/``array``;
        computing the missing ones first keeps them from failing on ``None``.
        """
        clone = copy.copy(tree)
        if getattr(clone, 'key', None) is None:
            clone.get_key()
        if getattr(clone, 'order_key', None) is None:
            clone.get_order_key()
        if getattr(clone, 'order', None) is None:
            clone.get_order()
        if getattr(clone, 'array', None) is None:
            clone.get_array()
        if getattr(clone, 'separators', None) is None:
            clone.separators = []
        return clone

    def add_separator(self, separator, left=True):
        """Return a copy with ``separator`` appended (``left``) or prepended.

        The receiver is left untouched; in particular its ``separators`` list
        is not mutated through the shallow copy.
        """
        result = self._snapshot(self)
        if left:
            result.separators = result.separators + [separator]
            result.key = result.key + separator
            result.order_key = result.order_key + separator
        else:
            result.separators = [separator] + result.separators
            result.key = separator + result.key
            result.order_key = separator + result.order_key
        return result

    def merge_results(self, right_t, separator, left=True):
        """Return a copy of ``self`` concatenated with ``right_t``.

        A truthy ``separator`` is written after both parts when ``left`` is
        true and between them otherwise; a falsy one is not written at all.
        """
        merged = self._snapshot(self)
        other = self._snapshot(right_t)
        middle = separator if separator and not left else ''
        tail = separator if separator and left else ''

        merged.key = merged.key + middle + other.key + tail
        merged.order_key = merged.order_key + middle + other.order_key + tail
        merged.array = merged.array + other.array
        merged.order = merged.order + other.order
        merged.separators = (
            merged.separators
            + ([middle] if middle else [])
            + other.separators
            + ([tail] if tail else [])
        )
        return merged

    def extend_answer(self, other_answer, separator):
        """Append ``other_answer`` to this result in place, joined by ``separator``."""
        mine = self._snapshot(self)
        theirs = self._snapshot(other_answer)
        # Rebind instead of extending so lists shared with shallow copies of
        # this object are not modified behind their back.
        self.array = mine.array + theirs.array
        self.order = mine.order + theirs.order
        self.key = mine.key + separator + theirs.key
        self.order_key = mine.order_key + separator + theirs.order_key
        # The separator is one item; list.extend() would split a string into
        # its individual characters.
        self.separators = mine.separators + [separator]

    def put_in_bracelets(self, inplace=False):
        """Wrap ``key`` and ``order_key`` in parentheses.

        Returns a modified copy, or ``None`` after modifying ``self`` when
        ``inplace`` is true.
        """
        ready = self._snapshot(self)
        if inplace:
            self.key = '(' + ready.key + ')'
            self.order_key = '(' + ready.order_key + ')'
            return
        ready.key = '(' + ready.key + ')'
        ready.order_key = '(' + ready.order_key + ')'
        return ready

    def finalize_result(self):
        """Return a copy whose ``order`` is the relative word order in letters.

        The node with the smallest location gets ``'A'``, the next ``'B'`` and
        so on; the letters are written in the tree's linearised order. Equal
        locations keep their linearised order.

        Raises:
            IndexError: if the tree has more than 26 nodes, because only the
                letters A-Z are available.
        """
        result = copy.copy(self)
        result.key = result.get_key()

        order = result.get_order()
        # Rank the positions by location. sorted() is stable, so ties resolve
        # to the earlier position, and no sentinel value is needed that a
        # real location could collide with.
        ranked_positions = sorted(range(len(order)), key=order.__getitem__)
        order_letters = [''] * len(order)
        for rank, position in enumerate(ranked_positions):
            order_letters[position] = string.ascii_uppercase[rank]
        result.order = ''.join(order_letters)
        return result
filters)) + # i_answer += 1 + # else: + # merged_partial_subtrees.append([Result(self, self.index, create_output_string)]) + # else: + # if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]: + # merged_partial_subtrees.append( + # self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters)) + # i_answer += 1 + # else: + # merged_partial_subtrees.append([Result(self, self.index, create_output_string)]) + if i_query < len(active_permanent_query_trees): if 'children' in active_permanent_query_trees[i_query]: - # if not filters['node_order'] or i_child < self.children_split: - # merged_partial_subtrees.append( - # self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) - # merged_partial_subtrees_architecture.append( - # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) - merged_partial_subtrees.append( - self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters)) - + self.create_output_children(partial_subtrees[i_answer], [ResultTree(node, [], filters)], filters)) i_answer += 1 else: - merged_partial_subtrees.append([Result(self, self.index, create_output_string)]) - # merged_partial_subtrees.append([create_output_string(self)]) - # merged_partial_subtrees_architecture.append([str([self.index])]) - + merged_partial_subtrees.append([ResultTree(node, [], filters)]) else: if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]: - # if not filters['node_order'] or i_child < self.children_split: - # merged_partial_subtrees.append( - # self.create_output_children(partial_subtrees[i_answer], [create_output_string(self)], filters, partial_subtrees_index[i_answer], 
@staticmethod
def create_children_groups(left_parts, right_parts):
    """Pair every group in ``left_parts`` with every group in ``right_parts``.

    Each pairing is a shallow copy of the left group extended with the right
    group; pairings are ordered left-major (all right groups for the first
    left group, then for the second, and so on). When either argument is
    empty the other one is returned unchanged.
    """
    if not (left_parts and right_parts):
        # Nothing to combine: hand back whichever side has content
        # (or ``right_parts`` when both are empty).
        return left_parts if left_parts else right_parts

    def _joined(head, tail):
        # Copy first so the caller's left group is never modified.
        combined = copy(head)
        combined.extend(tail)
        return combined

    return [_joined(head, tail) for head in left_parts for tail in right_parts]
def merge_results3(self, child, new_results, filters):
    """Attach every combination of child answers to every result.

    Args:
        child: list of answers, one per matched child; each answer is a list
            of alternative result trees for that child.
        new_results: results for the current node; each one is shallow-copied
            once per combination and given that combination as its children.
        filters: mapping; when ``filters['node_order']`` is falsy the answers
            are first sorted by the key of their first alternative.

    Returns:
        A list with one copied result per (result, combination) pair; empty
        when ``child`` yields no combination.
    """
    if filters['node_order']:
        ordered_answers = child
    else:
        ordered_answers = sorted(child, key=lambda answer: answer[0].get_key())

    # Grow the combinations one answer at a time: every existing group is
    # paired with every alternative of the next answer.
    children_groups = []
    for answer in ordered_answers:
        children_groups = self.create_children_groups(
            children_groups, [[answer_part] for answer_part in answer])

    results = []
    for result in new_results:
        for children in children_groups:
            new_result = copy(result)
            new_result.set_children(children)
            # Return value intentionally unused: ResultTree.get_order()
            # also stores the order on the object, so the call is kept.
            new_result.get_order()
            results.append(new_result)

    return results
def generate_name(node, create_output_strings, print_lemma=True):
    """Build the output parts and the display name of ``node``.

    Args:
        node: the node handed to every formatter.
        create_output_strings: list of callables, each mapping ``node`` to one
            output string.
        print_lemma: when true and ``create_output_string_lemma`` is among the
            formatters, its value is prefixed with ``'L='`` in the name.

    Returns:
        ``(array, name)``: ``array`` holds the raw formatter outputs in order;
        ``name`` is the single value when there is exactly one formatter,
        otherwise the values joined with ``'&'`` (``''`` for no formatters).
    """
    # Run every formatter exactly once and reuse the values for the name.
    array = [create_output_string(node) for create_output_string in create_output_strings]

    if print_lemma and create_output_string_lemma in create_output_strings:
        name_array = [
            'L=' + value if create_output_string == create_output_string_lemma else value
            for create_output_string, value in zip(create_output_strings, array)
        ]
    else:
        name_array = array

    if len(name_array) == 1:
        # A single value is returned as-is, without going through join().
        return array, name_array[0]
    # join() also covers the empty case, which used to raise IndexError.
    return array, '&'.join(name_array)