Browse Source

Cleaning repo

master
Luka 3 years ago
parent
commit
421f12cac6
  1. 9
      ResultNode.py
  2. 97
      ResultTree.py
  3. 353
      Tree.py
  4. 152
      dependency-parsetree.py
  5. 3
      generic.py

9
ResultNode.py

@ -18,17 +18,8 @@ from generic import generate_key, generate_name
class ResultNode(object):
    """Result-side record built from a tree node.

    Stores the generated name (and its parts), the position passed in by
    the caller and the node's dependency relation value.
    """

    def __init__(self, node, architecture_order, create_output_strings):
        """Populate the record from ``node``.

        :param node: object exposing ``deprel.get_value()``; it is also
            handed to ``generate_name``.
        :param architecture_order: value stored unchanged as ``location``.
        :param create_output_strings: forwarded to ``generate_name``.
        """
        generated = generate_name(node, create_output_strings)
        self.name_parts, self.name = generated
        self.location = architecture_order
        deprel_value = node.deprel.get_value()
        self.deprel = deprel_value

    def __repr__(self):
        """Represent the record by its generated name."""
        return self.name

97
ResultTree.py

@ -15,9 +15,6 @@
import copy
import string
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class ResultTree(object):
def __init__(self, node, children, filters):
@ -51,7 +48,6 @@ class ResultTree(object):
for child in children:
if self.filters['node_order'] and child.node.location < self.node.location:
if self.filters['dependency_type']:
# separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + child.node.deprel + ' '
else:
separator = ' < '
@ -104,7 +100,6 @@ class ResultTree(object):
for child in self.children:
if self.filters['node_order'] and child.node.location < self.node.location:
if self.filters['dependency_type']:
# separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + child.node.deprel + ' '
else:
separator = ' < '
@ -169,93 +164,9 @@ class ResultTree(object):
self.array = [self.node.name_parts]
return self.array
# def add(self, string, architecture_order, separator, is_left):
# if is_left:
# self.array = [string] + self.array
# self.order = [architecture_order] + self.order
# # self.order = [architecture_order] + self.order
# self.separators = [separator] + self.separators
# self.key = string + ' ' + separator + ' ' + self.key
# self.order_key = architecture_order + ' ' + separator + ' ' + self.order_key
#
# else:
# self.array += [string]
# self.order += [architecture_order]
# # self.order += [architecture_order]
# self.separators += [separator]
#
# self.key += ' ' + separator + ' ' + string
# self.order_key += ' ' + separator + ' ' + architecture_order
# def add_separator(self, separator, left=True):
# self_copy = copy.copy(self)
# if left:
# self_copy.separators += [separator]
# self_copy.key += separator
# self_copy.order_key += separator
# else:
# self_copy.separators = [separator] + self_copy.separators
# self_copy.key = separator + self_copy.key
# self_copy.order_key = separator + self_copy.order_key
# return self_copy
# def merge_results2(self):
# def merge_results(self, right_t, separator, left=True):
# left_tree = copy.copy(self)
# right_tree = copy.copy(right_t)
#
# if separator:
# if left:
# # merged_results.append(left_part + right_part + separator)
# left_tree.key = left_tree.key + right_tree.key + separator
# left_tree.order_key = left_tree.order_key + right_tree.order_key + separator
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + right_tree.separators + [separator]
# else:
# # merged_results.append(left_part + separator + right_part)
# left_tree.key = left_tree.key + separator + right_tree.key
# left_tree.order_key = left_tree.order_key + separator + right_tree.order_key
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + [separator] + right_tree.separators
# else:
# # merged_results.append(left_part + right_part)
# left_tree.key = left_tree.key + right_tree.key
# left_tree.order_key = left_tree.order_key + right_tree.order_key
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + right_tree.separators
#
# return left_tree
# def extend_answer(self, other_answer, separator):
# self.array.extend(other_answer.array)
# self.order.extend(other_answer.order)
# self.key += separator + other_answer.key
# self.order_key += separator + other_answer.order_key
# self.separators.extend(separator)
# def put_in_bracelets(self, inplace=False):
# if inplace:
# self.key = ('(' + self.key + ')')
# self.order_key = ('(' + self.order_key + ')')
# return
# result = copy.copy(self)
# result.key = ('(' + result.key + ')')
# result.order_key = ('(' + result.order_key + ')')
# return result
def finalize_result(self):
result = copy.copy(self)
result.reset_params()
# result.key = result.get_key()
# result.set_root()
# create order letters
order = result.get_order()
@ -265,13 +176,5 @@ class ResultTree(object):
order[ind] = 10000
order_letters[ind] = string.ascii_uppercase[i]
result.order = ''.join(order_letters)
# result.order_key = result.order_key[1:-1]
# TODO When tree is finalized create relative word order (alphabet)!
return result
# def set_root(self):
# if len(self.array[0]) > 1:
# self.root = '&'.join(self.array[0])
# else:
# # output_string = create_output_strings[0](node)
# self.root = self.array[0][0]

353
Tree.py

@ -4,8 +4,7 @@ from copy import copy
from ResultNode import ResultNode
from ResultTree import ResultTree
from Value import Value
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
from generic import generate_key
class Tree(object):
@ -13,7 +12,6 @@ class Tree(object):
if not hasattr(self, 'feats'):
self.feats_detailed = {}
# form_unicode = str(form).encode("utf-8")
if form not in form_dict:
form_dict[form] = Value(form)
self.form = form_dict[form]
@ -40,7 +38,6 @@ class Tree(object):
if not feat in self.feats_detailed:
self.feats_detailed[feat] = {}
self.feats_detailed[feat][next(iter(feats_detailed[feat]))] = feats_detailed_dict[feat][next(iter(feats_detailed[feat]))]
# self.position = position
self.parent = head
self.children = []
@ -52,7 +49,6 @@ class Tree(object):
self.cache = {}
def add_child(self, child):
    """Append ``child`` to the end of this node's ``children`` list."""
    siblings = self.children
    siblings.append(child)
def set_parent(self, parent):
@ -68,7 +64,6 @@ class Tree(object):
return True
def fits_permanent_requirements(self, filters):
main_attributes = ['deprel', 'feats', 'form', 'lemma', 'upos']
@ -121,8 +116,6 @@ class Tree(object):
if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0:
if len(all_query_indices[result_index][0]) > result_part_index + 1:
new_queries.append((result_part_index + 1, result_index, is_permanent))
# else:
# completed_subtrees.append((child, result_index))
child_queries_metadata = new_queries
@ -142,76 +135,11 @@ class Tree(object):
def add_subtrees(self, old_subtree, new_subtree):
    """Extend ``old_subtree`` in place with the items of ``new_subtree``."""
    grow = old_subtree.extend
    grow(new_subtree)
# def get_results(self, partial_results_dict, result_index, result_part, outcome, last_result_part):
# # save results for later usage
#
# # if result index already in and element 0 exists (otherwise error)
# if result_index in partial_results_dict and 0 in partial_results_dict[result_index]:
# if result_part - 1 in partial_results_dict[result_index]:
# if result_part in partial_results_dict[result_index]:
# partial_results_dict[result_index][result_part].extend(self.merge_results(partial_results_dict[result_index][result_part - 1], outcome))
# else:
# partial_results_dict[result_index][result_part] = self.merge_results(partial_results_dict[result_index][result_part - 1], outcome)
#
# # extend one word layer with output
# else:
# partial_results_dict[result_index][0].extend(outcome)
# else:
# partial_results_dict[result_index] = {0: outcome}
#
# if last_result_part - 1 in partial_results_dict[result_index]:
# return partial_results_dict[result_index].pop(last_result_part - 1)
# return []
# def group_results(self, new_partial_subtrees, child_queries_metadata, all_query_indices, partial_results_dict, partial_subtrees):
# for outcome, (result_part, result_index, is_permanent) in zip(new_partial_subtrees, child_queries_metadata):
# if outcome:
# new_results = self.get_results(partial_results_dict, result_index, result_part, outcome, len(all_query_indices[result_index][0]))
# if new_results:
# self.add_subtrees(partial_subtrees[result_index], new_results)
# else:
# if not is_permanent:
# partial_subtrees[result_index].append([])
# def get_all_query_indices_old(self, temporary_query_trees_size, completed_subtrees_size, permanent_query_trees, l_all_query_indices, children, create_output_string):
# partial_subtrees = [[] for i in range(completed_subtrees_size + temporary_query_trees_size)]
# completed_subtrees = [[] for i in range(completed_subtrees_size)]
#
# # list of pairs (index of query in group, group of query)
# partial_results_dict = {}
#
# children_queries_generator = self.generate_children_queries(l_all_query_indices, children)
#
# child_index = 0
# child, child_queries, child_queries_metadata = next(children_queries_generator)
# while child:
# # obtain children results
# new_partial_subtrees, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries, create_output_string)
#
# self.group_results(new_partial_subtrees, child_queries_metadata, l_all_query_indices,
# partial_results_dict, partial_subtrees)
#
# for i in range(len(new_completed_subtrees)):
# completed_subtrees[i].extend(new_completed_subtrees[i])
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1
#
# return partial_subtrees, completed_subtrees
def get_all_query_indices(self, temporary_query_nb, permanent_query_nb, permanent_query_trees, all_query_indices, children, create_output_string, filters):
# l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees),
# len(permanent_query_trees),
# permanent_query_trees,
# l_all_query_indices, self.l_children,
# create_output_string)
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
complete_answers = [[] for i in range(permanent_query_nb)]
# list of pairs (index of query in group, group of query)
partial_results_dict = {}
# TODO try to erase!!!
child_queries = [all_query_indice[0] for all_query_indice in all_query_indices]
@ -221,8 +149,6 @@ class Tree(object):
all_new_partial_answers = [[] for query_part in child_queries_flatten]
# if filters['caching']:
# erase duplicate queries
child_queries_flatten_dedup = []
child_queries_flatten_dedup_indices = []
for query_part in child_queries_flatten:
@ -237,7 +163,6 @@ class Tree(object):
# ask children all queries/partial queries
for child in children:
# obtain children results
# if filters['caching']:
new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup,
create_output_string, filters)
@ -247,32 +172,10 @@ class Tree(object):
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
# else:
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
# permanent_query_trees, child_queries_flatten,
# create_output_string, filters)
#
# assert len(new_partial_answers) == len(child_queries_flatten)
#
# for i, new_partial_subtree in enumerate(new_partial_answers):
# all_new_partial_answers[i].append(new_partial_subtree)
# all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
# # if len(new_partial_answers_architecture[i]) > 1:
# # print('HERE!!!')
# all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
# add 6 queries from 3 split up
# self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
# partial_results_dict, partial_subtrees)
for i in range(len(new_complete_answers)):
# TODO add order rearrangement (TO KEY)
complete_answers[i].extend(new_complete_answers[i])
# if create_output_string_lemma(self) == 'drama':
# print('HERE!@@!')
# if create_output_string_form(self) == 'vpiti':
# print('HERE!@@!')
# merge answers in appropriate way
i = 0
# iterate over all answers per queries
@ -280,60 +183,14 @@ class Tree(object):
# iterate over answers of query
# TODO ERROR IN HERE!
partial_answers[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
# while i < answers_length:
# self.create_grouped_answers()
# i += 1
i += answer_length
# merged_results = []
# for old_result in old_results:
# for new_result in new_results:
# merged_results.append(old_result + new_result)
# return merged_results
# children_queries_generator = self.generate_children_queries(all_query_indices, children)
#
# child_index = 0
# child, child_queries, child_queries_metadata = next(children_queries_generator)
# while child:
# # obtain children results
# new_partial_subtrees, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries, create_output_string)
#
# self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
# partial_results_dict, partial_subtrees)
#
# for i in range(len(new_completed_subtrees)):
# completed_subtrees[i].extend(new_completed_subtrees[i])
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1
return partial_answers, complete_answers
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees,
create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
# string_output = ''
# if create_output_string_form(self) == 'vožnji':
# print('HERE!@@!')
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
node = ResultNode(self, self.index, create_output_string)
# TEST = ResultTree(node, [], filters)
# a = TEST.create_key()
# if i_query < len(active_permanent_query_trees):
# if 'children' in active_permanent_query_trees[i_query]:
# merged_partial_subtrees.append(
# self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters))
# i_answer += 1
# else:
# merged_partial_subtrees.append([Result(self, self.index, create_output_string)])
# else:
# if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]:
# merged_partial_subtrees.append(
# self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters))
# i_answer += 1
# else:
# merged_partial_subtrees.append([Result(self, self.index, create_output_string)])
if i_query < len(active_permanent_query_trees):
if 'children' in active_permanent_query_trees[i_query]:
merged_partial_subtrees.append(
@ -366,9 +223,6 @@ class Tree(object):
:param temporary_query_trees:
"""
# if create_output_string_form(self) == 'vožnji':
# print('HERE!@@!')
# list of all children queries grouped by parent queries
all_query_indices = []
@ -384,7 +238,6 @@ class Tree(object):
successful_temporary_queries = []
for i, temporary_query_tree in enumerate(temporary_query_trees):
if self.fits_static_requirements(temporary_query_tree, filters) and self.fits_temporary_requirements(filters):
# if 'l_children' in temporary_query_tree and 'r_children' in temporary_query_tree:
active_temporary_query_trees.append(temporary_query_tree)
successful_temporary_queries.append(i)
if 'children' in temporary_query_tree:
@ -397,7 +250,6 @@ class Tree(object):
create_output_string, filters)
merged_partial_answers = []
# merged_partial_answers_architecture = []
i_question = 0
# i_child is necessary, because some queries may be answered at the beginning and were not passed to children.
# i_child is used to point where we are inside answers
@ -414,82 +266,30 @@ class Tree(object):
# TODO FINALIZE RESULT
# erase first and last brackets when adding new query result
add_subtree = [subtree.finalize_result() for subtree in merged_partial_answers[i]]
# if 0 < len(active_permanent_query_trees):
complete_answers[i].extend(add_subtree)
# completed_subtrees[i].extend(merged_partial_subtrees[i])
# answers to valid queries
partial_answers = [[] for i in range(len(temporary_query_trees))]
for inside_i, outside_i in enumerate(successful_temporary_queries):
# partial_answers_architecture[outside_i] = merged_partial_answers_architecture[len(active_permanent_query_trees) + inside_i]
partial_answers[outside_i] = merged_partial_answers[
len(active_permanent_query_trees) + inside_i]
# return subtrees_architecture, subtrees, completed_subtrees
return partial_answers, complete_answers
# return merged_partial_subtrees_architecture[len(active_permanent_query_trees):], merged_partial_subtrees[len(active_permanent_query_trees):], completed_subtrees
# @staticmethod
# def merge_results(left_parts, right_parts, separator, left=True, right_part_free=False):
# if not left_parts:
# # return all right_parts
# return [r_p.add_separator(separator, left) for r_p in right_parts]
# # if left:
# # return [r_p + separator for r_p in right_parts]
# # # return [r_p.add_separator(separator, left) for r_p in right_parts]
# # else:
# # return [separator + r_p for r_p in right_parts]
#
# if not right_parts:
# return [l_p.add_separator(separator, False) for l_p in left_parts]
# # return [separator + l_p for l_p in left_parts]
# merged_results = []
# for left_part in left_parts:
# if right_part_free:
# for right_part in right_parts[1]:
# merged_results.append((right_parts[0], left_part.merge_results(right_part, separator, left)))
# else:
# for right_part in right_parts:
# merged_results.append(left_part.merge_results(right_part, separator, left))
# # merged_results.append(left_part.merge_results(right_part, separator))
# # if separator:
# # if left:
# # merged_results.append(left_part + right_part + separator)
# # else:
# # merged_results.append(left_part + separator + right_part)
# # else:
# # merged_results.append(left_part + right_part)
# return merged_results
@staticmethod
def create_children_groups(left_parts, right_parts):
    """Combine two collections of child groups pairwise.

    When one side is empty the other side is returned as-is (the very same
    object, not a copy). Otherwise every group from ``left_parts`` is
    paired with every group from ``right_parts``; each pairing is a shallow
    copy of the left group extended with the right group, so the input
    groups are never mutated.

    :param left_parts: sequence of groups (each group supports ``extend``).
    :param right_parts: sequence of groups to append to each left group.
    :return: list of combined groups, ordered left-major.
    """
    if not left_parts:
        return right_parts
    if not right_parts:
        return left_parts

    def _joined(prefix, suffix):
        # Shallow-copy first so the caller's left group stays untouched.
        group = copy(prefix)
        group.extend(suffix)
        return group

    return [_joined(prefix, suffix)
            for prefix in left_parts
            for suffix in right_parts]
@staticmethod
@ -500,112 +300,15 @@ class Tree(object):
for answer2p_i, new_result in enumerate(answer2):
if answer1p_i != answer2p_i:
new_indices = [answer1p_i] + [answer2p_i]
sorted_indices = sorted(new_indices)
if sorted_indices in merged_indices:
test = merged_indices.index(sorted(new_indices))
# TODO add comparison answers with different indices if equal than ignore
# TODO add comparison answers with different indices if equal than ignore
merged_results.append(old_result + new_result)
merged_indices.append(new_indices)
return merged_results, merged_indices
# def merge_results2(self, child, new_results, filters):
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
# print('HERE!@@!')
# if create_output_string_form(self) == 'prijel':
# print('HERE!@@!')
# if filters['node_order']:
# new_child = child
# # new_child_sorted = sorted(enumerate(child), key=lambda x: x[1][0].key)
# else:
# new_child = sorted(child, key=lambda x: x[0].key)
#
# l_res = []
# r_res = []
# results = []
# for i_answer, answer in enumerate(new_child):
# if filters['node_order'] and answer[0].order[0] < self.index:
# # if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
# if filters['dependency_type']:
# # separator = ' <' + deprel[i_child][i_answer] + ' '
# separator = ' <' + answer[0].deprel + ' '
# else:
# separator = ' < '
# l_res = self.merge_results(l_res, answer, separator, left=True)
# # l_res += answer + separator
# else:
# if filters['dependency_type']:
# separator = ' >' + answer[0].deprel + ' '
# else:
# separator = ' > '
# r_res = self.merge_results(r_res, answer, separator, left=False)
# # r_res += separator + answer
#
# # if filters['node_order']:
# # r_res_sorted = []
# # for i_answer, answer in new_child_sorted:
# # if filters['dependency_type']:
# # separator = ' >' + answer[0].deprel + ' '
# # else:
# # separator = ' > '
# # r_res_sorted = (i_answer, self.merge_results(r_res_sorted, answer, separator, left=False))
# #
# #
# # r_res_sorted_combined = self.merge_results(new_results, r_res_sorted, None, right_part_free=True)
# # # print('here')
#
# if l_res:
# l_res_combined = self.merge_results(l_res, new_results, None)
# if r_res:
# r_res_combined = self.merge_results(l_res_combined, r_res, None)
# # merged_results.extend(['(' + el + ')' for el in r_res_combined])
# result = r_res_combined
# # results.extend([el.put_in_bracelets() for el in r_res_combined])
# else:
# result = l_res_combined
# # results.extend([el.put_in_bracelets() for el in l_res_combined])
# elif r_res:
# r_res_combined = self.merge_results(new_results, r_res, None)
# result = r_res_combined
# # results.extend([el.put_in_bracelets() for el in r_res_combined])
# else:
# result = []
#
#
# results.extend([el.put_in_bracelets() for el in result])
#
# return results
# def create_merged_results(self, answers, separators, separator_switch):
# new_answers = []
# for answer_i, answer in enumerate(answers):
# new_answer = copy(answer[0])
# print(create_output_string_form(self))
# for answer_part_i, answer_part in enumerate(answer[1:]):
# new_answer.extend_answer(answer_part, separators[answer_part_i])
# new_answer.put_in_bracelets(inplace=True)
# new_answers.append(new_answer)
# return new_answers
# def create_merged_results(self, new_child, new_answers, i_child, indices, deprel, filters):
def merge_results3(self, child, new_results, filters):
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
# print('HERE!@@!')
# if create_output_string_form(self) == 'prijel':
# print('HERE!@@!')
if filters['node_order']:
new_child = child
# new_child_sorted = sorted(enumerate(child), key=lambda x: x[1][0].key)
# new_child_sorted = sorted(child, key=lambda x: x[0].get_key())
else:
new_child = sorted(child, key=lambda x: x[0].get_key())
@ -613,58 +316,28 @@ class Tree(object):
for i_answer, answer in enumerate(new_child):
children_groups = self.create_children_groups(children_groups, [[answer_part] for answer_part in answer])
# r_res += separator + answer
# children_groups_sorted = []
# for i_answer, answer in enumerate(new_child_sorted):
# children_groups_sorted = self.create_children_groups(children_groups_sorted, [[answer_part] for answer_part in answer])
#
#
# results_sorted = {}
# for result in new_results:
# for children in children_groups_sorted:
# new_result = copy(result)
# new_result.set_children(children)
# order = tuple(sorted(new_result.get_order()))
# results_sorted[order] = new_result
results = []
for result in new_results:
for children in children_groups:
new_result = copy(result)
# if result.key is not None or result.order is not None or result.array is not None or result.order_key is not None:
# print('here')
# new_result.reset_params()
new_result.set_children(children)
# order = tuple(sorted(new_result.get_order()))
results.append(new_result)
return results
def create_output_children(self, children, new_results, filters):
    """Merge ``new_results`` with every entry of ``children``.

    Calls ``self.merge_results3(child, new_results, filters)`` once per
    entry, in order, and concatenates everything those calls yield into a
    single flat list.

    :param children: iterable of child answer groups.
    :param new_results: results passed through to ``merge_results3``.
    :param filters: filter settings passed through to ``merge_results3``.
    :return: flat list of all merged results.
    """
    return [
        merged
        for child in children
        for merged in self.merge_results3(child, new_results, filters)
    ]
# @staticmethod
def create_answers(self, separated_answers, answer_length, filters):
partly_built_trees = [[None] * answer_length]
partly_built_trees_architecture_indices = [[None] * answer_length]
built_trees = []
built_trees_architecture_indices = []
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# iterate over children first, so that new partly built trees are added only after all results of specific
# child are added
for child_i in range(len(separated_answers[0])):
@ -712,10 +385,7 @@ class Tree(object):
for unique_tree in unique_trees_architecture:
already_in = True
for part_i in range(len(unique_tree)):
# test = unique_tree[part_i][0].get_order_key()
if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].get_order_key() != new_tree[part_i][i_unique_part].get_order_key() for i_unique_part in range(len(unique_tree[part_i]))):
# if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].order_key != new_tree[part_i][i_unique_part].order_key for i_unique_part in range(len(unique_tree[part_i]))):
# if unique_tree[part_i].order_key != new_tree[part_i].order_key:
already_in = False
break
if already_in:
@ -724,20 +394,5 @@ class Tree(object):
if is_unique:
unique_trees_architecture.append(new_tree)
# if not filters['node_order']:
# l_ordered_built_trees_architecture.append(new_tree_architecture)
l_ordered_built_trees.append(new_tree)
# TODO NODE ORDER = FALSE
# else:
#
# ordered_built_trees_architecture.append(tree_architecture)
# ordered_built_trees.append(tree)
# print("test")
# for answer1_i, answer1 in enumerate(separated_answers):
# for answer2_i, answer2 in enumerate(separated_answers):
# if answer1_i != answer2_i:
# res, res_i = self.merge_answer(answer1, answer2, answer1_i, answer2_i)
# print('aaa')
#
# pass
return l_ordered_built_trees

152
dependency-parsetree.py

@ -23,11 +23,13 @@ import pickle
import re
import string
import time
import timeit
from multiprocessing import Pool
from pathlib import Path
import gzip
import sys
import pyconll
from Tree import Tree
from generic import get_collocabilities, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
sys.setrecursionlimit(25000)
def save_zipped_pickle(obj, filename, protocol=-1):
@ -39,31 +41,6 @@ def load_zipped_pickle(filename):
loaded_object = pickle.load(f)
return loaded_object
import pyconll
from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
# for separate searches of feats
# feats_detailed_list = [
# # lexical features
# 'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
#
# # Inflectional features (nominal)
# 'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
#
# # Inflectional features (verbal)
# 'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
#
# # Other
# 'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
# ]
# feats_detailed_list = []
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
from generic import get_collocabilities
def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = False
@ -72,7 +49,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = True
orig_query = orig_query[1:-1]
# if orig_query is '_' return {}
if dependency_type != '':
decoded_query = {'deprel': dependency_type}
else:
@ -88,19 +64,14 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
if len(orig_query_split) > 1:
if orig_query_split[0] == 'L':
decoded_query['lemma'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'upos':
decoded_query['upos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'xpos':
decoded_query['xpos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'form':
decoded_query['form'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'feats':
decoded_query['feats'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] in feats_detailed_list:
decoded_query['feats_detailed'] = {}
decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1]
@ -111,18 +82,11 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
print('???')
elif not new_query:
decoded_query['form'] = orig_query_split_part
# return decoded_query
return decoded_query
# split over spaces if not inside braces
# PATTERN = re.compile(r'''((?:[^ ()]|\([^.]*\))+)''')
# all_orders = PATTERN.split(orig_query)
# PATTERN = re.compile(r"(?:[^ ()]|\([^.]*\))+")
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", orig_query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query)
# all_orders = orig_query.split()
node_actions = all_orders[::2]
priority_actions = all_orders[1::2]
priority_actions_beginnings = [a[0] for a in priority_actions]
@ -148,8 +112,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
# internal_saves = filters['internal_saves']
# input_path = filters['input']
hash_object = hashlib.sha1(input_path.encode('utf-8'))
hex_dig = hash_object.hexdigest()
trees_read_outputfile = os.path.join(internal_saves, hex_dig)
@ -165,13 +127,8 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
for sentence in train:
root = None
root_id = None
token_nodes = []
for token in sentence:
# token_feats = ''
# for k, v in token.feats.items():
# token_feats += k + next(iter(v)) + '|'
# token_feats = token_feats[:-1]
if not token.id.isdigit():
continue
@ -194,12 +151,6 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.set_parent(None)
else:
parent_id = int(token.parent) - 1
# if token_id < parent_id:
# token_nodes[parent_id].add_l_child(token)
# elif token_id > parent_id:
# token_nodes[parent_id].add_r_child(token)
# else:
# raise Exception('Root element should not be here!')
if token_nodes[parent_id].children_split == -1 and token_id > parent_id:
token_nodes[parent_id].children_split = len(token_nodes[parent_id].children)
token_nodes[parent_id].add_child(token)
@ -210,35 +161,19 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.children_split = len(token.children)
if root == None:
# print(input_path)
print('No root: ' + sentence.id)
continue
# raise Exception('No root element in sentence!')
all_trees.append(root)
if save:
save_zipped_pickle((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), trees_read_outputfile, protocol=2)
# with open(trees_read_outputfile, 'wb') as output:
#
# pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), output)
else:
print('Reading trees:')
print('Completed')
all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict = load_zipped_pickle(trees_read_outputfile)
# with open(trees_read_outputfile, 'rb') as pkl_file:
# (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict) = pickle.load(pkl_file)
return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict
# def order_independent_queries(query_tree):
# all_children = query_tree['l_children'] + query_tree['r_children']
# if all_children > 0:
#
# else:
# return query_tree
# pass
def printable_answers(query):
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query)
@ -293,11 +228,6 @@ def tree_calculations_chunks(input_data):
return result_dict
def chunkify(a, n):
    """Split the sequence *a* into *n* contiguous, nearly equal-sized chunks.

    The first ``len(a) % n`` chunks receive one extra element, so chunk sizes
    differ by at most one. When ``n`` exceeds ``len(a)`` the trailing chunks
    are empty.

    Args:
        a: A sliceable sequence that supports ``len()``.
        n: Number of chunks to produce; must be at least 1.

    Returns:
        A generator yielding ``n`` slices of ``a`` in order.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 fails inside divmod with an opaque
    # ZeroDivisionError and a negative n silently yields nothing.
    if n < 1:
        raise ValueError('chunkify() requires n >= 1, got %r' % (n,))
    # k is the base chunk size, m the number of chunks that get one extra item.
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
def add_node(tree):
if 'children' in tree:
tree['children'].append({})
@ -362,30 +292,11 @@ def create_ngrams_query_trees(n, trees):
new_trees.append(new_tree)
trees = new_trees
# delete_duplicates(trees)
# print('here')
# tree_grow(tree)
# tree_grow(tree)
# tree['children'] = [{}]
return trees
def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict):
with Pool(cpu_cores) as p:
# 1.25 s (16 cores)
# chunked_trees = list(chunkify(all_trees, cpu_cores))
# if cpu_cores > 1:
# part_results = p.map(tree_calculations_chunks,
# [(tree, query_tree, create_output_string_funct, filters) for tree in chunked_trees])
#
# for part_result in part_results:
# for r_k, r_v in part_result.items():
# if r_k in result_dict:
# result_dict[r_k] += r_v
# else:
# result_dict[r_k] = r_v
# 1.02 s (16 cores)
if cpu_cores > 1:
# input_data = (tree, query_tree, create_output_string_functs, filters)
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
for unigrams in all_unigrams:
for unigram in unigrams:
@ -396,24 +307,14 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
# for subtrees in all_subtrees:
for tree_i, subtrees in enumerate(all_subtrees):
for query_results in subtrees:
for r in query_results:
# if r.key == '(ne <advmod more >xcomp (se <expl izogniti) >punct .)':
# print('HERE')
# print(tree_i)
if filters['node_order']:
key = r.get_key() + r.order
else:
key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
# if r in result_dict:
# result_dict[r] += 1
# else:
# result_dict[r] = 1
if key in result_dict:
result_dict[key]['number'] += 1
else:
@ -421,11 +322,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
# 3.65 s (1 core)
else:
# for tree_i, tree in enumerate(all_trees[-5:]):
for tree_i, tree in enumerate(all_trees):
# for tree_i, tree in enumerate(all_trees[852:]):
# for tree_i, tree in enumerate(all_trees[1689:]):
# for tree_i, tree in enumerate(all_trees[1:3]):
input_data = (tree, query_tree, create_output_string_functs, filters)
if filters['association_measures']:
unigrams = get_unigrams(input_data)
@ -434,10 +331,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
# for tree_i, tree in enumerate(all_trees[1:]):
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
# for tree_i, tree in enumerate(all_trees[5170:]):
# for tree in all_trees:
subtrees = tree_calculations(input_data)
for query_results in subtrees:
for r in query_results:
@ -445,8 +339,6 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
key = r.get_key() + r.order
else:
key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
if key in result_dict:
result_dict[key]['number'] += 1
else:
@ -465,7 +357,6 @@ def read_filters(config, feats_detailed_list):
query_tree.extend(create_ngrams_query_trees(i, [{}]))
else:
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '', feats_detailed_list)]
# order_independent_queries(query_tree)
# set filters
node_types = config.get('settings', 'node_type').split('+')
@ -506,11 +397,8 @@ def read_filters(config, feats_detailed_list):
attribute_dict = {}
for attribute in option.split('&'):
value = attribute.split('=')
# assert value[0] in ['deprel', 'lemma', 'upos', 'xpos', 'form',
# 'feats'], '"root_whitelist" is not set up correctly'
attribute_dict[value[0]] = value[1]
filters['root_whitelist'].append(attribute_dict)
# filters['root_whitelist'] = [{'upos': 'NOUN', 'Case': 'Nom'}, {'upos': 'ADJ', 'Degree': 'Sup'}]
else:
filters['root_whitelist'] = []
@ -540,12 +428,6 @@ def main():
internal_saves = config.get('settings', 'internal_saves')
input_path = config.get('settings', 'input')
# a = args.config_file
# config.read('config.ini')
# create queries
if os.path.isdir(input_path):
checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
@ -572,9 +454,6 @@ def main():
for path in sorted(pathlist):
# because path is object not string
path_str = str(path)
# if Path(path_str).name == 'GF0003946-dedup.conllu':
# break
# print(path_in_str)
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
@ -593,7 +472,6 @@ def main():
# 15.26
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
# print(1 + 'asd')
save_zipped_pickle(
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
checkpoint_path, protocol=2)
@ -620,26 +498,6 @@ def main():
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
# test 1 layer queries
# # tree.r_children = []
# # tree.children[1].children = []
# # query = [{'children': [{}]}, {'children': [{}]}]
# # query = [{"children": [{}, {}]}, {"children": [{}]}, {"children": [{}, {}, {}]}]
# query = [{"children": [{'form': 'je'}, {}]}, {"children": [{'form': 'je'}]}, {"children": [{'form': 'je'}, {}, {}]}]
# # query = [{'q1':'', "children": [{'a1':''}, {'a2':''}]}, {'q2':'', "children": [{'b1':''}]}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}]}]
# _, _, subtrees = tree.get_subtrees(query, [], create_output_string_funct)
# # _, subtrees = tree.get_subtrees([{'q1':'', "children": [{'a1':''}, {'a2':''}], "children": []}, {'q2':'', "children": [{'b1':''}], "children": []}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}], "children": []}], [])
# print('HERE!')
# test 2 layer queries
# tree.r_children = [Tree('je', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)]
# tree.l_children[1].l_children = []
# new_tree = Tree('bil', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)
# new_tree.l_children = [tree]
# _, subtrees = new_tree.get_subtrees(
# [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}]}]}], [])
# # _, subtrees = new_tree.get_subtrees(
# # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], [])
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
with open(config.get('settings', 'output'), "w", newline="") as f:
@ -660,7 +518,6 @@ def main():
header += ['Root node']
if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
writer.writerow(header)
if filters['lines_threshold']:
@ -673,7 +530,6 @@ def main():
if filters['frequency_threshold'] and filters['frequency_threshold'] > v['number']:
break
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
# words_only = printable_answers(k)
row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])]
row += ['%.4f' % relative_frequency]
if filters['node_order']:

3
generic.py

@ -45,7 +45,6 @@ def generate_key(node, create_output_strings, print_lemma=True):
if len(array[0]) > 1:
key = '&'.join(key_array[0])
else:
# output_string = create_output_strings[0](node)
key = key_array[0][0]
return array, key
@ -61,7 +60,6 @@ def generate_name(node, create_output_strings, print_lemma=True):
if len(array) > 1:
name = '&'.join(name_array)
else:
# output_string = create_output_strings[0](node)
name = name_array[0]
return array, name
@ -74,7 +72,6 @@ def get_collocabilities(ngram, unigrams_dict, corpus_size):
if len(key_array) > 1:
key = '&'.join(key_array)
else:
# output_string = create_output_strings[0](node)
key = key_array[0]
sum_fwi += unigrams_dict[key]
mul_fwi *= unigrams_dict[key]

Loading…
Cancel
Save