Cleaning repo

This commit is contained in:
Luka 2019-12-30 11:30:36 +01:00
parent 36223d3a98
commit 421f12cac6
5 changed files with 8 additions and 606 deletions

View File

@ -18,17 +18,8 @@ from generic import generate_key, generate_name
class ResultNode(object): class ResultNode(object):
def __init__(self, node, architecture_order, create_output_strings): def __init__(self, node, architecture_order, create_output_strings):
self.name_parts, self.name = generate_name(node, create_output_strings) self.name_parts, self.name = generate_name(node, create_output_strings)
# self.key_free = self.key
# self.array = [[output_string]]
# self.order_key = str(architecture_order)
self.location = architecture_order self.location = architecture_order
self.deprel = node.deprel.get_value() self.deprel = node.deprel.get_value()
# order with original numbers in sentences
# self.order = str([architecture_order])
# order with numbers from 0 to n of n-gram
# self.root = ''
# self.final_order = ''
# self.separators = []
def __repr__(self): def __repr__(self):
return self.name return self.name

View File

@ -15,9 +15,6 @@
import copy import copy
import string import string
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class ResultTree(object): class ResultTree(object):
def __init__(self, node, children, filters): def __init__(self, node, children, filters):
@ -51,7 +48,6 @@ class ResultTree(object):
for child in children: for child in children:
if self.filters['node_order'] and child.node.location < self.node.location: if self.filters['node_order'] and child.node.location < self.node.location:
if self.filters['dependency_type']: if self.filters['dependency_type']:
# separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + child.node.deprel + ' ' separator = ' <' + child.node.deprel + ' '
else: else:
separator = ' < ' separator = ' < '
@ -104,7 +100,6 @@ class ResultTree(object):
for child in self.children: for child in self.children:
if self.filters['node_order'] and child.node.location < self.node.location: if self.filters['node_order'] and child.node.location < self.node.location:
if self.filters['dependency_type']: if self.filters['dependency_type']:
# separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + child.node.deprel + ' ' separator = ' <' + child.node.deprel + ' '
else: else:
separator = ' < ' separator = ' < '
@ -169,93 +164,9 @@ class ResultTree(object):
self.array = [self.node.name_parts] self.array = [self.node.name_parts]
return self.array return self.array
# def add(self, string, architecture_order, separator, is_left):
# if is_left:
# self.array = [string] + self.array
# self.order = [architecture_order] + self.order
# # self.order = [architecture_order] + self.order
# self.separators = [separator] + self.separators
# self.key = string + ' ' + separator + ' ' + self.key
# self.order_key = architecture_order + ' ' + separator + ' ' + self.order_key
#
# else:
# self.array += [string]
# self.order += [architecture_order]
# # self.order += [architecture_order]
# self.separators += [separator]
#
# self.key += ' ' + separator + ' ' + string
# self.order_key += ' ' + separator + ' ' + architecture_order
# def add_separator(self, separator, left=True):
# self_copy = copy.copy(self)
# if left:
# self_copy.separators += [separator]
# self_copy.key += separator
# self_copy.order_key += separator
# else:
# self_copy.separators = [separator] + self_copy.separators
# self_copy.key = separator + self_copy.key
# self_copy.order_key = separator + self_copy.order_key
# return self_copy
# def merge_results2(self):
# def merge_results(self, right_t, separator, left=True):
# left_tree = copy.copy(self)
# right_tree = copy.copy(right_t)
#
# if separator:
# if left:
# # merged_results.append(left_part + right_part + separator)
# left_tree.key = left_tree.key + right_tree.key + separator
# left_tree.order_key = left_tree.order_key + right_tree.order_key + separator
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + right_tree.separators + [separator]
# else:
# # merged_results.append(left_part + separator + right_part)
# left_tree.key = left_tree.key + separator + right_tree.key
# left_tree.order_key = left_tree.order_key + separator + right_tree.order_key
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + [separator] + right_tree.separators
# else:
# # merged_results.append(left_part + right_part)
# left_tree.key = left_tree.key + right_tree.key
# left_tree.order_key = left_tree.order_key + right_tree.order_key
# left_tree.array = left_tree.array + right_tree.array
# left_tree.order = left_tree.order + right_tree.order
# # left_tree.order = str([architecture_order])
# left_tree.separators = left_tree.separators + right_tree.separators
#
# return left_tree
# def extend_answer(self, other_answer, separator):
# self.array.extend(other_answer.array)
# self.order.extend(other_answer.order)
# self.key += separator + other_answer.key
# self.order_key += separator + other_answer.order_key
# self.separators.extend(separator)
# def put_in_bracelets(self, inplace=False):
# if inplace:
# self.key = ('(' + self.key + ')')
# self.order_key = ('(' + self.order_key + ')')
# return
# result = copy.copy(self)
# result.key = ('(' + result.key + ')')
# result.order_key = ('(' + result.order_key + ')')
# return result
def finalize_result(self): def finalize_result(self):
result = copy.copy(self) result = copy.copy(self)
result.reset_params() result.reset_params()
# result.key = result.get_key()
# result.set_root()
# create order letters # create order letters
order = result.get_order() order = result.get_order()
@ -265,13 +176,5 @@ class ResultTree(object):
order[ind] = 10000 order[ind] = 10000
order_letters[ind] = string.ascii_uppercase[i] order_letters[ind] = string.ascii_uppercase[i]
result.order = ''.join(order_letters) result.order = ''.join(order_letters)
# result.order_key = result.order_key[1:-1]
# TODO When tree is finalized create relative word order (alphabet)! # TODO When tree is finalized create relative word order (alphabet)!
return result return result
# def set_root(self):
# if len(self.array[0]) > 1:
# self.root = '&'.join(self.array[0])
# else:
# # output_string = create_output_strings[0](node)
# self.root = self.array[0][0]

351
Tree.py
View File

@ -4,8 +4,7 @@ from copy import copy
from ResultNode import ResultNode from ResultNode import ResultNode
from ResultTree import ResultTree from ResultTree import ResultTree
from Value import Value from Value import Value
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \ from generic import generate_key
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class Tree(object): class Tree(object):
@ -13,7 +12,6 @@ class Tree(object):
if not hasattr(self, 'feats'): if not hasattr(self, 'feats'):
self.feats_detailed = {} self.feats_detailed = {}
# form_unicode = str(form).encode("utf-8")
if form not in form_dict: if form not in form_dict:
form_dict[form] = Value(form) form_dict[form] = Value(form)
self.form = form_dict[form] self.form = form_dict[form]
@ -40,7 +38,6 @@ class Tree(object):
if not feat in self.feats_detailed: if not feat in self.feats_detailed:
self.feats_detailed[feat] = {} self.feats_detailed[feat] = {}
self.feats_detailed[feat][next(iter(feats_detailed[feat]))] = feats_detailed_dict[feat][next(iter(feats_detailed[feat]))] self.feats_detailed[feat][next(iter(feats_detailed[feat]))] = feats_detailed_dict[feat][next(iter(feats_detailed[feat]))]
# self.position = position
self.parent = head self.parent = head
self.children = [] self.children = []
@ -52,7 +49,6 @@ class Tree(object):
self.cache = {} self.cache = {}
def add_child(self, child): def add_child(self, child):
# child.index = len(self.children)
self.children.append(child) self.children.append(child)
def set_parent(self, parent): def set_parent(self, parent):
@ -68,7 +64,6 @@ class Tree(object):
return True return True
def fits_permanent_requirements(self, filters): def fits_permanent_requirements(self, filters):
main_attributes = ['deprel', 'feats', 'form', 'lemma', 'upos'] main_attributes = ['deprel', 'feats', 'form', 'lemma', 'upos']
@ -121,8 +116,6 @@ class Tree(object):
if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0: if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0:
if len(all_query_indices[result_index][0]) > result_part_index + 1: if len(all_query_indices[result_index][0]) > result_part_index + 1:
new_queries.append((result_part_index + 1, result_index, is_permanent)) new_queries.append((result_part_index + 1, result_index, is_permanent))
# else:
# completed_subtrees.append((child, result_index))
child_queries_metadata = new_queries child_queries_metadata = new_queries
@ -142,76 +135,11 @@ class Tree(object):
def add_subtrees(self, old_subtree, new_subtree): def add_subtrees(self, old_subtree, new_subtree):
old_subtree.extend(new_subtree) old_subtree.extend(new_subtree)
# def get_results(self, partial_results_dict, result_index, result_part, outcome, last_result_part):
# # save results for later usage
#
# # if result index already in and element 0 exists (otherwise error)
# if result_index in partial_results_dict and 0 in partial_results_dict[result_index]:
# if result_part - 1 in partial_results_dict[result_index]:
# if result_part in partial_results_dict[result_index]:
# partial_results_dict[result_index][result_part].extend(self.merge_results(partial_results_dict[result_index][result_part - 1], outcome))
# else:
# partial_results_dict[result_index][result_part] = self.merge_results(partial_results_dict[result_index][result_part - 1], outcome)
#
# # extend one word layer with output
# else:
# partial_results_dict[result_index][0].extend(outcome)
# else:
# partial_results_dict[result_index] = {0: outcome}
#
# if last_result_part - 1 in partial_results_dict[result_index]:
# return partial_results_dict[result_index].pop(last_result_part - 1)
# return []
# def group_results(self, new_partial_subtrees, child_queries_metadata, all_query_indices, partial_results_dict, partial_subtrees):
# for outcome, (result_part, result_index, is_permanent) in zip(new_partial_subtrees, child_queries_metadata):
# if outcome:
# new_results = self.get_results(partial_results_dict, result_index, result_part, outcome, len(all_query_indices[result_index][0]))
# if new_results:
# self.add_subtrees(partial_subtrees[result_index], new_results)
# else:
# if not is_permanent:
# partial_subtrees[result_index].append([])
# def get_all_query_indices_old(self, temporary_query_trees_size, completed_subtrees_size, permanent_query_trees, l_all_query_indices, children, create_output_string):
# partial_subtrees = [[] for i in range(completed_subtrees_size + temporary_query_trees_size)]
# completed_subtrees = [[] for i in range(completed_subtrees_size)]
#
# # list of pairs (index of query in group, group of query)
# partial_results_dict = {}
#
# children_queries_generator = self.generate_children_queries(l_all_query_indices, children)
#
# child_index = 0
# child, child_queries, child_queries_metadata = next(children_queries_generator)
# while child:
# # obtain children results
# new_partial_subtrees, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries, create_output_string)
#
# self.group_results(new_partial_subtrees, child_queries_metadata, l_all_query_indices,
# partial_results_dict, partial_subtrees)
#
# for i in range(len(new_completed_subtrees)):
# completed_subtrees[i].extend(new_completed_subtrees[i])
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1
#
# return partial_subtrees, completed_subtrees
def get_all_query_indices(self, temporary_query_nb, permanent_query_nb, permanent_query_trees, all_query_indices, children, create_output_string, filters): def get_all_query_indices(self, temporary_query_nb, permanent_query_nb, permanent_query_trees, all_query_indices, children, create_output_string, filters):
# l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees),
# len(permanent_query_trees),
# permanent_query_trees,
# l_all_query_indices, self.l_children,
# create_output_string)
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)] partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
complete_answers = [[] for i in range(permanent_query_nb)] complete_answers = [[] for i in range(permanent_query_nb)]
# list of pairs (index of query in group, group of query) # list of pairs (index of query in group, group of query)
partial_results_dict = {}
# TODO try to erase!!! # TODO try to erase!!!
child_queries = [all_query_indice[0] for all_query_indice in all_query_indices] child_queries = [all_query_indice[0] for all_query_indice in all_query_indices]
@ -221,8 +149,6 @@ class Tree(object):
all_new_partial_answers = [[] for query_part in child_queries_flatten] all_new_partial_answers = [[] for query_part in child_queries_flatten]
# if filters['caching']:
# erase duplicate queries
child_queries_flatten_dedup = [] child_queries_flatten_dedup = []
child_queries_flatten_dedup_indices = [] child_queries_flatten_dedup_indices = []
for query_part in child_queries_flatten: for query_part in child_queries_flatten:
@ -237,7 +163,6 @@ class Tree(object):
# ask children all queries/partial queries # ask children all queries/partial queries
for child in children: for child in children:
# obtain children results # obtain children results
# if filters['caching']:
new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup, new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup,
create_output_string, filters) create_output_string, filters)
@ -247,32 +172,10 @@ class Tree(object):
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices): for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index]) all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
# else:
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
# permanent_query_trees, child_queries_flatten,
# create_output_string, filters)
#
# assert len(new_partial_answers) == len(child_queries_flatten)
#
# for i, new_partial_subtree in enumerate(new_partial_answers):
# all_new_partial_answers[i].append(new_partial_subtree)
# all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
# # if len(new_partial_answers_architecture[i]) > 1:
# # print('HERE!!!')
# all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
# add 6 queries from 3 split up
# self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
# partial_results_dict, partial_subtrees)
for i in range(len(new_complete_answers)): for i in range(len(new_complete_answers)):
# TODO add order rearrangement (TO KEY) # TODO add order rearrangement (TO KEY)
complete_answers[i].extend(new_complete_answers[i]) complete_answers[i].extend(new_complete_answers[i])
# if create_output_string_lemma(self) == 'drama':
# print('HERE!@@!')
# if create_output_string_form(self) == 'vpiti':
# print('HERE!@@!')
# merge answers in appropriate way # merge answers in appropriate way
i = 0 i = 0
# iterate over all answers per queries # iterate over all answers per queries
@ -280,60 +183,14 @@ class Tree(object):
# iterate over answers of query # iterate over answers of query
# TODO ERROR IN HERE! # TODO ERROR IN HERE!
partial_answers[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters) partial_answers[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
# while i < answers_length:
# self.create_grouped_answers()
# i += 1
i += answer_length i += answer_length
# merged_results = []
# for old_result in old_results:
# for new_result in new_results:
# merged_results.append(old_result + new_result)
# return merged_results
# children_queries_generator = self.generate_children_queries(all_query_indices, children)
#
# child_index = 0
# child, child_queries, child_queries_metadata = next(children_queries_generator)
# while child:
# # obtain children results
# new_partial_subtrees, new_completed_subtrees = child.get_subtrees(permanent_query_trees, child_queries, create_output_string)
#
# self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
# partial_results_dict, partial_subtrees)
#
# for i in range(len(new_completed_subtrees)):
# completed_subtrees[i].extend(new_completed_subtrees[i])
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1
return partial_answers, complete_answers return partial_answers, complete_answers
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees,
create_output_string, merged_partial_subtrees, i_query, i_answer, filters): create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
# string_output = ''
# if create_output_string_form(self) == 'vožnji':
# print('HERE!@@!')
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
node = ResultNode(self, self.index, create_output_string) node = ResultNode(self, self.index, create_output_string)
# TEST = ResultTree(node, [], filters)
# a = TEST.create_key()
# if i_query < len(active_permanent_query_trees):
# if 'children' in active_permanent_query_trees[i_query]:
# merged_partial_subtrees.append(
# self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters))
# i_answer += 1
# else:
# merged_partial_subtrees.append([Result(self, self.index, create_output_string)])
# else:
# if 'children' in active_temporary_query_trees[i_query - len(active_permanent_query_trees)]:
# merged_partial_subtrees.append(
# self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters))
# i_answer += 1
# else:
# merged_partial_subtrees.append([Result(self, self.index, create_output_string)])
if i_query < len(active_permanent_query_trees): if i_query < len(active_permanent_query_trees):
if 'children' in active_permanent_query_trees[i_query]: if 'children' in active_permanent_query_trees[i_query]:
merged_partial_subtrees.append( merged_partial_subtrees.append(
@ -366,9 +223,6 @@ class Tree(object):
:param temporary_query_trees: :param temporary_query_trees:
""" """
# if create_output_string_form(self) == 'vožnji':
# print('HERE!@@!')
# list of all children queries grouped by parent queries # list of all children queries grouped by parent queries
all_query_indices = [] all_query_indices = []
@ -384,7 +238,6 @@ class Tree(object):
successful_temporary_queries = [] successful_temporary_queries = []
for i, temporary_query_tree in enumerate(temporary_query_trees): for i, temporary_query_tree in enumerate(temporary_query_trees):
if self.fits_static_requirements(temporary_query_tree, filters) and self.fits_temporary_requirements(filters): if self.fits_static_requirements(temporary_query_tree, filters) and self.fits_temporary_requirements(filters):
# if 'l_children' in temporary_query_tree and 'r_children' in temporary_query_tree:
active_temporary_query_trees.append(temporary_query_tree) active_temporary_query_trees.append(temporary_query_tree)
successful_temporary_queries.append(i) successful_temporary_queries.append(i)
if 'children' in temporary_query_tree: if 'children' in temporary_query_tree:
@ -397,7 +250,6 @@ class Tree(object):
create_output_string, filters) create_output_string, filters)
merged_partial_answers = [] merged_partial_answers = []
# merged_partial_answers_architecture = []
i_question = 0 i_question = 0
# i_child is necessary, because some queries may be answered at the beginning and were not passed to children. # i_child is necessary, because some queries may be answered at the beginning and were not passed to children.
# i_child is used to point where we are inside answers # i_child is used to point where we are inside answers
@ -414,82 +266,30 @@ class Tree(object):
# TODO FINALIZE RESULT # TODO FINALIZE RESULT
# erase first and last brackets when adding new query result # erase first and last brackets when adding new query result
add_subtree = [subtree.finalize_result() for subtree in merged_partial_answers[i]] add_subtree = [subtree.finalize_result() for subtree in merged_partial_answers[i]]
# if 0 < len(active_permanent_query_trees):
complete_answers[i].extend(add_subtree) complete_answers[i].extend(add_subtree)
# completed_subtrees[i].extend(merged_partial_subtrees[i])
# answers to valid queries # answers to valid queries
partial_answers = [[] for i in range(len(temporary_query_trees))] partial_answers = [[] for i in range(len(temporary_query_trees))]
for inside_i, outside_i in enumerate(successful_temporary_queries): for inside_i, outside_i in enumerate(successful_temporary_queries):
# partial_answers_architecture[outside_i] = merged_partial_answers_architecture[len(active_permanent_query_trees) + inside_i]
partial_answers[outside_i] = merged_partial_answers[ partial_answers[outside_i] = merged_partial_answers[
len(active_permanent_query_trees) + inside_i] len(active_permanent_query_trees) + inside_i]
# return subtrees_architecture, subtrees, completed_subtrees
return partial_answers, complete_answers return partial_answers, complete_answers
# return merged_partial_subtrees_architecture[len(active_permanent_query_trees):], merged_partial_subtrees[len(active_permanent_query_trees):], completed_subtrees
# @staticmethod
# def merge_results(left_parts, right_parts, separator, left=True, right_part_free=False):
# if not left_parts:
# # return all right_parts
# return [r_p.add_separator(separator, left) for r_p in right_parts]
# # if left:
# # return [r_p + separator for r_p in right_parts]
# # # return [r_p.add_separator(separator, left) for r_p in right_parts]
# # else:
# # return [separator + r_p for r_p in right_parts]
#
# if not right_parts:
# return [l_p.add_separator(separator, False) for l_p in left_parts]
# # return [separator + l_p for l_p in left_parts]
# merged_results = []
# for left_part in left_parts:
# if right_part_free:
# for right_part in right_parts[1]:
# merged_results.append((right_parts[0], left_part.merge_results(right_part, separator, left)))
# else:
# for right_part in right_parts:
# merged_results.append(left_part.merge_results(right_part, separator, left))
# # merged_results.append(left_part.merge_results(right_part, separator))
# # if separator:
# # if left:
# # merged_results.append(left_part + right_part + separator)
# # else:
# # merged_results.append(left_part + separator + right_part)
# # else:
# # merged_results.append(left_part + right_part)
# return merged_results
@staticmethod @staticmethod
def create_children_groups(left_parts, right_parts): def create_children_groups(left_parts, right_parts):
if not left_parts: if not left_parts:
# return all right_parts
return right_parts return right_parts
# if left:
# return [r_p + separator for r_p in right_parts]
# # return [r_p.add_separator(separator, left) for r_p in right_parts]
# else:
# return [separator + r_p for r_p in right_parts]
if not right_parts: if not right_parts:
return left_parts return left_parts
# return [separator + l_p for l_p in left_parts]
all_children_group_possibilities = [] all_children_group_possibilities = []
for left_part in left_parts: for left_part in left_parts:
for right_part in right_parts: for right_part in right_parts:
new_part = copy(left_part) new_part = copy(left_part)
# new_part.reset_params()
new_part.extend(right_part) new_part.extend(right_part)
all_children_group_possibilities.append(new_part) all_children_group_possibilities.append(new_part)
# merged_results.append(left_part.merge_results(right_part, separator))
# if separator:
# if left:
# merged_results.append(left_part + right_part + separator)
# else:
# merged_results.append(left_part + separator + right_part)
# else:
# merged_results.append(left_part + right_part)
return all_children_group_possibilities return all_children_group_possibilities
@staticmethod @staticmethod
@ -500,112 +300,15 @@ class Tree(object):
for answer2p_i, new_result in enumerate(answer2): for answer2p_i, new_result in enumerate(answer2):
if answer1p_i != answer2p_i: if answer1p_i != answer2p_i:
new_indices = [answer1p_i] + [answer2p_i] new_indices = [answer1p_i] + [answer2p_i]
sorted_indices = sorted(new_indices)
if sorted_indices in merged_indices:
test = merged_indices.index(sorted(new_indices))
# TODO add comparison answers with different indices if equal then ignore # TODO add comparison answers with different indices if equal then ignore
merged_results.append(old_result + new_result) merged_results.append(old_result + new_result)
merged_indices.append(new_indices) merged_indices.append(new_indices)
return merged_results, merged_indices return merged_results, merged_indices
# def merge_results2(self, child, new_results, filters):
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
# print('HERE!@@!')
# if create_output_string_form(self) == 'prijel':
# print('HERE!@@!')
# if filters['node_order']:
# new_child = child
# # new_child_sorted = sorted(enumerate(child), key=lambda x: x[1][0].key)
# else:
# new_child = sorted(child, key=lambda x: x[0].key)
#
# l_res = []
# r_res = []
# results = []
# for i_answer, answer in enumerate(new_child):
# if filters['node_order'] and answer[0].order[0] < self.index:
# # if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
# if filters['dependency_type']:
# # separator = ' <' + deprel[i_child][i_answer] + ' '
# separator = ' <' + answer[0].deprel + ' '
# else:
# separator = ' < '
# l_res = self.merge_results(l_res, answer, separator, left=True)
# # l_res += answer + separator
# else:
# if filters['dependency_type']:
# separator = ' >' + answer[0].deprel + ' '
# else:
# separator = ' > '
# r_res = self.merge_results(r_res, answer, separator, left=False)
# # r_res += separator + answer
#
# # if filters['node_order']:
# # r_res_sorted = []
# # for i_answer, answer in new_child_sorted:
# # if filters['dependency_type']:
# # separator = ' >' + answer[0].deprel + ' '
# # else:
# # separator = ' > '
# # r_res_sorted = (i_answer, self.merge_results(r_res_sorted, answer, separator, left=False))
# #
# #
# # r_res_sorted_combined = self.merge_results(new_results, r_res_sorted, None, right_part_free=True)
# # # print('here')
#
# if l_res:
# l_res_combined = self.merge_results(l_res, new_results, None)
# if r_res:
# r_res_combined = self.merge_results(l_res_combined, r_res, None)
# # merged_results.extend(['(' + el + ')' for el in r_res_combined])
# result = r_res_combined
# # results.extend([el.put_in_bracelets() for el in r_res_combined])
# else:
# result = l_res_combined
# # results.extend([el.put_in_bracelets() for el in l_res_combined])
# elif r_res:
# r_res_combined = self.merge_results(new_results, r_res, None)
# result = r_res_combined
# # results.extend([el.put_in_bracelets() for el in r_res_combined])
# else:
# result = []
#
#
# results.extend([el.put_in_bracelets() for el in result])
#
# return results
# def create_merged_results(self, answers, separators, separator_switch):
# new_answers = []
# for answer_i, answer in enumerate(answers):
# new_answer = copy(answer[0])
# print(create_output_string_form(self))
# for answer_part_i, answer_part in enumerate(answer[1:]):
# new_answer.extend_answer(answer_part, separators[answer_part_i])
# new_answer.put_in_bracelets(inplace=True)
# new_answers.append(new_answer)
# return new_answers
# def create_merged_results(self, new_child, new_answers, i_child, indices, deprel, filters):
def merge_results3(self, child, new_results, filters): def merge_results3(self, child, new_results, filters):
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'začelo':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
# print('HERE!@@!')
# if create_output_string_form(self) == 'prijel':
# print('HERE!@@!')
if filters['node_order']: if filters['node_order']:
new_child = child new_child = child
# new_child_sorted = sorted(enumerate(child), key=lambda x: x[1][0].key)
# new_child_sorted = sorted(child, key=lambda x: x[0].get_key())
else: else:
new_child = sorted(child, key=lambda x: x[0].get_key()) new_child = sorted(child, key=lambda x: x[0].get_key())
@ -613,58 +316,28 @@ class Tree(object):
for i_answer, answer in enumerate(new_child): for i_answer, answer in enumerate(new_child):
children_groups = self.create_children_groups(children_groups, [[answer_part] for answer_part in answer]) children_groups = self.create_children_groups(children_groups, [[answer_part] for answer_part in answer])
# r_res += separator + answer
# children_groups_sorted = []
# for i_answer, answer in enumerate(new_child_sorted):
# children_groups_sorted = self.create_children_groups(children_groups_sorted, [[answer_part] for answer_part in answer])
#
#
# results_sorted = {}
# for result in new_results:
# for children in children_groups_sorted:
# new_result = copy(result)
# new_result.set_children(children)
# order = tuple(sorted(new_result.get_order()))
# results_sorted[order] = new_result
results = [] results = []
for result in new_results: for result in new_results:
for children in children_groups: for children in children_groups:
new_result = copy(result) new_result = copy(result)
# if result.key is not None or result.order is not None or result.array is not None or result.order_key is not None:
# print('here')
# new_result.reset_params()
new_result.set_children(children) new_result.set_children(children)
# order = tuple(sorted(new_result.get_order()))
results.append(new_result) results.append(new_result)
return results return results
def create_output_children(self, children, new_results, filters): def create_output_children(self, children, new_results, filters):
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
# print('HERE!@@!')
# if len(new_results) > 1:
# print('HERE')
merged_results = [] merged_results = []
for i_child, child in enumerate(children): for i_child, child in enumerate(children):
# merged_results.extend(self.merge_results2(child, new_results, filters))
merged_results.extend(self.merge_results3(child, new_results, filters)) merged_results.extend(self.merge_results3(child, new_results, filters))
return merged_results return merged_results
# @staticmethod
def create_answers(self, separated_answers, answer_length, filters): def create_answers(self, separated_answers, answer_length, filters):
partly_built_trees = [[None] * answer_length] partly_built_trees = [[None] * answer_length]
partly_built_trees_architecture_indices = [[None] * answer_length] partly_built_trees_architecture_indices = [[None] * answer_length]
built_trees = [] built_trees = []
built_trees_architecture_indices = [] built_trees_architecture_indices = []
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# iterate over children first, so that new partly built trees are added only after all results of specific # iterate over children first, so that new partly built trees are added only after all results of specific
# child are added # child are added
for child_i in range(len(separated_answers[0])): for child_i in range(len(separated_answers[0])):
@ -712,10 +385,7 @@ class Tree(object):
for unique_tree in unique_trees_architecture: for unique_tree in unique_trees_architecture:
already_in = True already_in = True
for part_i in range(len(unique_tree)): for part_i in range(len(unique_tree)):
# test = unique_tree[part_i][0].get_order_key()
if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].get_order_key() != new_tree[part_i][i_unique_part].get_order_key() for i_unique_part in range(len(unique_tree[part_i]))): if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].get_order_key() != new_tree[part_i][i_unique_part].get_order_key() for i_unique_part in range(len(unique_tree[part_i]))):
# if len(unique_tree[part_i]) != len(new_tree[part_i]) or any(unique_tree[part_i][i_unique_part].order_key != new_tree[part_i][i_unique_part].order_key for i_unique_part in range(len(unique_tree[part_i]))):
# if unique_tree[part_i].order_key != new_tree[part_i].order_key:
already_in = False already_in = False
break break
if already_in: if already_in:
@ -724,20 +394,5 @@ class Tree(object):
if is_unique: if is_unique:
unique_trees_architecture.append(new_tree) unique_trees_architecture.append(new_tree)
# if not filters['node_order']:
# l_ordered_built_trees_architecture.append(new_tree_architecture)
l_ordered_built_trees.append(new_tree) l_ordered_built_trees.append(new_tree)
# TODO NODE ORDER = FALSE
# else:
#
# ordered_built_trees_architecture.append(tree_architecture)
# ordered_built_trees.append(tree)
# print("test")
# for answer1_i, answer1 in enumerate(separated_answers):
# for answer2_i, answer2 in enumerate(separated_answers):
# if answer1_i != answer2_i:
# res, res_i = self.merge_answer(answer1, answer2, answer1_i, answer2_i)
# print('aaa')
#
# pass
return l_ordered_built_trees return l_ordered_built_trees

View File

@ -23,11 +23,13 @@ import pickle
import re import re
import string import string
import time import time
import timeit
from multiprocessing import Pool from multiprocessing import Pool
from pathlib import Path from pathlib import Path
import gzip import gzip
import sys import sys
import pyconll
from Tree import Tree
from generic import get_collocabilities, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
sys.setrecursionlimit(25000) sys.setrecursionlimit(25000)
def save_zipped_pickle(obj, filename, protocol=-1): def save_zipped_pickle(obj, filename, protocol=-1):
@ -39,31 +41,6 @@ def load_zipped_pickle(filename):
loaded_object = pickle.load(f) loaded_object = pickle.load(f)
return loaded_object return loaded_object
import pyconll
from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
# for separate searches of feats
# feats_detailed_list = [
# # lexical features
# 'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
#
# # Inflectional features (nominal)
# 'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
#
# # Inflectional features (verbal)
# 'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
#
# # Other
# 'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
# ]
# feats_detailed_list = []
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
from generic import get_collocabilities
def decode_query(orig_query, dependency_type, feats_detailed_list): def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = False new_query = False
@ -72,7 +49,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = True new_query = True
orig_query = orig_query[1:-1] orig_query = orig_query[1:-1]
# if orig_query is '_' return {}
if dependency_type != '': if dependency_type != '':
decoded_query = {'deprel': dependency_type} decoded_query = {'deprel': dependency_type}
else: else:
@ -88,19 +64,14 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
if len(orig_query_split) > 1: if len(orig_query_split) > 1:
if orig_query_split[0] == 'L': if orig_query_split[0] == 'L':
decoded_query['lemma'] = orig_query_split[1] decoded_query['lemma'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'upos': elif orig_query_split[0] == 'upos':
decoded_query['upos'] = orig_query_split[1] decoded_query['upos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'xpos': elif orig_query_split[0] == 'xpos':
decoded_query['xpos'] = orig_query_split[1] decoded_query['xpos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'form': elif orig_query_split[0] == 'form':
decoded_query['form'] = orig_query_split[1] decoded_query['form'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'feats': elif orig_query_split[0] == 'feats':
decoded_query['feats'] = orig_query_split[1] decoded_query['feats'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] in feats_detailed_list: elif orig_query_split[0] in feats_detailed_list:
decoded_query['feats_detailed'] = {} decoded_query['feats_detailed'] = {}
decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1] decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1]
@ -111,18 +82,11 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
print('???') print('???')
elif not new_query: elif not new_query:
decoded_query['form'] = orig_query_split_part decoded_query['form'] = orig_query_split_part
# return decoded_query
return decoded_query return decoded_query
# split over spaces if not inside braces # split over spaces if not inside braces
# PATTERN = re.compile(r'''((?:[^ ()]|\([^.]*\))+)''')
# all_orders = PATTERN.split(orig_query)
# PATTERN = re.compile(r"(?:[^ ()]|\([^.]*\))+")
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", orig_query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query) all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query)
# all_orders = orig_query.split()
node_actions = all_orders[::2] node_actions = all_orders[::2]
priority_actions = all_orders[1::2] priority_actions = all_orders[1::2]
priority_actions_beginnings = [a[0] for a in priority_actions] priority_actions_beginnings = [a[0] for a in priority_actions]
@ -148,8 +112,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True): def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
# internal_saves = filters['internal_saves']
# input_path = filters['input']
hash_object = hashlib.sha1(input_path.encode('utf-8')) hash_object = hashlib.sha1(input_path.encode('utf-8'))
hex_dig = hash_object.hexdigest() hex_dig = hash_object.hexdigest()
trees_read_outputfile = os.path.join(internal_saves, hex_dig) trees_read_outputfile = os.path.join(internal_saves, hex_dig)
@ -165,13 +127,8 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
for sentence in train: for sentence in train:
root = None root = None
root_id = None
token_nodes = [] token_nodes = []
for token in sentence: for token in sentence:
# token_feats = ''
# for k, v in token.feats.items():
# token_feats += k + next(iter(v)) + '|'
# token_feats = token_feats[:-1]
if not token.id.isdigit(): if not token.id.isdigit():
continue continue
@ -194,12 +151,6 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.set_parent(None) token.set_parent(None)
else: else:
parent_id = int(token.parent) - 1 parent_id = int(token.parent) - 1
# if token_id < parent_id:
# token_nodes[parent_id].add_l_child(token)
# elif token_id > parent_id:
# token_nodes[parent_id].add_r_child(token)
# else:
# raise Exception('Root element should not be here!')
if token_nodes[parent_id].children_split == -1 and token_id > parent_id: if token_nodes[parent_id].children_split == -1 and token_id > parent_id:
token_nodes[parent_id].children_split = len(token_nodes[parent_id].children) token_nodes[parent_id].children_split = len(token_nodes[parent_id].children)
token_nodes[parent_id].add_child(token) token_nodes[parent_id].add_child(token)
@ -210,35 +161,19 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.children_split = len(token.children) token.children_split = len(token.children)
if root == None: if root == None:
# print(input_path)
print('No root: ' + sentence.id) print('No root: ' + sentence.id)
continue continue
# raise Exception('No root element in sentence!')
all_trees.append(root) all_trees.append(root)
if save: if save:
save_zipped_pickle((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), trees_read_outputfile, protocol=2) save_zipped_pickle((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), trees_read_outputfile, protocol=2)
# with open(trees_read_outputfile, 'wb') as output:
#
# pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), output)
else: else:
print('Reading trees:') print('Reading trees:')
print('Completed') print('Completed')
all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict = load_zipped_pickle(trees_read_outputfile) all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict = load_zipped_pickle(trees_read_outputfile)
# with open(trees_read_outputfile, 'rb') as pkl_file:
# (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict) = pickle.load(pkl_file)
return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict
# def order_independent_queries(query_tree):
# all_children = query_tree['l_children'] + query_tree['r_children']
# if all_children > 0:
#
# else:
# return query_tree
# pass
def printable_answers(query): def printable_answers(query):
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query) # all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query) all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query)
@ -293,11 +228,6 @@ def tree_calculations_chunks(input_data):
return result_dict return result_dict
def chunkify(a, n):
    """Split sequence ``a`` into ``n`` consecutive slices of near-equal size.

    With ``k, m = divmod(len(a), n)``, the first ``m`` slices hold
    ``k + 1`` elements and the remaining ones hold ``k``. When ``n`` is
    larger than ``len(a)`` the trailing slices are empty.

    Args:
        a: Any object supporting ``len()`` and slicing.
        n: Number of slices to produce.

    Returns:
        A generator yielding the ``n`` slices of ``a`` in order.

    Raises:
        ZeroDivisionError: If ``n`` is 0 (raised by ``divmod`` at call
            time, before the generator is created).
    """
    k, m = divmod(len(a), n)
    # Slice i starts after i chunks of size k plus one extra element for
    # each of the first min(i, m) chunks that absorbed the remainder.
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
def add_node(tree): def add_node(tree):
if 'children' in tree: if 'children' in tree:
tree['children'].append({}) tree['children'].append({})
@ -362,30 +292,11 @@ def create_ngrams_query_trees(n, trees):
new_trees.append(new_tree) new_trees.append(new_tree)
trees = new_trees trees = new_trees
# delete_duplicates(trees)
# print('here')
# tree_grow(tree)
# tree_grow(tree)
# tree['children'] = [{}]
return trees return trees
def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict): def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict):
with Pool(cpu_cores) as p: with Pool(cpu_cores) as p:
# 1.25 s (16 cores)
# chunked_trees = list(chunkify(all_trees, cpu_cores))
# if cpu_cores > 1:
# part_results = p.map(tree_calculations_chunks,
# [(tree, query_tree, create_output_string_funct, filters) for tree in chunked_trees])
#
# for part_result in part_results:
# for r_k, r_v in part_result.items():
# if r_k in result_dict:
# result_dict[r_k] += r_v
# else:
# result_dict[r_k] = r_v
# 1.02 s (16 cores)
if cpu_cores > 1: if cpu_cores > 1:
# input_data = (tree, query_tree, create_output_string_functs, filters)
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees]) all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
for unigrams in all_unigrams: for unigrams in all_unigrams:
for unigram in unigrams: for unigram in unigrams:
@ -396,24 +307,14 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees]) all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
# for subtrees in all_subtrees:
for tree_i, subtrees in enumerate(all_subtrees): for tree_i, subtrees in enumerate(all_subtrees):
for query_results in subtrees: for query_results in subtrees:
for r in query_results: for r in query_results:
# if r.key == '(ne <advmod more >xcomp (se <expl izogniti) >punct .)':
# print('HERE')
# print(tree_i)
if filters['node_order']: if filters['node_order']:
key = r.get_key() + r.order key = r.get_key() + r.order
else: else:
key = r.get_key() key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
# if r in result_dict:
# result_dict[r] += 1
# else:
# result_dict[r] = 1
if key in result_dict: if key in result_dict:
result_dict[key]['number'] += 1 result_dict[key]['number'] += 1
else: else:
@ -421,11 +322,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
# 3.65 s (1 core) # 3.65 s (1 core)
else: else:
# for tree_i, tree in enumerate(all_trees[-5:]):
for tree_i, tree in enumerate(all_trees): for tree_i, tree in enumerate(all_trees):
# for tree_i, tree in enumerate(all_trees[852:]):
# for tree_i, tree in enumerate(all_trees[1689:]):
# for tree_i, tree in enumerate(all_trees[1:3]):
input_data = (tree, query_tree, create_output_string_functs, filters) input_data = (tree, query_tree, create_output_string_functs, filters)
if filters['association_measures']: if filters['association_measures']:
unigrams = get_unigrams(input_data) unigrams = get_unigrams(input_data)
@ -434,10 +331,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
unigrams_dict[unigram] += 1 unigrams_dict[unigram] += 1
else: else:
unigrams_dict[unigram] = 1 unigrams_dict[unigram] = 1
# for tree_i, tree in enumerate(all_trees[1:]):
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
# for tree_i, tree in enumerate(all_trees[5170:]):
# for tree in all_trees:
subtrees = tree_calculations(input_data) subtrees = tree_calculations(input_data)
for query_results in subtrees: for query_results in subtrees:
for r in query_results: for r in query_results:
@ -445,8 +339,6 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
key = r.get_key() + r.order key = r.get_key() + r.order
else: else:
key = r.get_key() key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
if key in result_dict: if key in result_dict:
result_dict[key]['number'] += 1 result_dict[key]['number'] += 1
else: else:
@ -465,7 +357,6 @@ def read_filters(config, feats_detailed_list):
query_tree.extend(create_ngrams_query_trees(i, [{}])) query_tree.extend(create_ngrams_query_trees(i, [{}]))
else: else:
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '', feats_detailed_list)] query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '', feats_detailed_list)]
# order_independent_queries(query_tree)
# set filters # set filters
node_types = config.get('settings', 'node_type').split('+') node_types = config.get('settings', 'node_type').split('+')
@ -506,11 +397,8 @@ def read_filters(config, feats_detailed_list):
attribute_dict = {} attribute_dict = {}
for attribute in option.split('&'): for attribute in option.split('&'):
value = attribute.split('=') value = attribute.split('=')
# assert value[0] in ['deprel', 'lemma', 'upos', 'xpos', 'form',
# 'feats'], '"root_whitelist" is not set up correctly'
attribute_dict[value[0]] = value[1] attribute_dict[value[0]] = value[1]
filters['root_whitelist'].append(attribute_dict) filters['root_whitelist'].append(attribute_dict)
# filters['root_whitelist'] = [{'upos': 'NOUN', 'Case': 'Nom'}, {'upos': 'ADJ', 'Degree': 'Sup'}]
else: else:
filters['root_whitelist'] = [] filters['root_whitelist'] = []
@ -540,12 +428,6 @@ def main():
internal_saves = config.get('settings', 'internal_saves') internal_saves = config.get('settings', 'internal_saves')
input_path = config.get('settings', 'input') input_path = config.get('settings', 'input')
# a = args.config_file
# config.read('config.ini')
# create queries
if os.path.isdir(input_path): if os.path.isdir(input_path):
checkpoint_path = Path(internal_saves, 'checkpoint.pkl') checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
@ -572,9 +454,6 @@ def main():
for path in sorted(pathlist): for path in sorted(pathlist):
# because path is object not string # because path is object not string
path_str = str(path) path_str = str(path)
# if Path(path_str).name == 'GF0003946-dedup.conllu':
# break
# print(path_in_str)
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size, (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False) feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
@ -593,7 +472,6 @@ def main():
# 15.26 # 15.26
print("Execution time:") print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time)) print("--- %s seconds ---" % (time.time() - start_exe_time))
# print(1 + 'asd')
save_zipped_pickle( save_zipped_pickle(
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list), (already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
checkpoint_path, protocol=2) checkpoint_path, protocol=2)
@ -620,26 +498,6 @@ def main():
print("Execution time:") print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time)) print("--- %s seconds ---" % (time.time() - start_exe_time))
# test 1 layer queries
# # tree.r_children = []
# # tree.children[1].children = []
# # query = [{'children': [{}]}, {'children': [{}]}]
# # query = [{"children": [{}, {}]}, {"children": [{}]}, {"children": [{}, {}, {}]}]
# query = [{"children": [{'form': 'je'}, {}]}, {"children": [{'form': 'je'}]}, {"children": [{'form': 'je'}, {}, {}]}]
# # query = [{'q1':'', "children": [{'a1':''}, {'a2':''}]}, {'q2':'', "children": [{'b1':''}]}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}]}]
# _, _, subtrees = tree.get_subtrees(query, [], create_output_string_funct)
# # _, subtrees = tree.get_subtrees([{'q1':'', "children": [{'a1':''}, {'a2':''}], "children": []}, {'q2':'', "children": [{'b1':''}], "children": []}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}], "children": []}], [])
# print('HERE!')
# test 2 layer queries
# tree.r_children = [Tree('je', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)]
# tree.l_children[1].l_children = []
# new_tree = Tree('bil', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)
# new_tree.l_children = [tree]
# _, subtrees = new_tree.get_subtrees(
# [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}]}]}], [])
# # _, subtrees = new_tree.get_subtrees(
# # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], [])
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True) sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
with open(config.get('settings', 'output'), "w", newline="") as f: with open(config.get('settings', 'output'), "w", newline="") as f:
@ -660,7 +518,6 @@ def main():
header += ['Root node'] header += ['Root node']
if filters['association_measures']: if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL'] header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
writer.writerow(header) writer.writerow(header)
if filters['lines_threshold']: if filters['lines_threshold']:
@ -673,7 +530,6 @@ def main():
if filters['frequency_threshold'] and filters['frequency_threshold'] > v['number']: if filters['frequency_threshold'] and filters['frequency_threshold'] > v['number']:
break break
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))] words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
# words_only = printable_answers(k)
row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])] row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])]
row += ['%.4f' % relative_frequency] row += ['%.4f' % relative_frequency]
if filters['node_order']: if filters['node_order']:

View File

@ -45,7 +45,6 @@ def generate_key(node, create_output_strings, print_lemma=True):
if len(array[0]) > 1: if len(array[0]) > 1:
key = '&'.join(key_array[0]) key = '&'.join(key_array[0])
else: else:
# output_string = create_output_strings[0](node)
key = key_array[0][0] key = key_array[0][0]
return array, key return array, key
@ -61,7 +60,6 @@ def generate_name(node, create_output_strings, print_lemma=True):
if len(array) > 1: if len(array) > 1:
name = '&'.join(name_array) name = '&'.join(name_array)
else: else:
# output_string = create_output_strings[0](node)
name = name_array[0] name = name_array[0]
return array, name return array, name
@ -74,7 +72,6 @@ def get_collocabilities(ngram, unigrams_dict, corpus_size):
if len(key_array) > 1: if len(key_array) > 1:
key = '&'.join(key_array) key = '&'.join(key_array)
else: else:
# output_string = create_output_strings[0](node)
key = key_array[0] key = key_array[0]
sum_fwi += unigrams_dict[key] sum_fwi += unigrams_dict[key]
mul_fwi *= unigrams_dict[key] mul_fwi *= unigrams_dict[key]