Added all but 2 key output
This commit is contained in:
parent
7c5aba1ca9
commit
eeab026313
24
Result.py
24
Result.py
|
@ -1,18 +1,28 @@
|
||||||
import copy
|
import copy
|
||||||
import string
|
import string
|
||||||
|
|
||||||
|
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
|
||||||
|
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
|
||||||
|
|
||||||
|
|
||||||
class Result(object):
|
class Result(object):
|
||||||
def __init__(self, node, architecture_order, create_output_strings):
|
def __init__(self, node, architecture_order, create_output_strings):
|
||||||
self.array = [[create_output_string(node) for create_output_string in create_output_strings]]
|
# self.array = [[create_output_string(node) for create_output_string in create_output_strings]]
|
||||||
if len(self.array[0]) > 1:
|
# if create_output_string_lemma in create_output_strings:
|
||||||
self.key = '{' + ','.join(self.array[0]) + '}'
|
# key_array = [[create_output_string(node) if create_output_string != create_output_string_lemma else 'L=' + create_output_string(node) for create_output_string in create_output_strings]]
|
||||||
else:
|
# else:
|
||||||
# output_string = create_output_strings[0](node)
|
# key_array = self.array
|
||||||
self.key = self.array[0][0]
|
# if len(self.array[0]) > 1:
|
||||||
|
# self.key = '&'.join(key_array[0])
|
||||||
|
# else:
|
||||||
|
# # output_string = create_output_strings[0](node)
|
||||||
|
# self.key = key_array[0][0]
|
||||||
|
|
||||||
|
self.array, self.key = generate_key(node, create_output_strings)
|
||||||
# self.array = [[output_string]]
|
# self.array = [[output_string]]
|
||||||
self.order_key = str([architecture_order])
|
self.order_key = str([architecture_order])
|
||||||
self.order = [architecture_order]
|
self.order = [architecture_order]
|
||||||
|
self.deprel = node.deprel.get_value()
|
||||||
# order with original numbers in sentences
|
# order with original numbers in sentences
|
||||||
# self.order = str([architecture_order])
|
# self.order = str([architecture_order])
|
||||||
# order with numbers from 0 to n of n-gram
|
# order with numbers from 0 to n of n-gram
|
||||||
|
@ -123,7 +133,7 @@ class Result(object):
|
||||||
|
|
||||||
def set_root(self):
|
def set_root(self):
|
||||||
if len(self.array[0]) > 1:
|
if len(self.array[0]) > 1:
|
||||||
self.root = '{' + ','.join(self.array[0]) + '}'
|
self.root = '&'.join(self.array[0])
|
||||||
else:
|
else:
|
||||||
# output_string = create_output_strings[0](node)
|
# output_string = create_output_strings[0](node)
|
||||||
self.root = self.array[0][0]
|
self.root = self.array[0][0]
|
96
Tree.py
96
Tree.py
|
@ -5,6 +5,8 @@ from pyconll.unit import Token
|
||||||
|
|
||||||
from Result import Result
|
from Result import Result
|
||||||
from Value import Value
|
from Value import Value
|
||||||
|
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
|
||||||
|
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
|
||||||
|
|
||||||
|
|
||||||
class Tree(object):
|
class Tree(object):
|
||||||
|
@ -206,7 +208,6 @@ class Tree(object):
|
||||||
# create_output_string)
|
# create_output_string)
|
||||||
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
|
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
|
||||||
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
|
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
|
||||||
partial_answers_deprel = [[] for i in range(permanent_query_nb + temporary_query_nb)]
|
|
||||||
complete_answers = [[] for i in range(permanent_query_nb)]
|
complete_answers = [[] for i in range(permanent_query_nb)]
|
||||||
|
|
||||||
# list of pairs (index of query in group, group of query)
|
# list of pairs (index of query in group, group of query)
|
||||||
|
@ -220,7 +221,6 @@ class Tree(object):
|
||||||
child_queries_flatten = [query_part for query in child_queries for query_part in query]
|
child_queries_flatten = [query_part for query in child_queries for query_part in query]
|
||||||
|
|
||||||
all_new_partial_answers = [[] for query_part in child_queries_flatten]
|
all_new_partial_answers = [[] for query_part in child_queries_flatten]
|
||||||
all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten]
|
|
||||||
|
|
||||||
# if filters['caching']:
|
# if filters['caching']:
|
||||||
# erase duplicate queries
|
# erase duplicate queries
|
||||||
|
@ -247,7 +247,6 @@ class Tree(object):
|
||||||
# duplicate results again on correct places
|
# duplicate results again on correct places
|
||||||
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
|
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
|
||||||
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
|
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
|
||||||
all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
|
|
||||||
|
|
||||||
# else:
|
# else:
|
||||||
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
|
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
|
||||||
|
@ -281,7 +280,7 @@ class Tree(object):
|
||||||
for answer_i, answer_length in enumerate(answers_lengths):
|
for answer_i, answer_length in enumerate(answers_lengths):
|
||||||
# iterate over answers of query
|
# iterate over answers of query
|
||||||
# TODO ERROR IN HERE!
|
# TODO ERROR IN HERE!
|
||||||
partial_answers[answer_i], partial_answers_index[answer_i], partial_answers_deprel[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], all_new_partial_answers_deprel[i:i + answer_length], answer_length, filters)
|
partial_answers[answer_i], partial_answers_index[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
|
||||||
# while i < answers_length:
|
# while i < answers_length:
|
||||||
# self.create_grouped_answers()
|
# self.create_grouped_answers()
|
||||||
# i += 1
|
# i += 1
|
||||||
|
@ -308,9 +307,9 @@ class Tree(object):
|
||||||
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
|
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
|
||||||
# child_index += 1
|
# child_index += 1
|
||||||
|
|
||||||
return partial_answers, partial_answers_index, partial_answers_deprel, complete_answers
|
return partial_answers, partial_answers_index, complete_answers
|
||||||
|
|
||||||
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel,
|
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
|
||||||
create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
|
create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
|
||||||
# string_output = ''
|
# string_output = ''
|
||||||
# if create_output_string_form(self) == 'vožnji':
|
# if create_output_string_form(self) == 'vožnji':
|
||||||
|
@ -324,7 +323,7 @@ class Tree(object):
|
||||||
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
||||||
|
|
||||||
merged_partial_subtrees.append(
|
merged_partial_subtrees.append(
|
||||||
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
|
||||||
|
|
||||||
i_answer += 1
|
i_answer += 1
|
||||||
else:
|
else:
|
||||||
|
@ -341,7 +340,7 @@ class Tree(object):
|
||||||
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
||||||
|
|
||||||
merged_partial_subtrees.append(
|
merged_partial_subtrees.append(
|
||||||
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
|
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
|
||||||
|
|
||||||
i_answer += 1
|
i_answer += 1
|
||||||
else:
|
else:
|
||||||
|
@ -351,6 +350,12 @@ class Tree(object):
|
||||||
|
|
||||||
return i_answer
|
return i_answer
|
||||||
|
|
||||||
|
def get_unigrams(self, create_output_strings, filters):
    """Collect the unigram key of this node and of every node below it.

    Args:
        create_output_strings: Callables passed on to ``generate_key`` to
            build the key of each node.
        filters: Not used here; only forwarded to the recursive calls.

    Returns:
        A flat list of keys: this node's key first, followed by the keys
        produced by each child in ``self.children`` order.
    """
    # generate_key returns (array, key); only the key is needed. The lemma
    # prefix is switched off so the key is the plain '&'-joined values.
    unigrams = [generate_key(self, create_output_strings, print_lemma=False)[1]]
    for child in self.children:
        unigrams.extend(child.get_unigrams(create_output_strings, filters))
    return unigrams
|
||||||
|
|
||||||
def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters):
|
def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -382,7 +387,7 @@ class Tree(object):
|
||||||
if 'children' in temporary_query_tree:
|
if 'children' in temporary_query_tree:
|
||||||
all_query_indices.append((temporary_query_tree['children'], False))
|
all_query_indices.append((temporary_query_tree['children'], False))
|
||||||
|
|
||||||
partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
|
partial_subtrees, partial_subtrees_index, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
|
||||||
len(permanent_query_trees),
|
len(permanent_query_trees),
|
||||||
permanent_query_trees,
|
permanent_query_trees,
|
||||||
all_query_indices, self.children,
|
all_query_indices, self.children,
|
||||||
|
@ -397,7 +402,7 @@ class Tree(object):
|
||||||
# go over all permanent and temporary query trees
|
# go over all permanent and temporary query trees
|
||||||
while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees):
|
while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees):
|
||||||
# permanent query trees always have left and right child
|
# permanent query trees always have left and right child
|
||||||
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel,
|
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
|
||||||
create_output_string, merged_partial_answers, i_question, i_answer, filters)
|
create_output_string, merged_partial_answers, i_question, i_answer, filters)
|
||||||
|
|
||||||
i_question += 1
|
i_question += 1
|
||||||
|
@ -466,21 +471,22 @@ class Tree(object):
|
||||||
merged_indices.append(new_indices)
|
merged_indices.append(new_indices)
|
||||||
return merged_results, merged_indices
|
return merged_results, merged_indices
|
||||||
|
|
||||||
def merge_results2(self, new_child, new_results, i_child, indices, deprel, filters):
|
def merge_results2(self, new_child, new_results, i_child, indices, filters):
|
||||||
l_res = []
|
l_res = []
|
||||||
r_res = []
|
r_res = []
|
||||||
results = []
|
results = []
|
||||||
for i_answer, answer in enumerate(new_child):
|
for i_answer, answer in enumerate(new_child):
|
||||||
if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
|
if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
|
||||||
if filters['dependency_type']:
|
if filters['dependency_type']:
|
||||||
separator = ' <' + deprel[i_child][i_answer] + ' '
|
# separator = ' <' + deprel[i_child][i_answer] + ' '
|
||||||
|
separator = ' <' + answer[0].deprel + ' '
|
||||||
else:
|
else:
|
||||||
separator = ' < '
|
separator = ' < '
|
||||||
l_res = self.merge_results(l_res, answer, separator, left=True)
|
l_res = self.merge_results(l_res, answer, separator, left=True)
|
||||||
# l_res += answer + separator
|
# l_res += answer + separator
|
||||||
else:
|
else:
|
||||||
if filters['dependency_type']:
|
if filters['dependency_type']:
|
||||||
separator = ' >' + deprel[i_child][i_answer] + ' '
|
separator = ' >' + answer[0].deprel + ' '
|
||||||
else:
|
else:
|
||||||
separator = ' > '
|
separator = ' > '
|
||||||
r_res = self.merge_results(r_res, answer, separator, left=False)
|
r_res = self.merge_results(r_res, answer, separator, left=False)
|
||||||
|
@ -572,7 +578,7 @@ class Tree(object):
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def create_output_children(self, children, new_results, filters, indices, deprel):
|
def create_output_children(self, children, new_results, filters, indices):
|
||||||
# if create_output_string_form(self) == 'Dogodek':
|
# if create_output_string_form(self) == 'Dogodek':
|
||||||
# print('HERE!@@!')
|
# print('HERE!@@!')
|
||||||
# if create_output_string_form(self) == 'utišal':
|
# if create_output_string_form(self) == 'utišal':
|
||||||
|
@ -586,7 +592,7 @@ class Tree(object):
|
||||||
else:
|
else:
|
||||||
new_child = sorted(child, key=lambda x: x[0].key)
|
new_child = sorted(child, key=lambda x: x[0].key)
|
||||||
#################
|
#################
|
||||||
merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, deprel, filters))
|
merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, filters))
|
||||||
return merged_results
|
return merged_results
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -631,17 +637,11 @@ class Tree(object):
|
||||||
return merged_results
|
return merged_results
|
||||||
|
|
||||||
# @staticmethod
|
# @staticmethod
|
||||||
def create_answers(self, separated_answers, separated_answers_deprel, answer_length, filters):
|
def create_answers(self, separated_answers, answer_length, filters):
|
||||||
# TODO
|
|
||||||
# node_order = False
|
|
||||||
partly_built_trees = [[None] * answer_length]
|
partly_built_trees = [[None] * answer_length]
|
||||||
# partly_built_trees_architecture = [[None] * answer_length]
|
|
||||||
partly_built_trees_architecture_indices = [[None] * answer_length]
|
partly_built_trees_architecture_indices = [[None] * answer_length]
|
||||||
partly_built_trees_deprel = [[None] * answer_length]
|
|
||||||
built_trees = []
|
built_trees = []
|
||||||
# built_trees_architecture = []
|
|
||||||
built_trees_architecture_indices = []
|
built_trees_architecture_indices = []
|
||||||
built_trees_deprel = []
|
|
||||||
|
|
||||||
# if create_output_string_form(self) == 'Dogodek':
|
# if create_output_string_form(self) == 'Dogodek':
|
||||||
# print('HERE!@@!')
|
# print('HERE!@@!')
|
||||||
|
@ -650,64 +650,44 @@ class Tree(object):
|
||||||
# child are added
|
# child are added
|
||||||
for child_i in range(len(separated_answers[0])):
|
for child_i in range(len(separated_answers[0])):
|
||||||
new_partly_built_trees = []
|
new_partly_built_trees = []
|
||||||
# new_partly_built_trees_architecture = []
|
|
||||||
new_partly_built_trees_architecture_indices = []
|
new_partly_built_trees_architecture_indices = []
|
||||||
new_partly_built_trees_deprel = []
|
|
||||||
# iterate over answers parts
|
# iterate over answers parts
|
||||||
for answer_part_i in range(len(separated_answers)):
|
for answer_part_i in range(len(separated_answers)):
|
||||||
# necessary because some parts do not pass filters and are not added
|
# necessary because some parts do not pass filters and are not added
|
||||||
# if child_i < len(separated_answers[answer_part_i]) and separated_answers[answer_part_i][child_i]:
|
|
||||||
if separated_answers[answer_part_i][child_i]:
|
if separated_answers[answer_part_i][child_i]:
|
||||||
for tree_part_i, tree_part in enumerate(partly_built_trees):
|
for tree_part_i, tree_part in enumerate(partly_built_trees):
|
||||||
# if tree_part[answer_part_i] equals None add new element in its place
|
|
||||||
if not tree_part[answer_part_i]:
|
if not tree_part[answer_part_i]:
|
||||||
new_tree_part = copy(tree_part)
|
new_tree_part = copy(tree_part)
|
||||||
# new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i])
|
|
||||||
new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i])
|
new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i])
|
||||||
new_tree_part_deprel = copy(partly_built_trees_deprel[tree_part_i])
|
|
||||||
new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i]
|
new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i]
|
||||||
# new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i]
|
|
||||||
new_tree_part_architecture_indices[answer_part_i] = child_i
|
new_tree_part_architecture_indices[answer_part_i] = child_i
|
||||||
new_tree_part_deprel[answer_part_i] = separated_answers_deprel[answer_part_i][child_i]
|
|
||||||
completed_tree_part = True
|
completed_tree_part = True
|
||||||
for val_i, val in enumerate(new_tree_part):
|
for val_i, val in enumerate(new_tree_part):
|
||||||
if not val:
|
if not val:
|
||||||
completed_tree_part = False
|
completed_tree_part = False
|
||||||
if completed_tree_part:
|
if completed_tree_part:
|
||||||
built_trees.append(new_tree_part)
|
built_trees.append(new_tree_part)
|
||||||
# built_trees_architecture.append(new_tree_part_architecture)
|
|
||||||
built_trees_architecture_indices.append(new_tree_part_architecture_indices)
|
built_trees_architecture_indices.append(new_tree_part_architecture_indices)
|
||||||
built_trees_deprel.append(new_tree_part_deprel)
|
|
||||||
else:
|
else:
|
||||||
new_partly_built_trees.append(new_tree_part)
|
new_partly_built_trees.append(new_tree_part)
|
||||||
# new_partly_built_trees_architecture.append(new_tree_part_architecture)
|
|
||||||
new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices)
|
new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices)
|
||||||
new_partly_built_trees_deprel.append(new_tree_part_deprel)
|
|
||||||
else:
|
else:
|
||||||
# pass over repetitions of same words
|
# pass over repetitions of same words
|
||||||
pass
|
pass
|
||||||
# print('HERE!!!')
|
|
||||||
|
|
||||||
partly_built_trees.extend(new_partly_built_trees)
|
partly_built_trees.extend(new_partly_built_trees)
|
||||||
# partly_built_trees_architecture.extend(new_partly_built_trees_architecture)
|
|
||||||
partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices)
|
partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices)
|
||||||
partly_built_trees_deprel.extend(new_partly_built_trees_deprel)
|
|
||||||
|
|
||||||
l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel, unique_trees_architecture = [], [], [], []
|
l_ordered_built_trees, l_ordered_built_trees_index, unique_trees_architecture = [], [], []
|
||||||
|
|
||||||
if built_trees:
|
if built_trees:
|
||||||
# sort 3 arrays by architecture indices
|
# sort 3 arrays by architecture indices
|
||||||
# temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip(
|
temp_trees_index, temp_trees = (list(t) for t in zip(
|
||||||
# *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel))))
|
*sorted(zip(built_trees_architecture_indices, built_trees))))
|
||||||
temp_trees_index, temp_trees, temp_trees_deprel = (list(t) for t in zip(
|
|
||||||
*sorted(zip(built_trees_architecture_indices, built_trees, built_trees_deprel))))
|
|
||||||
|
|
||||||
# order outputs and erase duplicates
|
# order outputs and erase duplicates
|
||||||
# for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices):
|
for tree, tree_index in zip(temp_trees, temp_trees_index):
|
||||||
# for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel):
|
new_tree_index, new_tree = (list(t) for t in zip(*sorted(zip(tree_index, tree))))
|
||||||
for tree, tree_index, tree_deprel in zip(temp_trees, temp_trees_index, temp_trees_deprel):
|
|
||||||
# new_tree_index, new_tree, new_tree_architecture, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel))))
|
|
||||||
new_tree_index, new_tree, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_deprel))))
|
|
||||||
# TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append!
|
# TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append!
|
||||||
is_unique = True
|
is_unique = True
|
||||||
for unique_tree in unique_trees_architecture:
|
for unique_tree in unique_trees_architecture:
|
||||||
|
@ -728,7 +708,6 @@ class Tree(object):
|
||||||
# l_ordered_built_trees_architecture.append(new_tree_architecture)
|
# l_ordered_built_trees_architecture.append(new_tree_architecture)
|
||||||
l_ordered_built_trees.append(new_tree)
|
l_ordered_built_trees.append(new_tree)
|
||||||
l_ordered_built_trees_index.append(new_tree_index)
|
l_ordered_built_trees_index.append(new_tree_index)
|
||||||
l_ordered_built_trees_deprel.append(new_tree_deprel)
|
|
||||||
# TODO NODE ORDER = FALSE
|
# TODO NODE ORDER = FALSE
|
||||||
# else:
|
# else:
|
||||||
#
|
#
|
||||||
|
@ -742,23 +721,4 @@ class Tree(object):
|
||||||
# print('aaa')
|
# print('aaa')
|
||||||
#
|
#
|
||||||
# pass
|
# pass
|
||||||
return l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel
|
return l_ordered_built_trees, l_ordered_built_trees_index
|
||||||
|
|
||||||
|
|
||||||
def create_output_string_form(tree):
|
|
||||||
return tree.form.get_value()
|
|
||||||
|
|
||||||
def create_output_string_deprel(tree):
|
|
||||||
return tree.deprel.get_value()
|
|
||||||
|
|
||||||
def create_output_string_lemma(tree):
|
|
||||||
return tree.lemma.get_value()
|
|
||||||
|
|
||||||
def create_output_string_upos(tree):
|
|
||||||
return tree.upos.get_value()
|
|
||||||
|
|
||||||
def create_output_string_xpos(tree):
|
|
||||||
return tree.xpos.get_value()
|
|
||||||
|
|
||||||
def create_output_string_feats(tree):
|
|
||||||
return tree.feats.get_value()
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ import hashlib
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
|
import string
|
||||||
import time
|
import time
|
||||||
import timeit
|
import timeit
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
|
@ -32,6 +33,7 @@ from Tree import Tree, create_output_string_form, create_output_string_deprel, c
|
||||||
# feats_detailed_list = []
|
# feats_detailed_list = []
|
||||||
|
|
||||||
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
|
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
|
||||||
|
from generic import get_collocabilities
|
||||||
|
|
||||||
|
|
||||||
def decode_query(orig_query, dependency_type, feats_detailed_list):
|
def decode_query(orig_query, dependency_type, feats_detailed_list):
|
||||||
|
@ -232,6 +234,11 @@ def tree_calculations(input_data):
|
||||||
_, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
|
_, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
|
||||||
return subtrees
|
return subtrees
|
||||||
|
|
||||||
|
def get_unigrams(input_data):
    """Return the unigram keys of one tree; takes a single tuple so it can be used with ``Pool.map``.

    Args:
        input_data: 4-tuple ``(tree, query_tree, create_output_string_funct,
            filters)``. ``query_tree`` is accepted for symmetry with
            ``tree_calculations`` but is not used.

    Returns:
        Whatever ``tree.get_unigrams(create_output_string_funct, filters)``
        returns.
    """
    tree, _query_tree, create_output_string_funct, filters = input_data
    return tree.get_unigrams(create_output_string_funct, filters)
|
||||||
|
|
||||||
|
|
||||||
def tree_calculations_chunks(input_data):
|
def tree_calculations_chunks(input_data):
|
||||||
trees, query_tree, create_output_string_funct, filters = input_data
|
trees, query_tree, create_output_string_funct, filters = input_data
|
||||||
|
@ -404,6 +411,7 @@ def main():
|
||||||
create_output_string_functs.append(create_output_string_funct)
|
create_output_string_functs.append(create_output_string_funct)
|
||||||
|
|
||||||
result_dict = {}
|
result_dict = {}
|
||||||
|
unigrams_dict = {}
|
||||||
filters = {}
|
filters = {}
|
||||||
filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
|
filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
|
||||||
# filters['caching'] = config.getboolean('settings', 'caching')
|
# filters['caching'] = config.getboolean('settings', 'caching')
|
||||||
|
@ -430,6 +438,11 @@ def main():
|
||||||
filters['root_whitelist'] = []
|
filters['root_whitelist'] = []
|
||||||
|
|
||||||
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
|
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
|
||||||
|
filters['association_measures'] = config.getboolean('settings', 'association_measures')
|
||||||
|
filters['nodes_number'] = config.getboolean('settings', 'nodes_number')
|
||||||
|
filters['frequency_threshold'] = config.getfloat('settings', 'frequency_threshold')
|
||||||
|
filters['lines_threshold'] = config.getint('settings', 'lines_threshold')
|
||||||
|
filters['print_root'] = config.getboolean('settings', 'print_root')
|
||||||
|
|
||||||
|
|
||||||
# for tree in all_trees[2:]:
|
# for tree in all_trees[2:]:
|
||||||
|
@ -448,9 +461,17 @@ def main():
|
||||||
# result_dict[r_k] += r_v
|
# result_dict[r_k] += r_v
|
||||||
# else:
|
# else:
|
||||||
# result_dict[r_k] = r_v
|
# result_dict[r_k] = r_v
|
||||||
|
|
||||||
# 1.02 s (16 cores)
|
# 1.02 s (16 cores)
|
||||||
if cpu_cores > 1:
|
if cpu_cores > 1:
|
||||||
|
# input_data = (tree, query_tree, create_output_string_functs, filters)
|
||||||
|
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
|
||||||
|
for unigrams in all_unigrams:
|
||||||
|
for unigram in unigrams:
|
||||||
|
if unigram in unigrams_dict:
|
||||||
|
unigrams_dict[unigram] += 1
|
||||||
|
else:
|
||||||
|
unigrams_dict[unigram] = 1
|
||||||
|
|
||||||
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
|
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
|
||||||
|
|
||||||
# for subtrees in all_subtrees:
|
# for subtrees in all_subtrees:
|
||||||
|
@ -477,10 +498,19 @@ def main():
|
||||||
# for tree_i, tree in enumerate(all_trees[-5:]):
|
# for tree_i, tree in enumerate(all_trees[-5:]):
|
||||||
# for tree_i, tree in enumerate(all_trees):
|
# for tree_i, tree in enumerate(all_trees):
|
||||||
for tree_i, tree in enumerate(all_trees[1:]):
|
for tree_i, tree in enumerate(all_trees[1:]):
|
||||||
|
input_data = (tree, query_tree, create_output_string_functs, filters)
|
||||||
|
if filters['association_measures']:
|
||||||
|
unigrams = get_unigrams(input_data)
|
||||||
|
for unigram in unigrams:
|
||||||
|
if unigram in unigrams_dict:
|
||||||
|
unigrams_dict[unigram] += 1
|
||||||
|
else:
|
||||||
|
unigrams_dict[unigram] = 1
|
||||||
|
# for tree_i, tree in enumerate(all_trees[1:]):
|
||||||
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
|
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
|
||||||
# for tree_i, tree in enumerate(all_trees[5170:]):
|
# for tree_i, tree in enumerate(all_trees[5170:]):
|
||||||
# for tree in all_trees:
|
# for tree in all_trees:
|
||||||
subtrees = tree_calculations((tree, query_tree, create_output_string_functs, filters))
|
subtrees = tree_calculations(input_data)
|
||||||
for query_results in subtrees:
|
for query_results in subtrees:
|
||||||
for r in query_results:
|
for r in query_results:
|
||||||
if filters['node_order']:
|
if filters['node_order']:
|
||||||
|
@ -525,33 +555,39 @@ def main():
|
||||||
len_words = tree_size_range[-1]
|
len_words = tree_size_range[-1]
|
||||||
else:
|
else:
|
||||||
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
|
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
|
||||||
header = ["Structure"] + ["Node #" + str(i) + "-" + node_type for i in range(1, len_words + 1) for node_type in node_types] + ['Absolute frequency']
|
header = ["Structure"] + ["Node " + string.ascii_uppercase[i] + "-" + node_type for i in range(len_words) for node_type in node_types] + ['Absolute frequency']
|
||||||
header += ['Relative frequency']
|
header += ['Relative frequency']
|
||||||
if filters['node_order']:
|
if filters['node_order']:
|
||||||
header += ['Order']
|
header += ['Order']
|
||||||
if config.getboolean('settings', 'nodes_number'):
|
if filters['nodes_number']:
|
||||||
header += ['Number of nodes']
|
header += ['Number of nodes']
|
||||||
if config.getboolean('settings', 'print_root'):
|
if filters['print_root']:
|
||||||
header += ['Root node']
|
header += ['Root node']
|
||||||
|
if filters['association_measures']:
|
||||||
|
header += ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
|
||||||
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
|
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
|
||||||
writer.writerow(header)
|
writer.writerow(header)
|
||||||
|
|
||||||
if config.getint('settings', 'lines_threshold'):
|
if filters['lines_threshold']:
|
||||||
sorted_list = sorted_list[:config.getint('settings', 'lines_threshold')]
|
sorted_list = sorted_list[:filters['lines_threshold']]
|
||||||
|
|
||||||
# body
|
# body
|
||||||
for k, v in sorted_list:
|
for k, v in sorted_list:
|
||||||
|
absolute_frequency = v['number'] * 1000000.0 / corpus_size
|
||||||
|
if filters['frequency_threshold'] and filters['frequency_threshold'] > absolute_frequency:
|
||||||
|
break
|
||||||
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
|
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
|
||||||
# words_only = printable_answers(k)
|
# words_only = printable_answers(k)
|
||||||
row = [v['object'].key] + words_only + [str(v['number'])]
|
row = [v['object'].key] + words_only + [str(v['number'])]
|
||||||
row += ['%.4f' % (v['number'] * 1000000.0 / corpus_size)]
|
row += ['%.4f' % absolute_frequency]
|
||||||
if filters['node_order']:
|
if filters['node_order']:
|
||||||
row += [v['object'].order]
|
row += [v['object'].order]
|
||||||
if config.get('settings', 'nodes_number'):
|
if filters['nodes_number']:
|
||||||
row += ['%d' % len(v['object'].array)]
|
row += ['%d' % len(v['object'].array)]
|
||||||
if config.get('settings', 'print_root'):
|
if filters['print_root']:
|
||||||
row += [v['object'].root]
|
row += [v['object'].root]
|
||||||
|
if filters['association_measures']:
|
||||||
|
row += get_collocabilities(v, unigrams_dict, corpus_size)
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
return "Done"
|
return "Done"
|
||||||
|
|
71
generic.py
Normal file
71
generic.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def create_output_string_form(tree):
    """Return the value stored in the node's ``form`` attribute."""
    return tree.form.get_value()
|
||||||
|
|
||||||
|
def create_output_string_deprel(tree):
    """Return the value stored in the node's ``deprel`` attribute."""
    return tree.deprel.get_value()
|
||||||
|
|
||||||
|
def create_output_string_lemma(tree):
    """Return the value stored in the node's ``lemma`` attribute."""
    return tree.lemma.get_value()
|
||||||
|
|
||||||
|
def create_output_string_upos(tree):
    """Return the value stored in the node's ``upos`` attribute."""
    return tree.upos.get_value()
|
||||||
|
|
||||||
|
def create_output_string_xpos(tree):
    """Return the value stored in the node's ``xpos`` attribute."""
    return tree.xpos.get_value()
|
||||||
|
|
||||||
|
def create_output_string_feats(tree):
    """Return the value stored in the node's ``feats`` attribute."""
    return tree.feats.get_value()
|
||||||
|
|
||||||
|
def generate_key(node, create_output_strings, print_lemma=True):
    """Build the output-string row and the lookup key for a single node.

    Args:
        node: Object accepted by every callable in ``create_output_strings``.
        create_output_strings: Sequence of callables, each mapping ``node``
            to a string.
        print_lemma: When true and ``create_output_string_lemma`` is among
            the callables, the lemma value is prefixed with ``'L='`` in the
            key. The returned array is never prefixed.

    Returns:
        A tuple ``(array, key)``. ``array`` is a one-row list of lists
        holding the raw output strings. ``key`` is those strings (with the
        optional lemma prefix) joined by ``'&'``, or the single string
        itself when only one callable was given.

    Raises:
        IndexError: If ``create_output_strings`` is empty.
    """
    # Evaluate every callable exactly once and reuse the values for the key.
    values = [create_output_string(node) for create_output_string in create_output_strings]
    array = [values]

    if print_lemma and create_output_string_lemma in create_output_strings:
        key_parts = [
            'L=' + value if create_output_string == create_output_string_lemma else value
            for create_output_string, value in zip(create_output_strings, values)
        ]
    else:
        key_parts = values

    if len(key_parts) > 1:
        key = '&'.join(key_parts)
    else:
        # A single value is used as-is (not passed through str.join).
        key = key_parts[0]

    return array, key
|
||||||
|
|
||||||
|
def get_collocabilities(ngram, unigrams_dict, corpus_size):
    """Compute association measures for one n-gram result.

    Args:
        ngram: Mapping with ``'object'`` (an object whose ``array`` attribute
            is a list of per-node string lists) and ``'number'`` (the
            observed frequency of the n-gram).
        unigrams_dict: Mapping from unigram key to its frequency. The key of
            a node is its strings joined by ``'&'``, or the single string
            when the node has only one.
        corpus_size: Total number of words, used as ``N``.

    Returns:
        Five strings formatted with ``'%.4f'``, in the order
        ``[MI, MI3, Dice, t-score, simple-LL]``.

    Raises:
        KeyError: If a node key is missing from ``unigrams_dict``.
    """
    nodes = ngram['object'].array

    sum_fwi = 0.0
    mul_fwi = 1.0
    for key_array in nodes:
        # Rebuild the unigram key of this node.
        key = '&'.join(key_array) if len(key_array) > 1 else key_array[0]
        frequency = unigrams_dict[key]
        sum_fwi += frequency
        mul_fwi *= frequency
    # No overflow guard is needed: mul_fwi is a float product of counts and
    # cannot wrap around to a negative value in Python.

    # number of all words
    N = corpus_size
    # n of ngram
    n = len(nodes)
    # observed frequency
    O = ngram['number']
    # expected frequency
    E = mul_fwi / pow(N, n - 1)

    mi = math.log2(O / E)
    mi3 = math.log2(pow(O, 3) / E)
    dice = n * O / sum_fwi
    tscore = (O - E) / math.sqrt(O)
    # NOTE(review): simple-LL is usually defined with the natural logarithm;
    # base 10 is kept here to preserve existing output. Confirm intent.
    simplell = 2 * (O * math.log10(O / E) - (O - E))

    return ['%.4f' % mi, '%.4f' % mi3, '%.4f' % dice, '%.4f' % tscore, '%.4f' % simplell]
|
Loading…
Reference in New Issue
Block a user