Added all but 2 key output

This commit is contained in:
Luka 2019-12-14 09:36:29 +01:00
parent 7c5aba1ca9
commit eeab026313
4 changed files with 163 additions and 86 deletions

View File

@ -1,18 +1,28 @@
import copy
import string
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class Result(object):
def __init__(self, node, architecture_order, create_output_strings):
self.array = [[create_output_string(node) for create_output_string in create_output_strings]]
if len(self.array[0]) > 1:
self.key = '{' + ','.join(self.array[0]) + '}'
else:
# output_string = create_output_strings[0](node)
self.key = self.array[0][0]
# self.array = [[create_output_string(node) for create_output_string in create_output_strings]]
# if create_output_string_lemma in create_output_strings:
# key_array = [[create_output_string(node) if create_output_string != create_output_string_lemma else 'L=' + create_output_string(node) for create_output_string in create_output_strings]]
# else:
# key_array = self.array
# if len(self.array[0]) > 1:
# self.key = '&'.join(key_array[0])
# else:
# # output_string = create_output_strings[0](node)
# self.key = key_array[0][0]
self.array, self.key = generate_key(node, create_output_strings)
# self.array = [[output_string]]
self.order_key = str([architecture_order])
self.order = [architecture_order]
self.deprel = node.deprel.get_value()
# order with original numbers in sentences
# self.order = str([architecture_order])
# order with numbers from 0 to n of n-gram
@ -123,7 +133,7 @@ class Result(object):
def set_root(self):
if len(self.array[0]) > 1:
self.root = '{' + ','.join(self.array[0]) + '}'
self.root = '&'.join(self.array[0])
else:
# output_string = create_output_strings[0](node)
self.root = self.array[0][0]

96
Tree.py
View File

@ -5,6 +5,8 @@ from pyconll.unit import Token
from Result import Result
from Value import Value
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class Tree(object):
@ -206,7 +208,6 @@ class Tree(object):
# create_output_string)
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_deprel = [[] for i in range(permanent_query_nb + temporary_query_nb)]
complete_answers = [[] for i in range(permanent_query_nb)]
# list of pairs (index of query in group, group of query)
@ -220,7 +221,6 @@ class Tree(object):
child_queries_flatten = [query_part for query in child_queries for query_part in query]
all_new_partial_answers = [[] for query_part in child_queries_flatten]
all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten]
# if filters['caching']:
# erase duplicate queries
@ -247,7 +247,6 @@ class Tree(object):
# duplicate results again on correct places
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
# else:
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
@ -281,7 +280,7 @@ class Tree(object):
for answer_i, answer_length in enumerate(answers_lengths):
# iterate over answers of query
# TODO ERROR IN HERE!
partial_answers[answer_i], partial_answers_index[answer_i], partial_answers_deprel[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], all_new_partial_answers_deprel[i:i + answer_length], answer_length, filters)
partial_answers[answer_i], partial_answers_index[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
# while i < answers_length:
# self.create_grouped_answers()
# i += 1
@ -308,9 +307,9 @@ class Tree(object):
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1
return partial_answers, partial_answers_index, partial_answers_deprel, complete_answers
return partial_answers, partial_answers_index, complete_answers
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel,
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
# string_output = ''
# if create_output_string_form(self) == 'vožnji':
@ -324,7 +323,7 @@ class Tree(object):
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
merged_partial_subtrees.append(
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
i_answer += 1
else:
@ -341,7 +340,7 @@ class Tree(object):
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
merged_partial_subtrees.append(
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
i_answer += 1
else:
@ -351,6 +350,12 @@ class Tree(object):
return i_answer
def get_unigrams(self, create_output_strings, filters):
    """Collect the unigram key of this node and of every node below it.

    The key of each node is the second element returned by
    ``generate_key(..., print_lemma=False)``, so lemma values carry no
    ``'L='`` prefix here.

    Args:
        create_output_strings: Attribute extractors forwarded to
            ``generate_key`` and to the recursive calls.
        filters: Passed through unchanged to the children; not read here.

    Returns:
        List with this node's key first, followed by the keys gathered
        from each child in ``self.children`` order.
    """
    _, own_key = generate_key(self, create_output_strings, print_lemma=False)
    collected = [own_key]
    for subtree in self.children:
        collected.extend(subtree.get_unigrams(create_output_strings, filters))
    return collected
def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters):
"""
@ -382,7 +387,7 @@ class Tree(object):
if 'children' in temporary_query_tree:
all_query_indices.append((temporary_query_tree['children'], False))
partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
partial_subtrees, partial_subtrees_index, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
len(permanent_query_trees),
permanent_query_trees,
all_query_indices, self.children,
@ -397,7 +402,7 @@ class Tree(object):
# go over all permanent and temporary query trees
while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees):
# permanent query trees always have left and right child
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel,
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
create_output_string, merged_partial_answers, i_question, i_answer, filters)
i_question += 1
@ -466,21 +471,22 @@ class Tree(object):
merged_indices.append(new_indices)
return merged_results, merged_indices
def merge_results2(self, new_child, new_results, i_child, indices, deprel, filters):
def merge_results2(self, new_child, new_results, i_child, indices, filters):
l_res = []
r_res = []
results = []
for i_answer, answer in enumerate(new_child):
if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
if filters['dependency_type']:
separator = ' <' + deprel[i_child][i_answer] + ' '
# separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + answer[0].deprel + ' '
else:
separator = ' < '
l_res = self.merge_results(l_res, answer, separator, left=True)
# l_res += answer + separator
else:
if filters['dependency_type']:
separator = ' >' + deprel[i_child][i_answer] + ' '
separator = ' >' + answer[0].deprel + ' '
else:
separator = ' > '
r_res = self.merge_results(r_res, answer, separator, left=False)
@ -572,7 +578,7 @@ class Tree(object):
return results
def create_output_children(self, children, new_results, filters, indices, deprel):
def create_output_children(self, children, new_results, filters, indices):
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
# if create_output_string_form(self) == 'utišal':
@ -586,7 +592,7 @@ class Tree(object):
else:
new_child = sorted(child, key=lambda x: x[0].key)
#################
merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, deprel, filters))
merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, filters))
return merged_results
@staticmethod
@ -631,17 +637,11 @@ class Tree(object):
return merged_results
# @staticmethod
def create_answers(self, separated_answers, separated_answers_deprel, answer_length, filters):
# TODO
# node_order = False
def create_answers(self, separated_answers, answer_length, filters):
partly_built_trees = [[None] * answer_length]
# partly_built_trees_architecture = [[None] * answer_length]
partly_built_trees_architecture_indices = [[None] * answer_length]
partly_built_trees_deprel = [[None] * answer_length]
built_trees = []
# built_trees_architecture = []
built_trees_architecture_indices = []
built_trees_deprel = []
# if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!')
@ -650,64 +650,44 @@ class Tree(object):
# child are added
for child_i in range(len(separated_answers[0])):
new_partly_built_trees = []
# new_partly_built_trees_architecture = []
new_partly_built_trees_architecture_indices = []
new_partly_built_trees_deprel = []
# iterate over answers parts
for answer_part_i in range(len(separated_answers)):
# necessary because some parts do not pass filters and are not added
# if child_i < len(separated_answers[answer_part_i]) and separated_answers[answer_part_i][child_i]:
if separated_answers[answer_part_i][child_i]:
for tree_part_i, tree_part in enumerate(partly_built_trees):
# if tree_part[answer_part_i] equals None add new element in its place
if not tree_part[answer_part_i]:
new_tree_part = copy(tree_part)
# new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i])
new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i])
new_tree_part_deprel = copy(partly_built_trees_deprel[tree_part_i])
new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i]
# new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i]
new_tree_part_architecture_indices[answer_part_i] = child_i
new_tree_part_deprel[answer_part_i] = separated_answers_deprel[answer_part_i][child_i]
completed_tree_part = True
for val_i, val in enumerate(new_tree_part):
if not val:
completed_tree_part = False
if completed_tree_part:
built_trees.append(new_tree_part)
# built_trees_architecture.append(new_tree_part_architecture)
built_trees_architecture_indices.append(new_tree_part_architecture_indices)
built_trees_deprel.append(new_tree_part_deprel)
else:
new_partly_built_trees.append(new_tree_part)
# new_partly_built_trees_architecture.append(new_tree_part_architecture)
new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices)
new_partly_built_trees_deprel.append(new_tree_part_deprel)
else:
# pass over repetitions of same words
pass
# print('HERE!!!')
partly_built_trees.extend(new_partly_built_trees)
# partly_built_trees_architecture.extend(new_partly_built_trees_architecture)
partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices)
partly_built_trees_deprel.extend(new_partly_built_trees_deprel)
l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel, unique_trees_architecture = [], [], [], []
l_ordered_built_trees, l_ordered_built_trees_index, unique_trees_architecture = [], [], []
if built_trees:
# sort 3 arrays by architecture indices
# temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip(
# *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel))))
temp_trees_index, temp_trees, temp_trees_deprel = (list(t) for t in zip(
*sorted(zip(built_trees_architecture_indices, built_trees, built_trees_deprel))))
temp_trees_index, temp_trees = (list(t) for t in zip(
*sorted(zip(built_trees_architecture_indices, built_trees))))
# order outputs and erase duplicates
# for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices):
# for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel):
for tree, tree_index, tree_deprel in zip(temp_trees, temp_trees_index, temp_trees_deprel):
# new_tree_index, new_tree, new_tree_architecture, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel))))
new_tree_index, new_tree, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_deprel))))
for tree, tree_index in zip(temp_trees, temp_trees_index):
new_tree_index, new_tree = (list(t) for t in zip(*sorted(zip(tree_index, tree))))
# TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append!
is_unique = True
for unique_tree in unique_trees_architecture:
@ -728,7 +708,6 @@ class Tree(object):
# l_ordered_built_trees_architecture.append(new_tree_architecture)
l_ordered_built_trees.append(new_tree)
l_ordered_built_trees_index.append(new_tree_index)
l_ordered_built_trees_deprel.append(new_tree_deprel)
# TODO NODE ORDER = FALSE
# else:
#
@ -742,23 +721,4 @@ class Tree(object):
# print('aaa')
#
# pass
return l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel
def create_output_string_form(tree):
return tree.form.get_value()
def create_output_string_deprel(tree):
return tree.deprel.get_value()
def create_output_string_lemma(tree):
return tree.lemma.get_value()
def create_output_string_upos(tree):
return tree.upos.get_value()
def create_output_string_xpos(tree):
return tree.xpos.get_value()
def create_output_string_feats(tree):
return tree.feats.get_value()
return l_ordered_built_trees, l_ordered_built_trees_index

View File

@ -6,6 +6,7 @@ import hashlib
import os
import pickle
import re
import string
import time
import timeit
from multiprocessing import Pool
@ -32,6 +33,7 @@ from Tree import Tree, create_output_string_form, create_output_string_deprel, c
# feats_detailed_list = []
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
from generic import get_collocabilities
def decode_query(orig_query, dependency_type, feats_detailed_list):
@ -232,6 +234,11 @@ def tree_calculations(input_data):
_, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
return subtrees
def get_unigrams(input_data):
    """Return the unigram keys of one tree; single-argument form usable with ``map``.

    Args:
        input_data: 4-tuple ``(tree, query_tree, create_output_string_funct,
            filters)``. The query tree is unpacked but not used.

    Returns:
        Whatever ``tree.get_unigrams(create_output_string_funct, filters)``
        returns.
    """
    tree, _query_tree, output_string_functs, filters = input_data
    return tree.get_unigrams(output_string_functs, filters)
def tree_calculations_chunks(input_data):
trees, query_tree, create_output_string_funct, filters = input_data
@ -404,6 +411,7 @@ def main():
create_output_string_functs.append(create_output_string_funct)
result_dict = {}
unigrams_dict = {}
filters = {}
filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
# filters['caching'] = config.getboolean('settings', 'caching')
@ -430,6 +438,11 @@ def main():
filters['root_whitelist'] = []
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
filters['association_measures'] = config.getboolean('settings', 'association_measures')
filters['nodes_number'] = config.getboolean('settings', 'nodes_number')
filters['frequency_threshold'] = config.getfloat('settings', 'frequency_threshold')
filters['lines_threshold'] = config.getint('settings', 'lines_threshold')
filters['print_root'] = config.getboolean('settings', 'print_root')
# for tree in all_trees[2:]:
@ -448,9 +461,17 @@ def main():
# result_dict[r_k] += r_v
# else:
# result_dict[r_k] = r_v
# 1.02 s (16 cores)
if cpu_cores > 1:
# input_data = (tree, query_tree, create_output_string_functs, filters)
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
for unigrams in all_unigrams:
for unigram in unigrams:
if unigram in unigrams_dict:
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
# for subtrees in all_subtrees:
@ -477,10 +498,19 @@ def main():
# for tree_i, tree in enumerate(all_trees[-5:]):
# for tree_i, tree in enumerate(all_trees):
for tree_i, tree in enumerate(all_trees[1:]):
input_data = (tree, query_tree, create_output_string_functs, filters)
if filters['association_measures']:
unigrams = get_unigrams(input_data)
for unigram in unigrams:
if unigram in unigrams_dict:
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
# for tree_i, tree in enumerate(all_trees[1:]):
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
# for tree_i, tree in enumerate(all_trees[5170:]):
# for tree in all_trees:
subtrees = tree_calculations((tree, query_tree, create_output_string_functs, filters))
subtrees = tree_calculations(input_data)
for query_results in subtrees:
for r in query_results:
if filters['node_order']:
@ -525,33 +555,39 @@ def main():
len_words = tree_size_range[-1]
else:
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
header = ["Structure"] + ["Node #" + str(i) + "-" + node_type for i in range(1, len_words + 1) for node_type in node_types] + ['Absolute frequency']
header = ["Structure"] + ["Node " + string.ascii_uppercase[i] + "-" + node_type for i in range(len_words) for node_type in node_types] + ['Absolute frequency']
header += ['Relative frequency']
if filters['node_order']:
header += ['Order']
if config.getboolean('settings', 'nodes_number'):
if filters['nodes_number']:
header += ['Number of nodes']
if config.getboolean('settings', 'print_root'):
if filters['print_root']:
header += ['Root node']
if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
writer.writerow(header)
if config.getint('settings', 'lines_threshold'):
sorted_list = sorted_list[:config.getint('settings', 'lines_threshold')]
if filters['lines_threshold']:
sorted_list = sorted_list[:filters['lines_threshold']]
# body
for k, v in sorted_list:
absolute_frequency = v['number'] * 1000000.0 / corpus_size
if filters['frequency_threshold'] and filters['frequency_threshold'] > absolute_frequency:
break
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
# words_only = printable_answers(k)
row = [v['object'].key] + words_only + [str(v['number'])]
row += ['%.4f' % (v['number'] * 1000000.0 / corpus_size)]
row += ['%.4f' % absolute_frequency]
if filters['node_order']:
row += [v['object'].order]
if config.get('settings', 'nodes_number'):
if filters['nodes_number']:
row += ['%d' % len(v['object'].array)]
if config.get('settings', 'print_root'):
if filters['print_root']:
row += [v['object'].root]
if filters['association_measures']:
row += get_collocabilities(v, unigrams_dict, corpus_size)
writer.writerow(row)
return "Done"

71
generic.py Normal file
View File

@ -0,0 +1,71 @@
import math
import sys
def create_output_string_form(tree):
    """Return the value held by ``tree.form`` (via its ``get_value()``)."""
    form_attribute = tree.form
    return form_attribute.get_value()
def create_output_string_deprel(tree):
    """Return the value held by ``tree.deprel`` (via its ``get_value()``)."""
    deprel_attribute = tree.deprel
    return deprel_attribute.get_value()
def create_output_string_lemma(tree):
    """Return the value held by ``tree.lemma`` (via its ``get_value()``)."""
    lemma_attribute = tree.lemma
    return lemma_attribute.get_value()
def create_output_string_upos(tree):
    """Return the value held by ``tree.upos`` (via its ``get_value()``)."""
    upos_attribute = tree.upos
    return upos_attribute.get_value()
def create_output_string_xpos(tree):
    """Return the value held by ``tree.xpos`` (via its ``get_value()``)."""
    xpos_attribute = tree.xpos
    return xpos_attribute.get_value()
def create_output_string_feats(tree):
    """Return the value held by ``tree.feats`` (via its ``get_value()``)."""
    feats_attribute = tree.feats
    return feats_attribute.get_value()
def generate_key(node, create_output_strings, print_lemma=True):
    """Build the attribute row and the lookup key for a single node.

    Args:
        node: Object handed to every callable in ``create_output_strings``.
        create_output_strings: Sequence of one-argument callables, each
            returning one attribute value of ``node``.
        print_lemma: When true, the value produced by
            ``create_output_string_lemma`` is prefixed with ``'L='`` inside
            the key. The returned row is never prefixed.

    Returns:
        Tuple ``(array, key)``. ``array`` is a one-element list holding the
        list of extracted values. ``key`` is those values joined with
        ``'&'`` when there is more than one, otherwise the single value.

    Raises:
        IndexError: If ``create_output_strings`` is empty.
    """
    # Run every extractor exactly once; the key is derived from these values
    # instead of calling the extractors a second time.
    values = [create_output_string(node) for create_output_string in create_output_strings]
    array = [values]

    if print_lemma and create_output_string_lemma in create_output_strings:
        key_parts = [
            'L=' + value if extractor == create_output_string_lemma else value
            for extractor, value in zip(create_output_strings, values)
        ]
    else:
        key_parts = values

    if len(key_parts) > 1:
        key = '&'.join(key_parts)
    else:
        # A lone attribute is used verbatim as the key.
        key = key_parts[0]
    return array, key
def get_collocabilities(ngram, unigrams_dict, corpus_size):
    """Compute association measures for one n-gram.

    Args:
        ngram: Mapping with ``'object'`` (an object whose ``array`` attribute
            is a list of per-node attribute lists) and ``'number'`` (the
            observed frequency of the n-gram).
        unigrams_dict: Mapping from unigram key to its frequency. Keys are
            the node attributes joined with ``'&'`` (or the single attribute
            itself), without any ``'L='`` prefix.
        corpus_size: Total number of words, used as ``N``.

    Returns:
        List of five strings formatted with ``'%.4f'``, in the order
        MI, MI3, Dice, t-score, simple-LL.

    Raises:
        KeyError: If a node's key is missing from ``unigrams_dict``.
    """
    node_rows = ngram['object'].array

    sum_fwi = 0.0
    mul_fwi = 1.0
    for key_array in node_rows:
        # Rebuild the unigram key the same way the unigram counts were keyed.
        key = '&'.join(key_array) if len(key_array) > 1 else key_array[0]
        frequency = unigrams_dict[key]
        sum_fwi += frequency
        # mul_fwi is a float, so the product of counts cannot wrap around to
        # a negative number; no integer-overflow guard is needed here.
        mul_fwi *= frequency

    # number of all words
    N = corpus_size
    # n of ngram
    n = len(node_rows)
    # observed frequency
    O = ngram['number']
    # expected frequency
    E = mul_fwi / pow(N, n - 1)

    mi = math.log(O / E, 2)
    mi3 = math.log(pow(O, 3) / E, 2)
    dice = n * O / sum_fwi
    tscore = (O - E) / math.sqrt(O)
    # NOTE(review): simple-LL is commonly defined with the natural logarithm;
    # log10 is kept as-is so published numbers do not change — confirm intent.
    simplell = 2 * (O * math.log10(O / E) - (O - E))
    return ['%.4f' % mi, '%.4f' % mi3, '%.4f' % dice, '%.4f' % tscore, '%.4f' % simplell]