Added all but 2 key output

This commit is contained in:
Luka 2019-12-14 09:36:29 +01:00
parent 7c5aba1ca9
commit eeab026313
4 changed files with 163 additions and 86 deletions

View File

@ -1,18 +1,28 @@
import copy import copy
import string import string
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class Result(object): class Result(object):
def __init__(self, node, architecture_order, create_output_strings): def __init__(self, node, architecture_order, create_output_strings):
self.array = [[create_output_string(node) for create_output_string in create_output_strings]] # self.array = [[create_output_string(node) for create_output_string in create_output_strings]]
if len(self.array[0]) > 1: # if create_output_string_lemma in create_output_strings:
self.key = '{' + ','.join(self.array[0]) + '}' # key_array = [[create_output_string(node) if create_output_string != create_output_string_lemma else 'L=' + create_output_string(node) for create_output_string in create_output_strings]]
else: # else:
# output_string = create_output_strings[0](node) # key_array = self.array
self.key = self.array[0][0] # if len(self.array[0]) > 1:
# self.key = '&'.join(key_array[0])
# else:
# # output_string = create_output_strings[0](node)
# self.key = key_array[0][0]
self.array, self.key = generate_key(node, create_output_strings)
# self.array = [[output_string]] # self.array = [[output_string]]
self.order_key = str([architecture_order]) self.order_key = str([architecture_order])
self.order = [architecture_order] self.order = [architecture_order]
self.deprel = node.deprel.get_value()
# order with original numbers in sentences # order with original numbers in sentences
# self.order = str([architecture_order]) # self.order = str([architecture_order])
# order with numbers from 0 to n of n-gram # order with numbers from 0 to n of n-gram
@ -123,7 +133,7 @@ class Result(object):
def set_root(self): def set_root(self):
if len(self.array[0]) > 1: if len(self.array[0]) > 1:
self.root = '{' + ','.join(self.array[0]) + '}' self.root = '&'.join(self.array[0])
else: else:
# output_string = create_output_strings[0](node) # output_string = create_output_strings[0](node)
self.root = self.array[0][0] self.root = self.array[0][0]

96
Tree.py
View File

@ -5,6 +5,8 @@ from pyconll.unit import Token
from Result import Result from Result import Result
from Value import Value from Value import Value
from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \
create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key
class Tree(object): class Tree(object):
@ -206,7 +208,6 @@ class Tree(object):
# create_output_string) # create_output_string)
partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)] partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)] partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)]
partial_answers_deprel = [[] for i in range(permanent_query_nb + temporary_query_nb)]
complete_answers = [[] for i in range(permanent_query_nb)] complete_answers = [[] for i in range(permanent_query_nb)]
# list of pairs (index of query in group, group of query) # list of pairs (index of query in group, group of query)
@ -220,7 +221,6 @@ class Tree(object):
child_queries_flatten = [query_part for query in child_queries for query_part in query] child_queries_flatten = [query_part for query in child_queries for query_part in query]
all_new_partial_answers = [[] for query_part in child_queries_flatten] all_new_partial_answers = [[] for query_part in child_queries_flatten]
all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten]
# if filters['caching']: # if filters['caching']:
# erase duplicate queries # erase duplicate queries
@ -247,7 +247,6 @@ class Tree(object):
# duplicate results again on correct places # duplicate results again on correct places
for i, flattened_index in enumerate(child_queries_flatten_dedup_indices): for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index]) all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
# else: # else:
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees( # new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
@ -281,7 +280,7 @@ class Tree(object):
for answer_i, answer_length in enumerate(answers_lengths): for answer_i, answer_length in enumerate(answers_lengths):
# iterate over answers of query # iterate over answers of query
# TODO ERROR IN HERE! # TODO ERROR IN HERE!
partial_answers[answer_i], partial_answers_index[answer_i], partial_answers_deprel[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], all_new_partial_answers_deprel[i:i + answer_length], answer_length, filters) partial_answers[answer_i], partial_answers_index[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters)
# while i < answers_length: # while i < answers_length:
# self.create_grouped_answers() # self.create_grouped_answers()
# i += 1 # i += 1
@ -308,9 +307,9 @@ class Tree(object):
# child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict) # child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
# child_index += 1 # child_index += 1
return partial_answers, partial_answers_index, partial_answers_deprel, complete_answers return partial_answers, partial_answers_index, complete_answers
def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
create_output_string, merged_partial_subtrees, i_query, i_answer, filters): create_output_string, merged_partial_subtrees, i_query, i_answer, filters):
# string_output = '' # string_output = ''
# if create_output_string_form(self) == 'vožnji': # if create_output_string_form(self) == 'vožnji':
@ -324,7 +323,7 @@ class Tree(object):
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
merged_partial_subtrees.append( merged_partial_subtrees.append(
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
i_answer += 1 i_answer += 1
else: else:
@ -341,7 +340,7 @@ class Tree(object):
# self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer]))
merged_partial_subtrees.append( merged_partial_subtrees.append(
self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer]))
i_answer += 1 i_answer += 1
else: else:
@ -351,6 +350,12 @@ class Tree(object):
return i_answer return i_answer
def get_unigrams(self, create_output_strings, filters):
    """Collect the unigram key of this node and of every node below it.

    The key of each node is the second element returned by
    ``generate_key(..., print_lemma=False)``. The node's own key comes
    first, followed by the keys gathered recursively from each entry of
    ``self.children`` in order.

    :param create_output_strings: output-string functions forwarded to
        ``generate_key`` and to the recursive calls.
    :param filters: forwarded unchanged to the recursive calls; not read here.
    :return: flat list of keys for the whole subtree rooted at this node.
    """
    collected = [generate_key(self, create_output_strings, print_lemma=False)[1]]
    for subtree in self.children:
        # Same in-place list growth as ``+=``: append every key of the child.
        collected.extend(subtree.get_unigrams(create_output_strings, filters))
    return collected
def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters): def get_subtrees(self, permanent_query_trees, temporary_query_trees, create_output_string, filters):
""" """
@ -382,7 +387,7 @@ class Tree(object):
if 'children' in temporary_query_tree: if 'children' in temporary_query_tree:
all_query_indices.append((temporary_query_tree['children'], False)) all_query_indices.append((temporary_query_tree['children'], False))
partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, complete_answers = self.get_all_query_indices(len(temporary_query_trees), partial_subtrees, partial_subtrees_index, complete_answers = self.get_all_query_indices(len(temporary_query_trees),
len(permanent_query_trees), len(permanent_query_trees),
permanent_query_trees, permanent_query_trees,
all_query_indices, self.children, all_query_indices, self.children,
@ -397,7 +402,7 @@ class Tree(object):
# go over all permanent and temporary query trees # go over all permanent and temporary query trees
while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees): while i_question < len(active_permanent_query_trees) + len(active_temporary_query_trees):
# permanent query trees always have left and right child # permanent query trees always have left and right child
i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, i_answer = self.order_dependent_queries(active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index,
create_output_string, merged_partial_answers, i_question, i_answer, filters) create_output_string, merged_partial_answers, i_question, i_answer, filters)
i_question += 1 i_question += 1
@ -466,21 +471,22 @@ class Tree(object):
merged_indices.append(new_indices) merged_indices.append(new_indices)
return merged_results, merged_indices return merged_results, merged_indices
def merge_results2(self, new_child, new_results, i_child, indices, deprel, filters): def merge_results2(self, new_child, new_results, i_child, indices, filters):
l_res = [] l_res = []
r_res = [] r_res = []
results = [] results = []
for i_answer, answer in enumerate(new_child): for i_answer, answer in enumerate(new_child):
if filters['node_order'] and indices[i_child][i_answer] < self.children_split: if filters['node_order'] and indices[i_child][i_answer] < self.children_split:
if filters['dependency_type']: if filters['dependency_type']:
separator = ' <' + deprel[i_child][i_answer] + ' ' # separator = ' <' + deprel[i_child][i_answer] + ' '
separator = ' <' + answer[0].deprel + ' '
else: else:
separator = ' < ' separator = ' < '
l_res = self.merge_results(l_res, answer, separator, left=True) l_res = self.merge_results(l_res, answer, separator, left=True)
# l_res += answer + separator # l_res += answer + separator
else: else:
if filters['dependency_type']: if filters['dependency_type']:
separator = ' >' + deprel[i_child][i_answer] + ' ' separator = ' >' + answer[0].deprel + ' '
else: else:
separator = ' > ' separator = ' > '
r_res = self.merge_results(r_res, answer, separator, left=False) r_res = self.merge_results(r_res, answer, separator, left=False)
@ -572,7 +578,7 @@ class Tree(object):
return results return results
def create_output_children(self, children, new_results, filters, indices, deprel): def create_output_children(self, children, new_results, filters, indices):
# if create_output_string_form(self) == 'Dogodek': # if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!') # print('HERE!@@!')
# if create_output_string_form(self) == 'utišal': # if create_output_string_form(self) == 'utišal':
@ -586,7 +592,7 @@ class Tree(object):
else: else:
new_child = sorted(child, key=lambda x: x[0].key) new_child = sorted(child, key=lambda x: x[0].key)
################# #################
merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, deprel, filters)) merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, filters))
return merged_results return merged_results
@staticmethod @staticmethod
@ -631,17 +637,11 @@ class Tree(object):
return merged_results return merged_results
# @staticmethod # @staticmethod
def create_answers(self, separated_answers, separated_answers_deprel, answer_length, filters): def create_answers(self, separated_answers, answer_length, filters):
# TODO
# node_order = False
partly_built_trees = [[None] * answer_length] partly_built_trees = [[None] * answer_length]
# partly_built_trees_architecture = [[None] * answer_length]
partly_built_trees_architecture_indices = [[None] * answer_length] partly_built_trees_architecture_indices = [[None] * answer_length]
partly_built_trees_deprel = [[None] * answer_length]
built_trees = [] built_trees = []
# built_trees_architecture = []
built_trees_architecture_indices = [] built_trees_architecture_indices = []
built_trees_deprel = []
# if create_output_string_form(self) == 'Dogodek': # if create_output_string_form(self) == 'Dogodek':
# print('HERE!@@!') # print('HERE!@@!')
@ -650,64 +650,44 @@ class Tree(object):
# child are added # child are added
for child_i in range(len(separated_answers[0])): for child_i in range(len(separated_answers[0])):
new_partly_built_trees = [] new_partly_built_trees = []
# new_partly_built_trees_architecture = []
new_partly_built_trees_architecture_indices = [] new_partly_built_trees_architecture_indices = []
new_partly_built_trees_deprel = []
# iterate over answers parts # iterate over answers parts
for answer_part_i in range(len(separated_answers)): for answer_part_i in range(len(separated_answers)):
# necessary because some parts do not pass filters and are not added # necessary because some parts do not pass filters and are not added
# if child_i < len(separated_answers[answer_part_i]) and separated_answers[answer_part_i][child_i]:
if separated_answers[answer_part_i][child_i]: if separated_answers[answer_part_i][child_i]:
for tree_part_i, tree_part in enumerate(partly_built_trees): for tree_part_i, tree_part in enumerate(partly_built_trees):
# if tree_part[answer_part_i] equals None add new element in its place
if not tree_part[answer_part_i]: if not tree_part[answer_part_i]:
new_tree_part = copy(tree_part) new_tree_part = copy(tree_part)
# new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i])
new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i]) new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i])
new_tree_part_deprel = copy(partly_built_trees_deprel[tree_part_i])
new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i] new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i]
# new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i]
new_tree_part_architecture_indices[answer_part_i] = child_i new_tree_part_architecture_indices[answer_part_i] = child_i
new_tree_part_deprel[answer_part_i] = separated_answers_deprel[answer_part_i][child_i]
completed_tree_part = True completed_tree_part = True
for val_i, val in enumerate(new_tree_part): for val_i, val in enumerate(new_tree_part):
if not val: if not val:
completed_tree_part = False completed_tree_part = False
if completed_tree_part: if completed_tree_part:
built_trees.append(new_tree_part) built_trees.append(new_tree_part)
# built_trees_architecture.append(new_tree_part_architecture)
built_trees_architecture_indices.append(new_tree_part_architecture_indices) built_trees_architecture_indices.append(new_tree_part_architecture_indices)
built_trees_deprel.append(new_tree_part_deprel)
else: else:
new_partly_built_trees.append(new_tree_part) new_partly_built_trees.append(new_tree_part)
# new_partly_built_trees_architecture.append(new_tree_part_architecture)
new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices) new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices)
new_partly_built_trees_deprel.append(new_tree_part_deprel)
else: else:
# pass over repetitions of same words # pass over repetitions of same words
pass pass
# print('HERE!!!')
partly_built_trees.extend(new_partly_built_trees) partly_built_trees.extend(new_partly_built_trees)
# partly_built_trees_architecture.extend(new_partly_built_trees_architecture)
partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices) partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices)
partly_built_trees_deprel.extend(new_partly_built_trees_deprel)
l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel, unique_trees_architecture = [], [], [], [] l_ordered_built_trees, l_ordered_built_trees_index, unique_trees_architecture = [], [], []
if built_trees: if built_trees:
# sort 3 arrays by architecture indices # sort 3 arrays by architecture indices
# temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip( temp_trees_index, temp_trees = (list(t) for t in zip(
# *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel)))) *sorted(zip(built_trees_architecture_indices, built_trees))))
temp_trees_index, temp_trees, temp_trees_deprel = (list(t) for t in zip(
*sorted(zip(built_trees_architecture_indices, built_trees, built_trees_deprel))))
# order outputs and erase duplicates # order outputs and erase duplicates
# for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices): for tree, tree_index in zip(temp_trees, temp_trees_index):
# for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel): new_tree_index, new_tree = (list(t) for t in zip(*sorted(zip(tree_index, tree))))
for tree, tree_index, tree_deprel in zip(temp_trees, temp_trees_index, temp_trees_deprel):
# new_tree_index, new_tree, new_tree_architecture, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel))))
new_tree_index, new_tree, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_deprel))))
# TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append! # TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append!
is_unique = True is_unique = True
for unique_tree in unique_trees_architecture: for unique_tree in unique_trees_architecture:
@ -728,7 +708,6 @@ class Tree(object):
# l_ordered_built_trees_architecture.append(new_tree_architecture) # l_ordered_built_trees_architecture.append(new_tree_architecture)
l_ordered_built_trees.append(new_tree) l_ordered_built_trees.append(new_tree)
l_ordered_built_trees_index.append(new_tree_index) l_ordered_built_trees_index.append(new_tree_index)
l_ordered_built_trees_deprel.append(new_tree_deprel)
# TODO NODE ORDER = FALSE # TODO NODE ORDER = FALSE
# else: # else:
# #
@ -742,23 +721,4 @@ class Tree(object):
# print('aaa') # print('aaa')
# #
# pass # pass
return l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel return l_ordered_built_trees, l_ordered_built_trees_index
def create_output_string_form(tree):
return tree.form.get_value()
def create_output_string_deprel(tree):
return tree.deprel.get_value()
def create_output_string_lemma(tree):
return tree.lemma.get_value()
def create_output_string_upos(tree):
return tree.upos.get_value()
def create_output_string_xpos(tree):
return tree.xpos.get_value()
def create_output_string_feats(tree):
return tree.feats.get_value()

View File

@ -6,6 +6,7 @@ import hashlib
import os import os
import pickle import pickle
import re import re
import string
import time import time
import timeit import timeit
from multiprocessing import Pool from multiprocessing import Pool
@ -32,6 +33,7 @@ from Tree import Tree, create_output_string_form, create_output_string_deprel, c
# feats_detailed_list = [] # feats_detailed_list = []
# feats_detailed_dict = {key: {} for key in feats_detailed_list} # feats_detailed_dict = {key: {} for key in feats_detailed_list}
from generic import get_collocabilities
def decode_query(orig_query, dependency_type, feats_detailed_list): def decode_query(orig_query, dependency_type, feats_detailed_list):
@ -232,6 +234,11 @@ def tree_calculations(input_data):
_, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters) _, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
return subtrees return subtrees
def get_unigrams(input_data):
    """Unpack one work item and return the unigrams of its tree.

    ``input_data`` must hold exactly four items: the tree, the query tree,
    the output-string functions and the filters. The query tree is not used;
    the remaining values are handed to ``tree.get_unigrams``.
    """
    tree, _query_tree, output_string_functs, filters = input_data
    return tree.get_unigrams(output_string_functs, filters)
def tree_calculations_chunks(input_data): def tree_calculations_chunks(input_data):
trees, query_tree, create_output_string_funct, filters = input_data trees, query_tree, create_output_string_funct, filters = input_data
@ -404,6 +411,7 @@ def main():
create_output_string_functs.append(create_output_string_funct) create_output_string_functs.append(create_output_string_funct)
result_dict = {} result_dict = {}
unigrams_dict = {}
filters = {} filters = {}
filters['node_order'] = config.get('settings', 'node_order') == 'fixed' filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
# filters['caching'] = config.getboolean('settings', 'caching') # filters['caching'] = config.getboolean('settings', 'caching')
@ -430,6 +438,11 @@ def main():
filters['root_whitelist'] = [] filters['root_whitelist'] = []
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete' filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
filters['association_measures'] = config.getboolean('settings', 'association_measures')
filters['nodes_number'] = config.getboolean('settings', 'nodes_number')
filters['frequency_threshold'] = config.getfloat('settings', 'frequency_threshold')
filters['lines_threshold'] = config.getint('settings', 'lines_threshold')
filters['print_root'] = config.getboolean('settings', 'print_root')
# for tree in all_trees[2:]: # for tree in all_trees[2:]:
@ -448,9 +461,17 @@ def main():
# result_dict[r_k] += r_v # result_dict[r_k] += r_v
# else: # else:
# result_dict[r_k] = r_v # result_dict[r_k] = r_v
# 1.02 s (16 cores) # 1.02 s (16 cores)
if cpu_cores > 1: if cpu_cores > 1:
# input_data = (tree, query_tree, create_output_string_functs, filters)
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
for unigrams in all_unigrams:
for unigram in unigrams:
if unigram in unigrams_dict:
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees]) all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
# for subtrees in all_subtrees: # for subtrees in all_subtrees:
@ -477,10 +498,19 @@ def main():
# for tree_i, tree in enumerate(all_trees[-5:]): # for tree_i, tree in enumerate(all_trees[-5:]):
# for tree_i, tree in enumerate(all_trees): # for tree_i, tree in enumerate(all_trees):
for tree_i, tree in enumerate(all_trees[1:]): for tree_i, tree in enumerate(all_trees[1:]):
input_data = (tree, query_tree, create_output_string_functs, filters)
if filters['association_measures']:
unigrams = get_unigrams(input_data)
for unigram in unigrams:
if unigram in unigrams_dict:
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
# for tree_i, tree in enumerate(all_trees[1:]):
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje. # text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
# for tree_i, tree in enumerate(all_trees[5170:]): # for tree_i, tree in enumerate(all_trees[5170:]):
# for tree in all_trees: # for tree in all_trees:
subtrees = tree_calculations((tree, query_tree, create_output_string_functs, filters)) subtrees = tree_calculations(input_data)
for query_results in subtrees: for query_results in subtrees:
for r in query_results: for r in query_results:
if filters['node_order']: if filters['node_order']:
@ -525,33 +555,39 @@ def main():
len_words = tree_size_range[-1] len_words = tree_size_range[-1]
else: else:
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1) len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
header = ["Structure"] + ["Node #" + str(i) + "-" + node_type for i in range(1, len_words + 1) for node_type in node_types] + ['Absolute frequency'] header = ["Structure"] + ["Node " + string.ascii_uppercase[i] + "-" + node_type for i in range(len_words) for node_type in node_types] + ['Absolute frequency']
header += ['Relative frequency'] header += ['Relative frequency']
if filters['node_order']: if filters['node_order']:
header += ['Order'] header += ['Order']
if config.getboolean('settings', 'nodes_number'): if filters['nodes_number']:
header += ['Number of nodes'] header += ['Number of nodes']
if config.getboolean('settings', 'print_root'): if filters['print_root']:
header += ['Root node'] header += ['Root node']
if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency'] # header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
writer.writerow(header) writer.writerow(header)
if config.getint('settings', 'lines_threshold'): if filters['lines_threshold']:
sorted_list = sorted_list[:config.getint('settings', 'lines_threshold')] sorted_list = sorted_list[:filters['lines_threshold']]
# body # body
for k, v in sorted_list: for k, v in sorted_list:
absolute_frequency = v['number'] * 1000000.0 / corpus_size
if filters['frequency_threshold'] and filters['frequency_threshold'] > absolute_frequency:
break
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))] words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
# words_only = printable_answers(k) # words_only = printable_answers(k)
row = [v['object'].key] + words_only + [str(v['number'])] row = [v['object'].key] + words_only + [str(v['number'])]
row += ['%.4f' % (v['number'] * 1000000.0 / corpus_size)] row += ['%.4f' % absolute_frequency]
if filters['node_order']: if filters['node_order']:
row += [v['object'].order] row += [v['object'].order]
if config.get('settings', 'nodes_number'): if filters['nodes_number']:
row += ['%d' % len(v['object'].array)] row += ['%d' % len(v['object'].array)]
if config.get('settings', 'print_root'): if filters['print_root']:
row += [v['object'].root] row += [v['object'].root]
if filters['association_measures']:
row += get_collocabilities(v, unigrams_dict, corpus_size)
writer.writerow(row) writer.writerow(row)
return "Done" return "Done"

71
generic.py Normal file
View File

@ -0,0 +1,71 @@
import math
import sys
def create_output_string_form(tree):
    """Return whatever ``tree.form.get_value()`` yields for the given node."""
    form_attribute = tree.form
    return form_attribute.get_value()
def create_output_string_deprel(tree):
    """Return whatever ``tree.deprel.get_value()`` yields for the given node."""
    deprel_attribute = tree.deprel
    return deprel_attribute.get_value()
def create_output_string_lemma(tree):
    """Return whatever ``tree.lemma.get_value()`` yields for the given node."""
    lemma_attribute = tree.lemma
    return lemma_attribute.get_value()
def create_output_string_upos(tree):
    """Return whatever ``tree.upos.get_value()`` yields for the given node."""
    upos_attribute = tree.upos
    return upos_attribute.get_value()
def create_output_string_xpos(tree):
    """Return whatever ``tree.xpos.get_value()`` yields for the given node."""
    xpos_attribute = tree.xpos
    return xpos_attribute.get_value()
def create_output_string_feats(tree):
    """Return whatever ``tree.feats.get_value()`` yields for the given node."""
    feats_attribute = tree.feats
    return feats_attribute.get_value()
def generate_key(node, create_output_strings, print_lemma=True):
    """Build the output-string array and the lookup key for one node.

    Each function in ``create_output_strings`` is called exactly once with
    ``node``. The results form the single row of the returned array. The key
    is built from the same values: a lone value is used as is, several values
    are joined with ``'&'``.

    When ``print_lemma`` is true and ``create_output_string_lemma`` is among
    the functions, the value it produced is prefixed with ``'L='`` in the key
    only; the array keeps the unprefixed value.

    :param node: object handed to every output-string function.
    :param create_output_strings: sequence of callables taking ``node``.
    :param print_lemma: whether the lemma value gets the ``'L='`` prefix.
    :return: ``(array, key)`` where ``array`` is ``[[value, ...]]``. An empty
        ``create_output_strings`` yields ``([[]], '')``.
    """
    values = [create_output_string(node) for create_output_string in create_output_strings]

    # Cheap flag first; the membership test only matters when prefixing is on.
    if print_lemma and create_output_string_lemma in create_output_strings:
        # Reuse the values computed above instead of calling every function
        # a second time just to add the prefix.
        key_parts = [
            'L=' + value if create_output_string == create_output_string_lemma else value
            for create_output_string, value in zip(create_output_strings, values)
        ]
    else:
        key_parts = values

    if len(key_parts) == 1:
        # A single value is returned untouched (no join).
        key = key_parts[0]
    else:
        # Covers both several values and the empty case ('' for no values).
        key = '&'.join(key_parts)
    return [values], key
def get_collocabilities(ngram, unigrams_dict, corpus_size):
    """Compute association measures for one n-gram.

    :param ngram: mapping with ``'object'`` (exposes ``array``, one list of
        output strings per node) and ``'number'`` (observed frequency ``O``).
    :param unigrams_dict: mapping from unigram key to frequency. The key of a
        node is its output strings joined with ``'&'``, or the single string
        when the node has only one.
    :param corpus_size: number of all words ``N``.
    :return: five strings formatted with ``'%.4f'``, in the order
        MI, MI3, Dice, t-score, simple-LL.
    :raises KeyError: if a node's key is missing from ``unigrams_dict``.
    :raises ZeroDivisionError: if the expected frequency or the frequency sum
        is zero.
    :raises ValueError: if a logarithm argument is not positive.
    """
    sum_fwi = 0.0
    # Integer start value: with integer counts the product stays an exact
    # Python int of arbitrary size. A float product cannot wrap negative (the
    # old ``mul_fwi < 0`` check never fired); it saturates to ``inf`` instead.
    mul_fwi = 1
    for key_array in ngram['object'].array:
        # create key for unigrams
        key = '&'.join(key_array) if len(key_array) > 1 else key_array[0]
        frequency = unigrams_dict[key]
        sum_fwi += frequency
        mul_fwi *= frequency

    # number of all words
    N = corpus_size

    # n of ngram
    n = len(ngram['object'].array)
    # observed frequency of the n-gram
    O = ngram['number']
    # expected frequency: product of unigram frequencies over N^(n-1)
    E = mul_fwi / pow(N, n - 1)

    # ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
    mi = math.log(O / E, 2)
    mi3 = math.log(pow(O, 3) / E, 2)
    dice = n * O / sum_fwi
    tscore = (O - E) / math.sqrt(O)
    # NOTE(review): base-10 logarithm kept as in the original; simple-LL is
    # usually stated with the natural logarithm - confirm which is intended.
    simplell = 2 * (O * math.log10(O / E) - (O - E))
    return ['%.4f' % mi, '%.4f' % mi3, '%.4f' % dice, '%.4f' % tscore, '%.4f' % simplell]