diff --git a/Result.py b/Result.py index 2a8d4b1..19dffbe 100644 --- a/Result.py +++ b/Result.py @@ -1,18 +1,28 @@ import copy import string +from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \ + create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key + class Result(object): def __init__(self, node, architecture_order, create_output_strings): - self.array = [[create_output_string(node) for create_output_string in create_output_strings]] - if len(self.array[0]) > 1: - self.key = '{' + ','.join(self.array[0]) + '}' - else: - # output_string = create_output_strings[0](node) - self.key = self.array[0][0] + # self.array = [[create_output_string(node) for create_output_string in create_output_strings]] + # if create_output_string_lemma in create_output_strings: + # key_array = [[create_output_string(node) if create_output_string != create_output_string_lemma else 'L=' + create_output_string(node) for create_output_string in create_output_strings]] + # else: + # key_array = self.array + # if len(self.array[0]) > 1: + # self.key = '&'.join(key_array[0]) + # else: + # # output_string = create_output_strings[0](node) + # self.key = key_array[0][0] + + self.array, self.key = generate_key(node, create_output_strings) # self.array = [[output_string]] self.order_key = str([architecture_order]) self.order = [architecture_order] + self.deprel = node.deprel.get_value() # order with original numbers in sentences # self.order = str([architecture_order]) # order with numbers from 0 to n of n-gram @@ -123,7 +133,7 @@ class Result(object): def set_root(self): if len(self.array[0]) > 1: - self.root = '{' + ','.join(self.array[0]) + '}' + self.root = '&'.join(self.array[0]) else: # output_string = create_output_strings[0](node) self.root = self.array[0][0] \ No newline at end of file diff --git a/Tree.py b/Tree.py index 6f08682..b23292e 100644 --- a/Tree.py +++ b/Tree.py @@ -5,6 +5,8 @@ from 
pyconll.unit import Token from Result import Result from Value import Value +from generic import create_output_string_form, create_output_string_deprel, create_output_string_lemma, \ + create_output_string_upos, create_output_string_xpos, create_output_string_feats, generate_key class Tree(object): @@ -206,7 +208,6 @@ class Tree(object): # create_output_string) partial_answers = [[] for i in range(permanent_query_nb + temporary_query_nb)] partial_answers_index = [[] for i in range(permanent_query_nb + temporary_query_nb)] - partial_answers_deprel = [[] for i in range(permanent_query_nb + temporary_query_nb)] complete_answers = [[] for i in range(permanent_query_nb)] # list of pairs (index of query in group, group of query) @@ -220,7 +221,6 @@ class Tree(object): child_queries_flatten = [query_part for query in child_queries for query_part in query] all_new_partial_answers = [[] for query_part in child_queries_flatten] - all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten] # if filters['caching']: # erase duplicate queries @@ -247,7 +247,6 @@ class Tree(object): # duplicate results again on correct places for i, flattened_index in enumerate(child_queries_flatten_dedup_indices): all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index]) - all_new_partial_answers_deprel[i].append(create_output_string_deprel(child)) # else: # new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees( @@ -281,7 +280,7 @@ class Tree(object): for answer_i, answer_length in enumerate(answers_lengths): # iterate over answers of query # TODO ERROR IN HERE! 
- partial_answers[answer_i], partial_answers_index[answer_i], partial_answers_deprel[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], all_new_partial_answers_deprel[i:i + answer_length], answer_length, filters) + partial_answers[answer_i], partial_answers_index[answer_i] = self.create_answers(all_new_partial_answers[i:i + answer_length], answer_length, filters) # while i < answers_length: # self.create_grouped_answers() # i += 1 @@ -308,9 +307,9 @@ class Tree(object): # child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict) # child_index += 1 - return partial_answers, partial_answers_index, partial_answers_deprel, complete_answers + return partial_answers, partial_answers_index, complete_answers - def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, partial_subtrees_deprel, + def order_dependent_queries(self, active_permanent_query_trees, active_temporary_query_trees, partial_subtrees, partial_subtrees_index, create_output_string, merged_partial_subtrees, i_query, i_answer, filters): # string_output = '' # if create_output_string_form(self) == 'vožnji': @@ -324,7 +323,7 @@ class Tree(object): # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) merged_partial_subtrees.append( - self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer], partial_subtrees_deprel[i_answer])) + self.create_output_children(partial_subtrees[i_answer], [Result(self, self.index, create_output_string)], filters, partial_subtrees_index[i_answer])) i_answer += 1 else: @@ -341,7 +340,7 @@ class Tree(object): # self.create_output_children(partial_subtrees_architecture[i_answer], [str([self.index])], filters, 
def get_unigrams(self, create_output_strings, filters):
    """Collect the unigram key of this node and of every node below it.

    The key of each node is the second element returned by
    ``generate_key`` with ``print_lemma=False``.  This node's key comes
    first, followed by the keys gathered recursively from each entry of
    ``self.children`` in order.

    :param create_output_strings: functions handed on to ``generate_key``
    :param filters: passed through unchanged to the recursive calls
    :return: flat list of unigram keys for the whole subtree
    """
    _, own_key = generate_key(self, create_output_strings, print_lemma=False)
    collected = [own_key]
    for descendant in self.children:
        collected.extend(descendant.get_unigrams(create_output_strings, filters))
    return collected
i_question += 1 @@ -466,21 +471,22 @@ class Tree(object): merged_indices.append(new_indices) return merged_results, merged_indices - def merge_results2(self, new_child, new_results, i_child, indices, deprel, filters): + def merge_results2(self, new_child, new_results, i_child, indices, filters): l_res = [] r_res = [] results = [] for i_answer, answer in enumerate(new_child): if filters['node_order'] and indices[i_child][i_answer] < self.children_split: if filters['dependency_type']: - separator = ' <' + deprel[i_child][i_answer] + ' ' + # separator = ' <' + deprel[i_child][i_answer] + ' ' + separator = ' <' + answer[0].deprel + ' ' else: separator = ' < ' l_res = self.merge_results(l_res, answer, separator, left=True) # l_res += answer + separator else: if filters['dependency_type']: - separator = ' >' + deprel[i_child][i_answer] + ' ' + separator = ' >' + answer[0].deprel + ' ' else: separator = ' > ' r_res = self.merge_results(r_res, answer, separator, left=False) @@ -572,7 +578,7 @@ class Tree(object): return results - def create_output_children(self, children, new_results, filters, indices, deprel): + def create_output_children(self, children, new_results, filters, indices): # if create_output_string_form(self) == 'Dogodek': # print('HERE!@@!') # if create_output_string_form(self) == 'utišal': @@ -586,7 +592,7 @@ class Tree(object): else: new_child = sorted(child, key=lambda x: x[0].key) ################# - merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, deprel, filters)) + merged_results.extend(self.merge_results2(new_child, new_results, i_child, indices, filters)) return merged_results @staticmethod @@ -631,17 +637,11 @@ class Tree(object): return merged_results # @staticmethod - def create_answers(self, separated_answers, separated_answers_deprel, answer_length, filters): - # TODO - # node_order = False + def create_answers(self, separated_answers, answer_length, filters): partly_built_trees = [[None] * answer_length] - # 
partly_built_trees_architecture = [[None] * answer_length] partly_built_trees_architecture_indices = [[None] * answer_length] - partly_built_trees_deprel = [[None] * answer_length] built_trees = [] - # built_trees_architecture = [] built_trees_architecture_indices = [] - built_trees_deprel = [] # if create_output_string_form(self) == 'Dogodek': # print('HERE!@@!') @@ -650,64 +650,44 @@ class Tree(object): # child are added for child_i in range(len(separated_answers[0])): new_partly_built_trees = [] - # new_partly_built_trees_architecture = [] new_partly_built_trees_architecture_indices = [] - new_partly_built_trees_deprel = [] # iterate over answers parts for answer_part_i in range(len(separated_answers)): # necessary because some parts do not pass filters and are not added - # if child_i < len(separated_answers[answer_part_i]) and separated_answers[answer_part_i][child_i]: if separated_answers[answer_part_i][child_i]: for tree_part_i, tree_part in enumerate(partly_built_trees): - # if tree_part[answer_part_i] equals None add new element in its place if not tree_part[answer_part_i]: new_tree_part = copy(tree_part) - # new_tree_part_architecture = copy(partly_built_trees_architecture[tree_part_i]) new_tree_part_architecture_indices = copy(partly_built_trees_architecture_indices[tree_part_i]) - new_tree_part_deprel = copy(partly_built_trees_deprel[tree_part_i]) new_tree_part[answer_part_i] = separated_answers[answer_part_i][child_i] - # new_tree_part_architecture[answer_part_i] = separated_answers_architecture[answer_part_i][child_i] new_tree_part_architecture_indices[answer_part_i] = child_i - new_tree_part_deprel[answer_part_i] = separated_answers_deprel[answer_part_i][child_i] completed_tree_part = True for val_i, val in enumerate(new_tree_part): if not val: completed_tree_part = False if completed_tree_part: built_trees.append(new_tree_part) - # built_trees_architecture.append(new_tree_part_architecture) 
built_trees_architecture_indices.append(new_tree_part_architecture_indices) - built_trees_deprel.append(new_tree_part_deprel) else: new_partly_built_trees.append(new_tree_part) - # new_partly_built_trees_architecture.append(new_tree_part_architecture) new_partly_built_trees_architecture_indices.append(new_tree_part_architecture_indices) - new_partly_built_trees_deprel.append(new_tree_part_deprel) else: # pass over repetitions of same words pass - # print('HERE!!!') partly_built_trees.extend(new_partly_built_trees) - # partly_built_trees_architecture.extend(new_partly_built_trees_architecture) partly_built_trees_architecture_indices.extend(new_partly_built_trees_architecture_indices) - partly_built_trees_deprel.extend(new_partly_built_trees_deprel) - l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel, unique_trees_architecture = [], [], [], [] + l_ordered_built_trees, l_ordered_built_trees_index, unique_trees_architecture = [], [], [] if built_trees: # sort 3 arrays by architecture indices - # temp_trees_index, temp_trees, temp_trees_architectures, temp_trees_deprel = (list(t) for t in zip( - # *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_architecture, built_trees_deprel)))) - temp_trees_index, temp_trees, temp_trees_deprel = (list(t) for t in zip( - *sorted(zip(built_trees_architecture_indices, built_trees, built_trees_deprel)))) + temp_trees_index, temp_trees = (list(t) for t in zip( + *sorted(zip(built_trees_architecture_indices, built_trees)))) # order outputs and erase duplicates - # for tree, tree_architecture, tree_architecture_indice in zip(built_trees, built_trees_architecture, built_trees_architecture_indices): - # for tree, tree_architecture, tree_index, tree_deprel in zip(temp_trees, temp_trees_architectures, temp_trees_index, temp_trees_deprel): - for tree, tree_index, tree_deprel in zip(temp_trees, temp_trees_index, temp_trees_deprel): - # new_tree_index, new_tree, new_tree_architecture, 
new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_architecture, tree_deprel)))) - new_tree_index, new_tree, new_tree_deprel = (list(t) for t in zip(*sorted(zip(tree_index, tree, tree_deprel)))) + for tree, tree_index in zip(temp_trees, temp_trees_index): + new_tree_index, new_tree = (list(t) for t in zip(*sorted(zip(tree_index, tree)))) # TODO check if inside new_tree_architecture in ordered_built_trees_architecture and if not append! is_unique = True for unique_tree in unique_trees_architecture: @@ -728,7 +708,6 @@ class Tree(object): # l_ordered_built_trees_architecture.append(new_tree_architecture) l_ordered_built_trees.append(new_tree) l_ordered_built_trees_index.append(new_tree_index) - l_ordered_built_trees_deprel.append(new_tree_deprel) # TODO NODE ORDER = FALSE # else: # @@ -742,23 +721,4 @@ class Tree(object): # print('aaa') # # pass - return l_ordered_built_trees, l_ordered_built_trees_index, l_ordered_built_trees_deprel - - -def create_output_string_form(tree): - return tree.form.get_value() - -def create_output_string_deprel(tree): - return tree.deprel.get_value() - -def create_output_string_lemma(tree): - return tree.lemma.get_value() - -def create_output_string_upos(tree): - return tree.upos.get_value() - -def create_output_string_xpos(tree): - return tree.xpos.get_value() - -def create_output_string_feats(tree): - return tree.feats.get_value() + return l_ordered_built_trees, l_ordered_built_trees_index diff --git a/dependency-parsetree.py b/dependency-parsetree.py index 5ff9132..3679d2b 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -6,6 +6,7 @@ import hashlib import os import pickle import re +import string import time import timeit from multiprocessing import Pool @@ -32,6 +33,7 @@ from Tree import Tree, create_output_string_form, create_output_string_deprel, c # feats_detailed_list = [] # feats_detailed_dict = {key: {} for key in feats_detailed_list} +from generic import get_collocabilities def 
def get_unigrams(input_data):
    """Unpack one work item and return the unigram keys of its tree.

    ``input_data`` is a 4-tuple ``(tree, query_tree,
    create_output_string_funct, filters)``; the query tree is not used
    here.  The single-argument shape lets the function be handed to
    ``Pool.map``.
    """
    root, _query_tree, output_functs, active_filters = input_data
    return root.get_unigrams(output_functs, active_filters)
all_subtrees: @@ -477,10 +498,19 @@ def main(): # for tree_i, tree in enumerate(all_trees[-5:]): # for tree_i, tree in enumerate(all_trees): for tree_i, tree in enumerate(all_trees[1:]): + input_data = (tree, query_tree, create_output_string_functs, filters) + if filters['association_measures']: + unigrams = get_unigrams(input_data) + for unigram in unigrams: + if unigram in unigrams_dict: + unigrams_dict[unigram] += 1 + else: + unigrams_dict[unigram] = 1 + # for tree_i, tree in enumerate(all_trees[1:]): # text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje. # for tree_i, tree in enumerate(all_trees[5170:]): # for tree in all_trees: - subtrees = tree_calculations((tree, query_tree, create_output_string_functs, filters)) + subtrees = tree_calculations(input_data) for query_results in subtrees: for r in query_results: if filters['node_order']: @@ -525,33 +555,39 @@ def main(): len_words = tree_size_range[-1] else: len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1) - header = ["Structure"] + ["Node #" + str(i) + "-" + node_type for i in range(1, len_words + 1) for node_type in node_types] + ['Absolute frequency'] + header = ["Structure"] + ["Node " + string.ascii_uppercase[i] + "-" + node_type for i in range(len_words) for node_type in node_types] + ['Absolute frequency'] header += ['Relative frequency'] if filters['node_order']: header += ['Order'] - if config.getboolean('settings', 'nodes_number'): + if filters['nodes_number']: header += ['Number of nodes'] - if config.getboolean('settings', 'print_root'): + if filters['print_root']: header += ['Root node'] + if filters['association_measures']: + header += ['MI', 'MI3', 'Dice', 't-score', 'simple-LL'] # header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency'] writer.writerow(header) - if config.getint('settings', 
import math
import sys


def create_output_string_form(tree):
    """Return the value of the node's ``form`` attribute."""
    return tree.form.get_value()


def create_output_string_deprel(tree):
    """Return the value of the node's ``deprel`` attribute."""
    return tree.deprel.get_value()


def create_output_string_lemma(tree):
    """Return the value of the node's ``lemma`` attribute."""
    return tree.lemma.get_value()


def create_output_string_upos(tree):
    """Return the value of the node's ``upos`` attribute."""
    return tree.upos.get_value()


def create_output_string_xpos(tree):
    """Return the value of the node's ``xpos`` attribute."""
    return tree.xpos.get_value()


def create_output_string_feats(tree):
    """Return the value of the node's ``feats`` attribute."""
    return tree.feats.get_value()


def generate_key(node, create_output_strings, print_lemma=True):
    """Build the attribute array and the textual key of a single node.

    :param node: object exposing the attributes read by the given functions
    :param create_output_strings: non-empty list of ``create_output_string_*``
        functions, applied to ``node`` in order
    :param print_lemma: when true, the lemma value is prefixed with ``L=``
        in the key (never in the array)
    :return: ``(array, key)`` where ``array`` is ``[[value, ...]]`` and
        ``key`` is the values joined with ``&`` (or the single value itself
        when only one function was given)
    """
    # Evaluate every output function exactly once and reuse the values for
    # both the array and the key.
    values = [create_output_string(node) for create_output_string in create_output_strings]
    array = [values]

    if print_lemma and create_output_string_lemma in create_output_strings:
        key_parts = [
            'L=' + value if funct is create_output_string_lemma else value
            for funct, value in zip(create_output_strings, values)
        ]
    else:
        key_parts = values

    if len(values) > 1:
        key = '&'.join(key_parts)
    else:
        key = key_parts[0]

    return array, key


def get_collocabilities(ngram, unigrams_dict, corpus_size):
    """Compute association measures for one n-gram result.

    :param ngram: dict with ``'object'`` (its ``array`` attribute holds one
        list of attribute values per node) and ``'number'`` (observed
        frequency of the n-gram)
    :param unigrams_dict: mapping from unigram key to its frequency; keys
        are the node values joined with ``&``, without the ``L=`` prefix
    :param corpus_size: number of all words in the corpus
    :return: list of five strings formatted with four decimals, in the
        order ``['MI', 'MI3', 'Dice', 't-score', 'simple-LL']``
    :raises KeyError: if a node of the n-gram is missing in ``unigrams_dict``
    """
    nodes = ngram['object'].array

    sum_fwi = 0.0
    mul_fwi = 1.0
    for key_array in nodes:
        # Rebuild the unigram key the same way generate_key does with
        # print_lemma=False.
        key = '&'.join(key_array) if len(key_array) > 1 else key_array[0]
        frequency = unigrams_dict[key]
        sum_fwi += frequency
        mul_fwi *= frequency
    # NOTE: no "negative product" overflow guard is needed here: mul_fwi is
    # a float, and a product of non-negative floats never wraps around.

    # number of all words
    N = corpus_size
    # n of ngram
    n = len(nodes)
    # observed and expected frequency
    O = ngram['number']
    E = mul_fwi / pow(N, n - 1)

    mi = math.log2(O / E)
    mi3 = math.log2(pow(O, 3) / E)
    dice = n * O / sum_fwi
    tscore = (O - E) / math.sqrt(O)
    # Simple log-likelihood is defined with the natural logarithm; a base-10
    # logarithm would not be commensurable with the linear (O - E) term.
    simplell = 2 * (O * math.log(O / E) - (O - E))

    return ['%.4f' % measure for measure in (mi, mi3, dice, tscore, simplell)]