From efe11ff83c16df3445c0eccf2c9b3853e60ee4c7 Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 15 Nov 2019 17:43:37 +0100 Subject: [PATCH] Added unlimited ngrams --- Result.py | 9 ++++ Tree.py | 74 +++++++++++++------------- dependency-parsetree.py | 113 ++++++++++++++++++++++++++++++++++------ 3 files changed, 144 insertions(+), 52 deletions(-) diff --git a/Result.py b/Result.py index e69de29..eea2d2e 100644 --- a/Result.py +++ b/Result.py @@ -0,0 +1,9 @@ + +class Result(object): + def __init__(self, string, order): + self.key = string + self.key_split = [string] + # order with original numbers in sentences + self.build_order = [order] + # order with numbers from 0 to n of n-gram + self.final_order = '' diff --git a/Tree.py b/Tree.py index dbaf02f..958fbd3 100644 --- a/Tree.py +++ b/Tree.py @@ -221,47 +221,47 @@ class Tree(object): all_new_partial_answers_architecture = [[] for query_part in child_queries_flatten] all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten] - if filters['caching']: - # erase duplicate queries - child_queries_flatten_dedup = [] - child_queries_flatten_dedup_indices = [] - for query_part in child_queries_flatten: - try: - index = child_queries_flatten_dedup.index(query_part) - except ValueError: - index = len(child_queries_flatten_dedup) - child_queries_flatten_dedup.append(query_part) - - child_queries_flatten_dedup_indices.append(index) + # if filters['caching']: + # erase duplicate queries + child_queries_flatten_dedup = [] + child_queries_flatten_dedup_indices = [] + for query_part in child_queries_flatten: + try: + index = child_queries_flatten_dedup.index(query_part) + except ValueError: + index = len(child_queries_flatten_dedup) + child_queries_flatten_dedup.append(query_part) + + child_queries_flatten_dedup_indices.append(index) # ask children all queries/partial queries for child in children: # obtain children results - if filters['caching']: - new_partial_answers_architecture_dedup, 
new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup, - create_output_string, filters) - - assert len(new_partial_answers_dedup) == len(child_queries_flatten_dedup) - - # duplicate results again on correct places - for i, flattened_index in enumerate(child_queries_flatten_dedup_indices): - all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index]) - all_new_partial_answers_architecture[i].append(new_partial_answers_architecture_dedup[flattened_index]) - all_new_partial_answers_deprel[i].append(create_output_string_deprel(child)) - - else: - new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees( - permanent_query_trees, child_queries_flatten, - create_output_string, filters) - - assert len(new_partial_answers) == len(child_queries_flatten) - - for i, new_partial_subtree in enumerate(new_partial_answers): - all_new_partial_answers[i].append(new_partial_subtree) - all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i]) - # if len(new_partial_answers_architecture[i]) > 1: - # print('HERE!!!') - all_new_partial_answers_deprel[i].append(create_output_string_deprel(child)) + # if filters['caching']: + new_partial_answers_architecture_dedup, new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup, + create_output_string, filters) + + assert len(new_partial_answers_dedup) == len(child_queries_flatten_dedup) + + # duplicate results again on correct places + for i, flattened_index in enumerate(child_queries_flatten_dedup_indices): + all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index]) + all_new_partial_answers_architecture[i].append(new_partial_answers_architecture_dedup[flattened_index]) + all_new_partial_answers_deprel[i].append(create_output_string_deprel(child)) + + # else: + # new_partial_answers_architecture, 
new_partial_answers, new_complete_answers = child.get_subtrees( + # permanent_query_trees, child_queries_flatten, + # create_output_string, filters) + # + # assert len(new_partial_answers) == len(child_queries_flatten) + # + # for i, new_partial_subtree in enumerate(new_partial_answers): + # all_new_partial_answers[i].append(new_partial_subtree) + # all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i]) + # # if len(new_partial_answers_architecture[i]) > 1: + # # print('HERE!!!') + # all_new_partial_answers_deprel[i].append(create_output_string_deprel(child)) # add 6 queries from 3 split up # self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices, diff --git a/dependency-parsetree.py b/dependency-parsetree.py index 9cd19d5..91acf43 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -1,5 +1,6 @@ import argparse import configparser +import copy import csv import hashlib import os @@ -245,6 +246,78 @@ def chunkify(a, n): return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) +def add_node(tree): + if 'children' in tree: + tree['children'].append({}) + else: + tree['children'] = [{}] + + +# walk over all nodes in tree and add a node to each possible node +def tree_grow(orig_tree): + new_trees = [] + new_tree = copy.deepcopy(orig_tree) + add_node(new_tree) + new_trees.append(new_tree) + if 'children' in orig_tree: + children = [] + for child_tree in orig_tree['children']: + children.append(tree_grow(child_tree)) + for i, child in enumerate(children): + for child_res in child: + new_tree = copy.deepcopy(orig_tree) + new_tree['children'][i] = child_res + new_trees.append(new_tree) + + return new_trees + + +def compare_trees(tree1, tree2): + if tree1 == {} and tree2 == {}: + return True + + if 'children' not in tree1 or 'children' not in tree2 or len(tree1['children']) != len(tree2['children']): + return False + + children2_connections = [] + + for child1_i, child1 in 
enumerate(tree1['children']): + child_duplicated = False + for child2_i, child2 in enumerate(tree2['children']): + if child2_i in children2_connections: + continue + if compare_trees(child1, child2): + children2_connections.append(child2_i) + child_duplicated = True + break + if not child_duplicated: + return False + + return True + +def create_ngrams_query_trees(n, trees): + for i in range(n - 1): + new_trees = [] + for tree in trees: + # append new_tree only if it is not already inside + for new_tree in tree_grow(tree): + duplicate = False + for confirmed_new_tree in new_trees: + if compare_trees(new_tree, confirmed_new_tree): + duplicate = True + break + if not duplicate: + new_trees.append(new_tree) + + trees = new_trees + # delete_duplicates(trees) + # print('here') + # tree_grow(tree) + # tree_grow(tree) + # tree['children'] = [{}] + return trees + + def main(): parser = argparse.ArgumentParser() @@ -262,24 +335,34 @@ def main(): # config.read('config.ini') # create queries ngrams = 0 - if config.getint('settings', 'ngrams') == 2: - ngrams = 2 - query_tree = [{"children": [{}]}] - elif config.getint('settings', 'ngrams') == 3: - ngrams = 3 - query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}] - elif config.getint('settings', 'ngrams') == 4: - ngrams = 4 - query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}] - elif config.getint('settings', 'ngrams') == 5: - ngrams = 5 - query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]}, - {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]}, - {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}] + + + + # if 
config.getint('settings', 'ngrams') == 2: + # ngrams = 2 + # query_tree = [{"children": [{}]}] + # elif config.getint('settings', 'ngrams') == 3: + # ngrams = 3 + # query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}] + # elif config.getint('settings', 'ngrams') == 4: + # ngrams = 4 + # query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}] + # elif config.getint('settings', 'ngrams') == 5: + # ngrams = 5 + # query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]}, + # {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]}, + # {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}, {'children': [{'children': [{}, {}, {}]}]}] + if config.getint('settings', 'ngrams') > 1: + query_tree = create_ngrams_query_trees(config.getint('settings', 'ngrams'), [{}]) else: query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')] # order_independent_queries(query_tree) + # 261 - 9 grams + # 647 - 10 grams + # 1622 - 11 grams + # 4126 - 12 grams + # 10598 - 13 grams (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config) @@ -302,7 +385,7 @@ def main(): result_dict = {} filters = {} filters['node_order'] = config.get('settings', 'node_order') == 'fixed' - filters['caching'] = config.getboolean('settings', 'caching') + # filters['caching'] = config.getboolean('settings', 'caching') filters['dependency_type'] = config.get('settings', 'dependency_type') == 'labeled' if config.has_option('settings', 'label_whitelist'): filters['label_whitelist'] = config.get('settings', 'label_whitelist').split('|')