Added unlimited ngrams
This commit is contained in:
parent
a2ce2e0fee
commit
efe11ff83c
|
@ -0,0 +1,9 @@
|
|||
|
||||
class Result(object):
    """Holds one result key together with its ordering information."""

    def __init__(self, string, order):
        """Create a result for ``string`` with its initial ``order`` entry.

        :param string: stored as ``key`` and as the only element of ``key_split``
        :param order: stored as the only element of ``build_order``
        """
        # order with numbers from 0 to n of n-gram (starts out empty)
        self.final_order = ''
        # order with original numbers in sentences
        self.build_order = [order]
        self.key, self.key_split = string, [string]
|
30
Tree.py
30
Tree.py
|
@ -221,7 +221,7 @@ class Tree(object):
|
|||
all_new_partial_answers_architecture = [[] for query_part in child_queries_flatten]
|
||||
all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten]
|
||||
|
||||
if filters['caching']:
|
||||
# if filters['caching']:
|
||||
# erase duplicate queries
|
||||
child_queries_flatten_dedup = []
|
||||
child_queries_flatten_dedup_indices = []
|
||||
|
@ -237,7 +237,7 @@ class Tree(object):
|
|||
# ask children all queries/partial queries
|
||||
for child in children:
|
||||
# obtain children results
|
||||
if filters['caching']:
|
||||
# if filters['caching']:
|
||||
new_partial_answers_architecture_dedup, new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup,
|
||||
create_output_string, filters)
|
||||
|
||||
|
@ -249,19 +249,19 @@ class Tree(object):
|
|||
all_new_partial_answers_architecture[i].append(new_partial_answers_architecture_dedup[flattened_index])
|
||||
all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
|
||||
|
||||
else:
|
||||
new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
|
||||
permanent_query_trees, child_queries_flatten,
|
||||
create_output_string, filters)
|
||||
|
||||
assert len(new_partial_answers) == len(child_queries_flatten)
|
||||
|
||||
for i, new_partial_subtree in enumerate(new_partial_answers):
|
||||
all_new_partial_answers[i].append(new_partial_subtree)
|
||||
all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
|
||||
# if len(new_partial_answers_architecture[i]) > 1:
|
||||
# print('HERE!!!')
|
||||
all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
|
||||
# else:
|
||||
# new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
|
||||
# permanent_query_trees, child_queries_flatten,
|
||||
# create_output_string, filters)
|
||||
#
|
||||
# assert len(new_partial_answers) == len(child_queries_flatten)
|
||||
#
|
||||
# for i, new_partial_subtree in enumerate(new_partial_answers):
|
||||
# all_new_partial_answers[i].append(new_partial_subtree)
|
||||
# all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
|
||||
# # if len(new_partial_answers_architecture[i]) > 1:
|
||||
# # print('HERE!!!')
|
||||
# all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
|
||||
|
||||
# add 6 queries from 3 split up
|
||||
# self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import argparse
|
||||
import configparser
|
||||
import copy
|
||||
import csv
|
||||
import hashlib
|
||||
import os
|
||||
|
@ -245,6 +246,78 @@ def chunkify(a, n):
|
|||
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
||||
|
||||
|
||||
def add_node(tree):
    """Attach one new empty child node (``{}``) to ``tree`` in place.

    The ``'children'`` list is created first when ``tree`` has none yet.
    """
    tree.setdefault('children', []).append({})
|
||||
|
||||
|
||||
# walk over all nodes in tree and add a node to each possible node
|
||||
def tree_grow(orig_tree):
    """Return all trees obtained by adding one new node somewhere in ``orig_tree``.

    ``orig_tree`` itself is never modified; every returned tree is a deep copy.
    The first result has the new node attached directly to the root, the
    remaining results have it attached somewhere inside one of the children.

    :param orig_tree: nested dict tree (child nodes live under ``'children'``)
    :return: list of grown trees (may contain structurally equal trees)
    """
    # variant with the new node as a direct child of this node
    grown_root = copy.deepcopy(orig_tree)
    add_node(grown_root)
    grown = [grown_root]

    if 'children' not in orig_tree:
        return grown

    # grow every child subtree, then substitute each grown subtree back
    # into a fresh copy of the original tree at the child's position
    variants_per_child = [tree_grow(subtree) for subtree in orig_tree['children']]
    for position, variants in enumerate(variants_per_child):
        for variant in variants:
            candidate = copy.deepcopy(orig_tree)
            candidate['children'][position] = variant
            grown.append(candidate)

    return grown
|
||||
|
||||
|
||||
def compare_trees(tree1, tree2):
    """Tell whether two trees have the same shape, ignoring the order of children.

    Two empty nodes (``{}``) are equal. Otherwise both nodes must have a
    ``'children'`` list of the same length, and the children of ``tree1``
    must pair up one-to-one with equal children of ``tree2``.

    :param tree1: nested dict tree (child nodes live under ``'children'``)
    :param tree2: nested dict tree to compare against
    :return: True when the trees are structurally equal, False otherwise
    """
    if tree1 == {} and tree2 == {}:
        return True

    if 'children' not in tree1 or 'children' not in tree2 or len(tree1['children']) != len(tree2['children']):
        return False

    # indices of tree2 children that are already paired with a tree1 child
    matched_indices = set()

    for child1 in tree1['children']:
        for child2_i, child2 in enumerate(tree2['children']):
            # A child of tree2 may be used only once. Skipping it here is
            # essential: otherwise [{}, {}] would wrongly equal [{}, X].
            if child2_i in matched_indices:
                continue
            if compare_trees(child1, child2):
                matched_indices.add(child2_i)
                break
        else:
            # no unused child of tree2 matches this child of tree1
            return False

    return True
|
||||
|
||||
def create_ngrams_query_trees(n, trees):
    """Grow ``trees`` by one node per round for ``n - 1`` rounds.

    In every round each tree is expanded with ``tree_grow`` and the results
    are collected, skipping any tree that ``compare_trees`` reports as equal
    to one already collected in that round.

    :param n: number of rounds plus one; for ``n <= 1`` ``trees`` is returned as is
    :param trees: list of starting trees
    :return: list of trees produced by the last round
    """
    for _ in range(n - 1):
        unique_trees = []
        for tree in trees:
            for candidate in tree_grow(tree):
                # keep the candidate only if no equal tree was collected already
                if not any(compare_trees(candidate, kept) for kept in unique_trees):
                    unique_trees.append(candidate)
        trees = unique_trees
    return trees
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
|
@ -262,24 +335,34 @@ def main():
|
|||
# config.read('config.ini')
|
||||
# create queries
|
||||
ngrams = 0
|
||||
if config.getint('settings', 'ngrams') == 2:
|
||||
ngrams = 2
|
||||
query_tree = [{"children": [{}]}]
|
||||
elif config.getint('settings', 'ngrams') == 3:
|
||||
ngrams = 3
|
||||
query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}]
|
||||
elif config.getint('settings', 'ngrams') == 4:
|
||||
ngrams = 4
|
||||
query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}]
|
||||
elif config.getint('settings', 'ngrams') == 5:
|
||||
ngrams = 5
|
||||
query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
|
||||
{"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
|
||||
{"children": [{"children": [{"children": [{"children": [{}]}]}]}]}]
|
||||
|
||||
|
||||
|
||||
# if config.getint('settings', 'ngrams') == 2:
|
||||
# ngrams = 2
|
||||
# query_tree = [{"children": [{}]}]
|
||||
# elif config.getint('settings', 'ngrams') == 3:
|
||||
# ngrams = 3
|
||||
# query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}]
|
||||
# elif config.getint('settings', 'ngrams') == 4:
|
||||
# ngrams = 4
|
||||
# query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}]
|
||||
# elif config.getint('settings', 'ngrams') == 5:
|
||||
# ngrams = 5
|
||||
# query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
|
||||
# {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
|
||||
# {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}, {'children': [{'children': [{}, {}, {}]}]}]
|
||||
if config.getint('settings', 'ngrams') > 1:
|
||||
query_tree = create_ngrams_query_trees(config.getint('settings', 'ngrams'), [{}])
|
||||
else:
|
||||
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')]
|
||||
# order_independent_queries(query_tree)
|
||||
|
||||
# 261 - 9 grams
|
||||
# 647 - 10 grams
|
||||
# 1622 - 11 grams
|
||||
# 4126 - 12 grams
|
||||
# 10598 - 13 grams
|
||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
|
||||
|
||||
|
||||
|
@ -302,7 +385,7 @@ def main():
|
|||
result_dict = {}
|
||||
filters = {}
|
||||
filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
|
||||
filters['caching'] = config.getboolean('settings', 'caching')
|
||||
# filters['caching'] = config.getboolean('settings', 'caching')
|
||||
filters['dependency_type'] = config.get('settings', 'dependency_type') == 'labeled'
|
||||
if config.has_option('settings', 'label_whitelist'):
|
||||
filters['label_whitelist'] = config.get('settings', 'label_whitelist').split('|')
|
||||
|
|
Loading…
Reference in New Issue
Block a user