Added unlimited ngrams
parent: a2ce2e0fee
commit: efe11ff83c

@@ -0,0 +1,9 @@
+class Result(object):
+    def __init__(self, string, order):
+        self.key = string
+        self.key_split = [string]
+        # order with original numbers in sentences
+        self.build_order = [order]
+        # order with numbers from 0 to n of n-gram
+        self.final_order = ''
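
The hunk above adds a small holder class for a single matched result. A minimal sketch of how it might be instantiated, following the constructor above (illustrative only, with made-up values; not code from the repository):

# illustrative use of Result (made-up values, not part of the commit)
r = Result('nsubj', 3)             # key string plus its original position in the sentence
assert r.key == 'nsubj'
assert r.key_split == ['nsubj']    # starts out as a one-element split
assert r.build_order == [3]        # "order with original numbers in sentences"
r.final_order = '0'                # later renumbered 0..n-1 within the n-gram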

Tree.py (66 changed lines)

@@ -221,47 +221,47 @@ class Tree(object):
         all_new_partial_answers_architecture = [[] for query_part in child_queries_flatten]
         all_new_partial_answers_deprel = [[] for query_part in child_queries_flatten]

-        if filters['caching']:
+        # if filters['caching']:
         # erase duplicate queries
         child_queries_flatten_dedup = []
         child_queries_flatten_dedup_indices = []
         for query_part in child_queries_flatten:
             try:
                 index = child_queries_flatten_dedup.index(query_part)
             except ValueError:
                 index = len(child_queries_flatten_dedup)
                 child_queries_flatten_dedup.append(query_part)

             child_queries_flatten_dedup_indices.append(index)

         # ask children all queries/partial queries
         for child in children:
             # obtain children results
-            if filters['caching']:
+            # if filters['caching']:
             new_partial_answers_architecture_dedup, new_partial_answers_dedup, new_complete_answers = child.get_subtrees(permanent_query_trees, child_queries_flatten_dedup,
                                                                                                                           create_output_string, filters)

             assert len(new_partial_answers_dedup) == len(child_queries_flatten_dedup)

             # duplicate results again on correct places
             for i, flattened_index in enumerate(child_queries_flatten_dedup_indices):
                 all_new_partial_answers[i].append(new_partial_answers_dedup[flattened_index])
                 all_new_partial_answers_architecture[i].append(new_partial_answers_architecture_dedup[flattened_index])
                 all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))

-            else:
-                new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
-                    permanent_query_trees, child_queries_flatten,
-                    create_output_string, filters)
-
-                assert len(new_partial_answers) == len(child_queries_flatten)
-
-                for i, new_partial_subtree in enumerate(new_partial_answers):
-                    all_new_partial_answers[i].append(new_partial_subtree)
-                    all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
-                    # if len(new_partial_answers_architecture[i]) > 1:
-                    #     print('HERE!!!')
-                    all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))
+            # else:
+            #     new_partial_answers_architecture, new_partial_answers, new_complete_answers = child.get_subtrees(
+            #         permanent_query_trees, child_queries_flatten,
+            #         create_output_string, filters)
+            #
+            #     assert len(new_partial_answers) == len(child_queries_flatten)
+            #
+            #     for i, new_partial_subtree in enumerate(new_partial_answers):
+            #         all_new_partial_answers[i].append(new_partial_subtree)
+            #         all_new_partial_answers_architecture[i].append(new_partial_answers_architecture[i])
+            #         # if len(new_partial_answers_architecture[i]) > 1:
+            #         #     print('HERE!!!')
+            #         all_new_partial_answers_deprel[i].append(create_output_string_deprel(child))

         # add 6 queries from 3 split up
         # self.group_results(new_partial_subtrees, child_queries_metadata, all_query_indices,
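
With the if/else around it commented out, the deduplication path above now always runs: the flattened child queries are reduced to unique entries, each child answers only those, and the saved index list copies every answer back to its original slot. A stand-alone sketch of that pattern with made-up query strings (illustrative, not repository code):

# illustrative sketch of the dedup / re-expand pattern used above (made-up data)
queries = ['upos=NOUN', 'deprel=nsubj', 'upos=NOUN']
dedup, indices = [], []
for q in queries:
    try:
        i = dedup.index(q)
    except ValueError:
        i = len(dedup)
        dedup.append(q)
    indices.append(i)

answers = [q.upper() for q in dedup]        # stand-in for child.get_subtrees(...)
expanded = [answers[i] for i in indices]    # duplicate results again on correct places
assert expanded == ['UPOS=NOUN', 'DEPREL=NSUBJ', 'UPOS=NOUN']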

@@ -1,5 +1,6 @@
 import argparse
 import configparser
+import copy
 import csv
 import hashlib
 import os

@@ -245,6 +246,78 @@ def chunkify(a, n):
     return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


+def add_node(tree):
+    if 'children' in tree:
+        tree['children'].append({})
+    else:
+        tree['children'] = [{}]
+
+
+# walk over all nodes in tree and add a node to each possible node
+def tree_grow(orig_tree):
+    new_trees = []
+    new_tree = copy.deepcopy(orig_tree)
+    add_node(new_tree)
+    new_trees.append(new_tree)
+    if 'children' in orig_tree:
+        children = []
+        for child_tree in orig_tree['children']:
+            children.append(tree_grow(child_tree))
+        for i, child in enumerate(children):
+            for child_res in child:
+                new_tree = copy.deepcopy(orig_tree)
+                new_tree['children'][i] = child_res
+                new_trees.append(new_tree)
+
+    return new_trees
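
A quick hand-checked look at what tree_grow returns (illustrative, not part of the commit): growing the empty tree attaches the only possible new node, and growing a two-node chain yields both three-node shapes, one new leaf per possible parent.

# illustrative only; assumes add_node and tree_grow from the hunk above are defined
print(tree_grow({}))
# [{'children': [{}]}]
print(tree_grow({'children': [{}]}))
# [{'children': [{}, {}]}, {'children': [{'children': [{}]}]}]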
+
+
+def compare_trees(tree1, tree2):
+    if tree1 == {} and tree2 == {}:
+        return True
+
+    if 'children' not in tree1 or 'children' not in tree2 or len(tree1['children']) != len(tree2['children']):
+        return False
+
+    children2_connections = []
+
+    for child1_i, child1 in enumerate(tree1['children']):
+        child_duplicated = False
+        for child2_i, child2 in enumerate(tree2['children']):
+            if child2_i in children2_connections:
+                pass
+            if compare_trees(child1, child2):
+                children2_connections.append(child2_i)
+                child_duplicated = True
+                break
+        if not child_duplicated:
+            return False
+
+    return True
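
compare_trees treats two shapes as equal regardless of the order in which their children are listed, which is what lets the generator below drop duplicate shapes. A hand-checked illustration (not part of the commit):

# illustrative only; assumes compare_trees from the hunk above is defined
a = {'children': [{'children': [{}]}, {}]}
b = {'children': [{}, {'children': [{}]}]}
print(compare_trees(a, b))                       # True: same shape, children listed in a different order
print(compare_trees(a, {'children': [{}, {}]}))  # False: different shape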
+
+
+def create_ngrams_query_trees(n, trees):
+    for i in range(n - 1):
+        new_trees = []
+        for tree in trees:
+            # append new_tree only if it is not already inside
+            for new_tree in tree_grow(tree):
+                duplicate = False
+                for confirmed_new_tree in new_trees:
+                    if compare_trees(new_tree, confirmed_new_tree):
+                        duplicate = True
+                        break
+                if not duplicate:
+                    new_trees.append(new_tree)
+
+        trees = new_trees
+        # delete_duplicates(trees)
+        # print('here')
+    # tree_grow(tree)
+    # tree_grow(tree)
+    # tree['children'] = [{}]
+    return trees
+
+
 def main():
     parser = argparse.ArgumentParser()
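
Putting the three helpers together: starting from the single-node tree [{}], each pass of the outer loop grows every tree by one node and keeps only shapes that compare_trees has not seen yet. For n = 3 this reproduces exactly the two query-tree shapes that used to be hard-coded in main() (hand-checked, illustrative only):

# illustrative only; assumes the functions from the hunk above are defined
print(create_ngrams_query_trees(3, [{}]))
# [{'children': [{}, {}]}, {'children': [{'children': [{}]}]}]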

@@ -262,24 +335,34 @@ def main():
     # config.read('config.ini')
     # create queries
     ngrams = 0
-    if config.getint('settings', 'ngrams') == 2:
-        ngrams = 2
-        query_tree = [{"children": [{}]}]
-    elif config.getint('settings', 'ngrams') == 3:
-        ngrams = 3
-        query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}]
-    elif config.getint('settings', 'ngrams') == 4:
-        ngrams = 4
-        query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}]
-    elif config.getint('settings', 'ngrams') == 5:
-        ngrams = 5
-        query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
-                      {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
-                      {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}]
+    # if config.getint('settings', 'ngrams') == 2:
+    #     ngrams = 2
+    #     query_tree = [{"children": [{}]}]
+    # elif config.getint('settings', 'ngrams') == 3:
+    #     ngrams = 3
+    #     query_tree = [{"children": [{}, {}]}, {"children": [{"children": [{}]}]}]
+    # elif config.getint('settings', 'ngrams') == 4:
+    #     ngrams = 4
+    #     query_tree = [{"children": [{}, {}, {}]}, {"children": [{"children": [{}, {}]}]}, {"children": [{"children": [{}]}, {}]}, {"children": [{"children": [{"children": [{}]}]}]}]
+    # elif config.getint('settings', 'ngrams') == 5:
+    #     ngrams = 5
+    #     query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
+    #                   {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
+    #                   {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}, {'children': [{'children': [{}, {}, {}]}]}]
+    if config.getint('settings', 'ngrams') > 1:
+        query_tree = create_ngrams_query_trees(config.getint('settings', 'ngrams'), [{}])
     else:
         query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')]
     # order_independent_queries(query_tree)

+    # 261 - 9 grams
+    # 647 - 10 grams
+    # 1622 - 11 grams
+    # 4126 - 12 grams
+    # 10598 - 13 grams
     (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
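
Because the hard-coded 2- to 5-gram tables are now commented out, any tree size can be requested through the existing ngrams option; the trailing comments (261 for 9-grams up to 10598 for 13-grams) appear to record how quickly the number of generated query trees grows. A small sketch of the configuration this branch reads (section and option names are taken from the code above; the value 7 is made up):

# illustrative only: the setting consulted by the branch above (value is invented)
import configparser
config = configparser.ConfigParser()
config.read_string('[settings]\nngrams = 7\n')
assert config.getint('settings', 'ngrams') > 1   # any n > 1 now goes through create_ngrams_query_trees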

@@ -302,7 +385,7 @@ def main():
     result_dict = {}
     filters = {}
     filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
-    filters['caching'] = config.getboolean('settings', 'caching')
+    # filters['caching'] = config.getboolean('settings', 'caching')
     filters['dependency_type'] = config.get('settings', 'dependency_type') == 'labeled'
     if config.has_option('settings', 'label_whitelist'):
         filters['label_whitelist'] = config.get('settings', 'label_whitelist').split('|')
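
With caching no longer read here (and the caching branch in Tree.py made unconditional), a typical run ends up with a filters dictionary along these lines (illustrative values only; the real ones depend on the config file):

# illustrative only; actual values come from the configuration
filters = {
    'node_order': True,                   # node_order = fixed
    'dependency_type': True,              # dependency_type = labeled
    'label_whitelist': ['nsubj', 'obj'],  # label_whitelist = nsubj|obj
}
# note there is no 'caching' key any more; the caching code path is now always used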