Added ngram=2 calculations and fixed some bugs
parent 31496a4267
commit 97136ca5c3
.gitignore (vendored) | 1
@@ -2,3 +2,4 @@
 venv/
 internal_saves/
 __pycache__/
+results/
Tree.py | 28
@@ -46,15 +46,15 @@ class Tree(object):
             ('deprel' not in query_tree or query_tree['deprel'] == self.deprel.get_value)
 
     def generate_children_queries(self, all_query_indices, children):
-        subtree_outcomes = []
+        partial_results = {}
         # list of pairs (index of query in group, group of query, is permanent)
         child_queries_metadata = []
        for child_index, child in enumerate(children):
             new_queries = []
 
             # add continuation queries to children
-            for (result_part_index, result_index, is_permanent), subtree_outcome in zip(child_queries_metadata, subtree_outcomes):
-                if subtree_outcome:
+            for result_part_index, result_index, is_permanent in child_queries_metadata:
+                if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0:
                     if len(all_query_indices[result_index][0]) > result_part_index + 1:
                         new_queries.append((result_part_index + 1, result_index, is_permanent))
                     # else:
@@ -72,7 +72,7 @@ class Tree(object):
             for result_part_index, result_index, _ in child_queries_metadata:
                 child_queries.append(all_query_indices[result_index][0][result_part_index])
 
-            subtree_outcomes = yield child, child_queries, child_queries_metadata
+            partial_results = yield child, child_queries, child_queries_metadata
         yield None, None, None
 
     def add_subtrees(self, old_subtree, new_subtree):
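Note on the two hunks above: the hand-off between the caller (which later does children_queries_generator.send(...)) and generate_children_queries changes shape. The coroutine no longer receives a flat list of boolean subtree_outcomes; it is sent the dict of partial results and only schedules a continuation query when the previous query part actually produced matches. A minimal stand-alone sketch of that send/yield pattern, with simplified names that are not the project's real signatures:

    def children_queries(metadata):
        """Toy coroutine: yield the next child's work, then receive the caller's
        partial-results dict via send() and keep only the continuation queries
        whose previous part already produced a match."""
        partial_results = {}
        for child in ('child-a', 'child-b'):
            new_queries = [
                (part + 1, index, perm)
                for part, index, perm in metadata
                if partial_results.get(index, {}).get(part)  # same test as the new `if` above
            ]
            partial_results = yield child, new_queries
        yield None, None

    gen = children_queries([(0, 7, False)])
    print(next(gen))                      # ('child-a', []) - nothing matched yet
    print(gen.send({7: {0: ['match']}}))  # ('child-b', [(1, 7, False)])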
@@ -105,18 +105,6 @@ class Tree(object):
                 if outcome:
                     new_results = self.get_results(partial_results_dict, result_index, result_part, outcome, len(all_query_indices[result_index][0]))
                     if new_results:
-                        # if is_permanent:
-                        # if result_index in completed_subtrees:
-                        # self.add_subtrees(completed_subtrees[result_index], new_results)
-                        # else:
-                        # completed_subtrees[result_index] = new_results
-                        # comment
-                        # self.add_subtrees(completed_subtrees[result_index], new_results)
-                        # else:
-                        # if result_index in completed_subtrees:
-                        # self.add_subtrees(partial_subtrees[result_index], new_results)
-                        # else:
-                        # partial_subtrees[result_index] = new_results
                         self.add_subtrees(partial_subtrees[result_index], new_results)
                 else:
                     if not is_permanent:
@@ -142,7 +130,7 @@ class Tree(object):
 
             for i in range(len(new_completed_subtrees)):
                 completed_subtrees[i].extend(new_completed_subtrees[i])
-            child, child_queries, child_queries_metadata = children_queries_generator.send(new_partial_subtrees)
+            child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
             child_index += 1
 
         return partial_subtrees, completed_subtrees
@@ -168,7 +156,6 @@ class Tree(object):
                 r_all_query_indices.append((permanent_query_tree['r_children'], True))
 
         active_temporary_query_trees = []
-        # partial_subtrees = [[] for i in range(len(temporary_query_trees))]
        for i, temporary_query_tree in enumerate(temporary_query_trees):
             if self.fits_static_requirements(temporary_query_tree):
                 active_temporary_query_trees.append(temporary_query_tree)
@@ -177,11 +164,6 @@ class Tree(object):
                 l_all_query_indices.append((temporary_query_tree['l_children'], False))
             if 'r_children' in temporary_query_tree:
                 r_all_query_indices.append((temporary_query_tree['r_children'], False))
-            # if 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree:
-            # partial_subtrees[i] = [[self.create_output_string()]]
-            # elif 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree:
-            # partial_subtrees[i] = None
-
 
         l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, l_all_query_indices, self.l_children)
         r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, r_all_query_indices, self.r_children)
config.ini
@@ -1,8 +1,12 @@
 [settings]
 input = /media/luka/Portable Disk/Datasets/dependency_treeparse/ssj500k.conllu/sl_ssj-ud_v2.4.conllu
+output = /results/out.tsv
 internal_saves = ./internal_saves
-output = ./association_rules.tsv
+ngrams = 0
+; ngrams = 2
 ; analyze_type options: 'lemma', 'word'
 ; query = _ > _
-query = _ > (_ < _) > (_ < _)
+; query = _ > (_ < _) > _
 ; query = _ < (_ > _) < _ > _
+; query = _ < _ > _
+query = _ < _
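The hunk above swaps the old fixed output path for a configurable one and adds an ngrams switch: left at 0 the configured query is used, set to 2 main() below builds its own pair queries instead. Purely as an illustration of how the new keys are consumed (main() below reads them directly; the fallbacks here are not part of the commit):

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')
    ngrams = config.getint('settings', 'ngrams', fallback=0)        # fallback is illustrative
    output_path = config.get('settings', 'output', fallback='./out.tsv')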
@@ -1,4 +1,5 @@
 import configparser
+import csv
 import hashlib
 import os
 import pickle
@@ -115,17 +116,34 @@ def main():
     config = configparser.ConfigParser()
     config.read('config.ini')
 
+    ngrams = 0
+    if config.getint('settings', 'ngrams') == 2:
+        ngrams = 2
+        query_tree = [{"l_children": [{}]}, {"r_children": [{}]}]
+    else:
+        query_tree = [decode_query('(' + config.get('settings', 'query') + ')')]
+
     (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
 
 
-    query_tree = decode_query('(' + config.get('settings', 'query') + ')')
 
-    for tree in all_trees[1:]:
+    result_dict = {}
 
+    # for tree in all_trees[2:]:
+    for tree in all_trees:
         # original
         # r_children = tree.r_children[:1] + tree.r_children[3:4]
-        tree.r_children = tree.r_children[:1] + tree.r_children[2:4]
-        _, subtrees = tree.get_subtrees([query_tree], [])
+        # tree.r_children = tree.r_children[:1] + tree.r_children[2:4]
+        _, subtrees = tree.get_subtrees(query_tree, [])
+        for query_results in subtrees:
+            for result in query_results:
+                if ngrams:
+                    result = sorted(result)
+                r = tuple(result)
+                if r in result_dict:
+                    result_dict[r] += 1
+                else:
+                    result_dict[r] = 1
         # test 1 layer queries
         # tree.r_children = []
         # tree.l_children[1].l_children = []
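In the ngram=2 branch the two query trees {"l_children": [{}]} and {"r_children": [{}]} appear to match any node with at least one left or right child, so each hit is a head-dependent word pair; sorting the pair before counting makes the count order-insensitive. A stand-alone sketch of the same counting step, using collections.Counter instead of the hand-rolled dict, on made-up tokens rather than real corpus output:

    from collections import Counter

    def count_results(subtrees, ngrams=2):
        """Count identical results; for ngrams the pair is sorted first so that
        ('je', 'bil') and ('bil', 'je') share one bucket."""
        result_dict = Counter()
        for query_results in subtrees:
            for result in query_results:
                if ngrams:
                    result = sorted(result)
                result_dict[tuple(result)] += 1
        return result_dict

    print(count_results([[('je', 'bil'), ('bil', 'je')], [('je', 'bil')]]))
    # Counter({('bil', 'je'): 3})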
@@ -142,11 +160,25 @@ def main():
     # # _, subtrees = new_tree.get_subtrees(
     # # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], [])
 
+    sorted_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
+
+    with open(config.get('settings', 'output'), "w", newline="") as f:
+        # header - use every second space as a split
+        writer = csv.writer(f, delimiter='\t')
+        if ngrams:
+            writer.writerow(['Word 1', 'Word 2', 'Number of occurences'])
+        else:
+            span = 2
+            words = config.get('settings', 'query').split(" ")
+            header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences']
+            writer.writerow(header)
+
+        # body
+        for k, v in sorted_list:
+            writer.writerow(list(k) + [str(v)])
+
     return
 
 
-# {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]}
-# {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]}
-
 if __name__ == "__main__":
     main()
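For the non-ngram header, the query string is chunked two tokens at a time ("use every second space as a split"), so each column label pairs a node placeholder with the operator that follows it. A quick worked example of that exact expression, on a sample query that is not taken from the config:

    query = "_ > (_ < _) > _"   # sample query string
    span = 2
    words = query.split(" ")
    header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences']
    print(header)
    # ['_ >', '(_ <', '_) >', '_', 'Number of occurences']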