Added ngram=2 calculations and fixed some bugs
This commit is contained in:
parent
31496a4267
commit
97136ca5c3
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -2,3 +2,4 @@
|
|||
venv/
|
||||
internal_saves/
|
||||
__pycache__/
|
||||
results/
|
||||
|
|
28
Tree.py
28
Tree.py
|
@ -46,15 +46,15 @@ class Tree(object):
|
|||
('deprel' not in query_tree or query_tree['deprel'] == self.deprel.get_value)
|
||||
|
||||
def generate_children_queries(self, all_query_indices, children):
|
||||
subtree_outcomes = []
|
||||
partial_results = {}
|
||||
# list of pairs (index of query in group, group of query, is permanent)
|
||||
child_queries_metadata = []
|
||||
for child_index, child in enumerate(children):
|
||||
new_queries = []
|
||||
|
||||
# add continuation queries to children
|
||||
for (result_part_index, result_index, is_permanent), subtree_outcome in zip(child_queries_metadata, subtree_outcomes):
|
||||
if subtree_outcome:
|
||||
for result_part_index, result_index, is_permanent in child_queries_metadata:
|
||||
if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0:
|
||||
if len(all_query_indices[result_index][0]) > result_part_index + 1:
|
||||
new_queries.append((result_part_index + 1, result_index, is_permanent))
|
||||
# else:
|
||||
|
@ -72,7 +72,7 @@ class Tree(object):
|
|||
for result_part_index, result_index, _ in child_queries_metadata:
|
||||
child_queries.append(all_query_indices[result_index][0][result_part_index])
|
||||
|
||||
subtree_outcomes = yield child, child_queries, child_queries_metadata
|
||||
partial_results = yield child, child_queries, child_queries_metadata
|
||||
yield None, None, None
|
||||
|
||||
def add_subtrees(self, old_subtree, new_subtree):
|
||||
|
@ -105,18 +105,6 @@ class Tree(object):
|
|||
if outcome:
|
||||
new_results = self.get_results(partial_results_dict, result_index, result_part, outcome, len(all_query_indices[result_index][0]))
|
||||
if new_results:
|
||||
# if is_permanent:
|
||||
# if result_index in completed_subtrees:
|
||||
# self.add_subtrees(completed_subtrees[result_index], new_results)
|
||||
# else:
|
||||
# completed_subtrees[result_index] = new_results
|
||||
# comment
|
||||
# self.add_subtrees(completed_subtrees[result_index], new_results)
|
||||
# else:
|
||||
# if result_index in completed_subtrees:
|
||||
# self.add_subtrees(partial_subtrees[result_index], new_results)
|
||||
# else:
|
||||
# partial_subtrees[result_index] = new_results
|
||||
self.add_subtrees(partial_subtrees[result_index], new_results)
|
||||
else:
|
||||
if not is_permanent:
|
||||
|
@ -142,7 +130,7 @@ class Tree(object):
|
|||
|
||||
for i in range(len(new_completed_subtrees)):
|
||||
completed_subtrees[i].extend(new_completed_subtrees[i])
|
||||
child, child_queries, child_queries_metadata = children_queries_generator.send(new_partial_subtrees)
|
||||
child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict)
|
||||
child_index += 1
|
||||
|
||||
return partial_subtrees, completed_subtrees
|
||||
|
@ -168,7 +156,6 @@ class Tree(object):
|
|||
r_all_query_indices.append((permanent_query_tree['r_children'], True))
|
||||
|
||||
active_temporary_query_trees = []
|
||||
# partial_subtrees = [[] for i in range(len(temporary_query_trees))]
|
||||
for i, temporary_query_tree in enumerate(temporary_query_trees):
|
||||
if self.fits_static_requirements(temporary_query_tree):
|
||||
active_temporary_query_trees.append(temporary_query_tree)
|
||||
|
@ -177,11 +164,6 @@ class Tree(object):
|
|||
l_all_query_indices.append((temporary_query_tree['l_children'], False))
|
||||
if 'r_children' in temporary_query_tree:
|
||||
r_all_query_indices.append((temporary_query_tree['r_children'], False))
|
||||
# if 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree:
|
||||
# partial_subtrees[i] = [[self.create_output_string()]]
|
||||
# elif 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree:
|
||||
# partial_subtrees[i] = None
|
||||
|
||||
|
||||
l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, l_all_query_indices, self.l_children)
|
||||
r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, r_all_query_indices, self.r_children)
|
||||
|
|
10
config.ini
10
config.ini
|
@ -1,8 +1,12 @@
|
|||
[settings]
|
||||
input = /media/luka/Portable Disk/Datasets/dependency_treeparse/ssj500k.conllu/sl_ssj-ud_v2.4.conllu
|
||||
output = /results/out.tsv
|
||||
internal_saves = ./internal_saves
|
||||
output = ./association_rules.tsv
|
||||
ngrams = 0
|
||||
; ngrams = 2
|
||||
; analyze_type options: 'lemma', 'word'
|
||||
; query = _ > _
|
||||
query = _ > (_ < _) > (_ < _)
|
||||
; query = _ < (_ > _) < _ > _
|
||||
; query = _ > (_ < _) > _
|
||||
; query = _ < (_ > _) < _ > _
|
||||
; query = _ < _ > _
|
||||
query = _ < _
|
|
@ -1,4 +1,5 @@
|
|||
import configparser
|
||||
import csv
|
||||
import hashlib
|
||||
import os
|
||||
import pickle
|
||||
|
@ -115,17 +116,34 @@ def main():
|
|||
config = configparser.ConfigParser()
|
||||
config.read('config.ini')
|
||||
|
||||
ngrams = 0
|
||||
if config.getint('settings', 'ngrams') == 2:
|
||||
ngrams = 2
|
||||
query_tree = [{"l_children": [{}]}, {"r_children": [{}]}]
|
||||
else:
|
||||
query_tree = [decode_query('(' + config.get('settings', 'query') + ')')]
|
||||
|
||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
|
||||
|
||||
|
||||
query_tree = decode_query('(' + config.get('settings', 'query') + ')')
|
||||
|
||||
for tree in all_trees[1:]:
|
||||
result_dict = {}
|
||||
|
||||
# for tree in all_trees[2:]:
|
||||
for tree in all_trees:
|
||||
# original
|
||||
# r_children = tree.r_children[:1] + tree.r_children[3:4]
|
||||
tree.r_children = tree.r_children[:1] + tree.r_children[2:4]
|
||||
_, subtrees = tree.get_subtrees([query_tree], [])
|
||||
|
||||
# tree.r_children = tree.r_children[:1] + tree.r_children[2:4]
|
||||
_, subtrees = tree.get_subtrees(query_tree, [])
|
||||
for query_results in subtrees:
|
||||
for result in query_results:
|
||||
if ngrams:
|
||||
result = sorted(result)
|
||||
r = tuple(result)
|
||||
if r in result_dict:
|
||||
result_dict[r] += 1
|
||||
else:
|
||||
result_dict[r] = 1
|
||||
# test 1 layer queries
|
||||
# tree.r_children = []
|
||||
# tree.l_children[1].l_children = []
|
||||
|
@ -142,11 +160,25 @@ def main():
|
|||
# # _, subtrees = new_tree.get_subtrees(
|
||||
# # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], [])
|
||||
|
||||
return
|
||||
sorted_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
with open(config.get('settings', 'output'), "w", newline="") as f:
|
||||
# header - use every second space as a split
|
||||
writer = csv.writer(f, delimiter='\t')
|
||||
if ngrams:
|
||||
writer.writerow(['Word 1', 'Word 2', 'Number of occurences'])
|
||||
else:
|
||||
span = 2
|
||||
words = config.get('settings', 'query').split(" ")
|
||||
header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences']
|
||||
writer.writerow(header)
|
||||
|
||||
# body
|
||||
for k, v in sorted_list:
|
||||
writer.writerow(list(k) + [str(v)])
|
||||
|
||||
return
|
||||
|
||||
# {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]}
|
||||
# {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
Loading…
Reference in New Issue
Block a user