From 97136ca5c30ea4f33bf4517d9ce052cbd9c49aed Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 29 Aug 2019 08:22:17 +0200 Subject: [PATCH] Added ngram=2 calculations + removed some bugs --- .gitignore | 1 + Tree.py | 28 +++++------------------- config.ini | 10 ++++++--- dependency-parsetree.py | 48 ++++++++++++++++++++++++++++++++++------- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 78f1e99..102b861 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ venv/ internal_saves/ __pycache__/ +results/ diff --git a/Tree.py b/Tree.py index 992b05a..a975772 100644 --- a/Tree.py +++ b/Tree.py @@ -46,15 +46,15 @@ class Tree(object): ('deprel' not in query_tree or query_tree['deprel'] == self.deprel.get_value) def generate_children_queries(self, all_query_indices, children): - subtree_outcomes = [] + partial_results = {} # list of pairs (index of query in group, group of query, is permanent) child_queries_metadata = [] for child_index, child in enumerate(children): new_queries = [] # add continuation queries to children - for (result_part_index, result_index, is_permanent), subtree_outcome in zip(child_queries_metadata, subtree_outcomes): - if subtree_outcome: + for result_part_index, result_index, is_permanent in child_queries_metadata: + if result_index in partial_results and result_part_index in partial_results[result_index] and len(partial_results[result_index][result_part_index]) > 0: if len(all_query_indices[result_index][0]) > result_part_index + 1: new_queries.append((result_part_index + 1, result_index, is_permanent)) # else: @@ -72,7 +72,7 @@ class Tree(object): for result_part_index, result_index, _ in child_queries_metadata: child_queries.append(all_query_indices[result_index][0][result_part_index]) - subtree_outcomes = yield child, child_queries, child_queries_metadata + partial_results = yield child, child_queries, child_queries_metadata yield None, None, None def add_subtrees(self, old_subtree, new_subtree): @@ -105,18 +105,6 @@ class Tree(object): if outcome: new_results = self.get_results(partial_results_dict, result_index, result_part, outcome, len(all_query_indices[result_index][0])) if new_results: - # if is_permanent: - # if result_index in completed_subtrees: - # self.add_subtrees(completed_subtrees[result_index], new_results) - # else: - # completed_subtrees[result_index] = new_results - # comment - # self.add_subtrees(completed_subtrees[result_index], new_results) - # else: - # if result_index in completed_subtrees: - # self.add_subtrees(partial_subtrees[result_index], new_results) - # else: - # partial_subtrees[result_index] = new_results self.add_subtrees(partial_subtrees[result_index], new_results) else: if not is_permanent: @@ -142,7 +130,7 @@ class Tree(object): for i in range(len(new_completed_subtrees)): completed_subtrees[i].extend(new_completed_subtrees[i]) - child, child_queries, child_queries_metadata = children_queries_generator.send(new_partial_subtrees) + child, child_queries, child_queries_metadata = children_queries_generator.send(partial_results_dict) child_index += 1 return partial_subtrees, completed_subtrees @@ -168,7 +156,6 @@ class Tree(object): r_all_query_indices.append((permanent_query_tree['r_children'], True)) active_temporary_query_trees = [] - # partial_subtrees = [[] for i in range(len(temporary_query_trees))] for i, temporary_query_tree in enumerate(temporary_query_trees): if self.fits_static_requirements(temporary_query_tree): active_temporary_query_trees.append(temporary_query_tree) @@ -177,11 +164,6 @@ class Tree(object): l_all_query_indices.append((temporary_query_tree['l_children'], False)) if 'r_children' in temporary_query_tree: r_all_query_indices.append((temporary_query_tree['r_children'], False)) - # if 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree: - # partial_subtrees[i] = [[self.create_output_string()]] - # elif 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree: - # partial_subtrees[i] = None - l_partial_subtrees, l_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, l_all_query_indices, self.l_children) r_partial_subtrees, r_completed_subtrees = self.get_all_query_indices(len(temporary_query_trees), len(permanent_query_trees), permanent_query_trees, r_all_query_indices, self.r_children) diff --git a/config.ini b/config.ini index 14008dd..f7acc2f 100644 --- a/config.ini +++ b/config.ini @@ -1,8 +1,12 @@ [settings] input = /media/luka/Portable Disk/Datasets/dependency_treeparse/ssj500k.conllu/sl_ssj-ud_v2.4.conllu +output = /results/out.tsv internal_saves = ./internal_saves -output = ./association_rules.tsv +ngrams = 0 +; ngrams = 2 ; analyze_type options: 'lemma', 'word' ; query = _ > _ -query = _ > (_ < _) > (_ < _) -; query = _ < (_ > _) < _ > _ \ No newline at end of file +; query = _ > (_ < _) > _ +; query = _ < (_ > _) < _ > _ +; query = _ < _ > _ +query = _ < _ \ No newline at end of file diff --git a/dependency-parsetree.py b/dependency-parsetree.py index d44e3e5..cb372ae 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -1,4 +1,5 @@ import configparser +import csv import hashlib import os import pickle @@ -115,17 +116,34 @@ def main(): config = configparser.ConfigParser() config.read('config.ini') + ngrams = 0 + if config.getint('settings', 'ngrams') == 2: + ngrams = 2 + query_tree = [{"l_children": [{}]}, {"r_children": [{}]}] + else: + query_tree = [decode_query('(' + config.get('settings', 'query') + ')')] + (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config) - query_tree = decode_query('(' + config.get('settings', 'query') + ')') - for tree in all_trees[1:]: + result_dict = {} + + # for tree in all_trees[2:]: + for tree in all_trees: # original # r_children = tree.r_children[:1] + tree.r_children[3:4] - tree.r_children = tree.r_children[:1] + tree.r_children[2:4] - _, subtrees = tree.get_subtrees([query_tree], []) - + # tree.r_children = tree.r_children[:1] + tree.r_children[2:4] + _, subtrees = tree.get_subtrees(query_tree, []) + for query_results in subtrees: + for result in query_results: + if ngrams: + result = sorted(result) + r = tuple(result) + if r in result_dict: + result_dict[r] += 1 + else: + result_dict[r] = 1 # test 1 layer queries # tree.r_children = [] # tree.l_children[1].l_children = [] @@ -142,11 +160,25 @@ def main(): # # _, subtrees = new_tree.get_subtrees( # # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], []) - return + sorted_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True) + + with open(config.get('settings', 'output'), "w", newline="") as f: + # header - use every second space as a split + writer = csv.writer(f, delimiter='\t') + if ngrams: + writer.writerow(['Word 1', 'Word 2', 'Number of occurences']) + else: + span = 2 + words = config.get('settings', 'query').split(" ") + header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences'] + writer.writerow(header) + + # body + for k, v in sorted_list: + writer.writerow(list(k) + [str(v)]) + return - # {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]} - # {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]} if __name__ == "__main__": main()