From 1c5481da77d72fecfda8f2e87f7b9f3e37d8978c Mon Sep 17 00:00:00 2001
From: Luka
Date: Fri, 23 Aug 2019 11:14:31 +0200
Subject: [PATCH] Initial commit

---
 .gitignore              |   3 +
 Tree.py                 | 322 ++++++++++++++++++++++++++++++++++++++++
 Value.py                |   6 +
 config.ini              |   7 +
 dependency-parsetree.py | 166 +++++++++++++++++++++
 5 files changed, 504 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Tree.py
 create mode 100644 Value.py
 create mode 100644 config.ini
 create mode 100644 dependency-parsetree.py

Note: Tree.py and dependency-parsetree.py were revised after this patch was
generated (hunk lengths and the summary above are updated accordingly); the
blob ids on their "index" lines still name the originally committed contents.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f872a2f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.idea/
+venv/
+internal_saves/
diff --git a/Tree.py b/Tree.py
new file mode 100644
index 0000000..be4b1d1
--- /dev/null
+++ b/Tree.py
@@ -0,0 +1,322 @@
+import sys
+
+from pyconll.unit import Token
+
+from Value import Value
+
+
+class Tree(object):
+    """Node of a dependency tree built from a single CoNLL-U token.
+
+    Attribute strings are shared through the ``*_dict`` caches: each distinct
+    string is wrapped in one ``Value`` object referenced by all equal tokens.
+    """
+
+    def __init__(self, form, lemma, upos, xpos, deprel, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, head):
+        """Create a node and register its attribute values in the caches.
+
+        :param form: word form of the token
+        :param lemma: lemma of the token
+        :param upos: universal part-of-speech tag
+        :param xpos: language-specific part-of-speech tag
+        :param deprel: dependency relation of the token
+        :param form_dict: cache form -> ``Value``, updated in place
+        :param lemma_dict: cache lemma -> ``Value``, updated in place
+        :param upos_dict: cache upos -> ``Value``, updated in place
+        :param xpos_dict: cache xpos -> ``Value``, updated in place
+        :param deprel_dict: cache deprel -> ``Value``, updated in place
+        :param head: head reference, kept in ``parent`` until ``set_parent``
+            replaces it
+        """
+        self.form = self._intern(form_dict, form)
+        self.lemma = self._intern(lemma_dict, lemma)
+        self.upos = self._intern(upos_dict, upos)
+        self.xpos = self._intern(xpos_dict, xpos)
+        self.deprel = self._intern(deprel_dict, deprel)
+
+        self.parent = head
+        self.l_children = []
+        self.r_children = []
+
+    @staticmethod
+    def _intern(cache, key):
+        """Return the shared ``Value`` for *key*, creating it on first use."""
+        if key not in cache:
+            cache[key] = Value(key)
+        return cache[key]
+
+    def add_l_child(self, child):
+        """Append *child* to the left children of this node."""
+        self.l_children.append(child)
+
+    def add_r_child(self, child):
+        """Append *child* to the right children of this node."""
+        self.r_children.append(child)
+
+    def set_parent(self, parent):
+        """Replace the stored parent reference with *parent*."""
+        self.parent = parent
+
+    def fits_static_requirements(self, query_tree):
+        """Check this node against the attribute filters of one query node.
+
+        Only ``form``, ``lemma``, ``upos``, ``xpos`` and ``deprel`` are
+        inspected; a key that is absent from *query_tree* matches anything.
+
+        :param query_tree: dict describing one query node
+        :return: ``True`` when every filter present equals the node's value
+        """
+        # get_value has to be called: comparing a string with the bound
+        # method itself is always False, so no filtered query could match.
+        return all(key not in query_tree or query_tree[key] == getattr(self, key).get_value()
+                   for key in ('form', 'lemma', 'upos', 'xpos', 'deprel'))
+
+    def generate_children_queries(self, l_all_query_indices):
+        """Yield, per left child, the query trees that child must be matched with.
+
+        The generator is driven with ``send``: after each yielded triple the
+        caller sends back one outcome per yielded query, and a truthy outcome
+        lets a multi-part query continue with its next part on the next child.
+
+        :param l_all_query_indices: list of ``(group, is_permanent)`` pairs;
+            ``group`` is the list of left-child query trees of a parent query
+        :return: generator of ``(child, query_trees, queries)`` where
+            ``queries`` holds ``(part_index, group_index, is_permanent)``
+            triples aligned with ``query_trees``; ``(None, None, None)`` is
+            yielded once all children are exhausted
+        """
+        subtree_outcomes = []
+        # list of triples (index of query in group, index of group, is_permanent)
+        queries = []
+        for child_index, child in enumerate(self.l_children):
+            # continue queries whose previous part matched on the previous child
+            new_queries = []
+            for (result_part_index, result_index, is_permanent), subtree_outcome in zip(queries, subtree_outcomes):
+                if subtree_outcome and len(l_all_query_indices[result_index][0]) > result_part_index + 1:
+                    new_queries.append((result_part_index + 1, result_index, is_permanent))
+            queries = new_queries
+
+            # start every query group that passes the remaining-children check
+            for result_index, (group, is_permanent) in enumerate(l_all_query_indices):
+                if len(self.l_children) - len(group) >= child_index:
+                    queries.append((0, result_index, is_permanent))
+
+            l_children_query_trees = [l_all_query_indices[result_index][0][result_part_index]
+                                      for result_part_index, result_index, _ in queries]
+
+            subtree_outcomes = yield child, l_children_query_trees, queries
+        yield None, None, None
+
+    def add_subtrees(self, old_subtree, new_subtree):
+        """Extend the list *old_subtree* in place with the items of *new_subtree*."""
+        old_subtree.extend(new_subtree)
+
+    def group_results_old(self, subtree_outcomes, queries, l_all_query_indices, completed_subtrees, query_creation_dict, child_index, partial_subtrees):
+        """Earlier variant of ``group_results`` that combines parts via ``create_subtrees``.
+
+        Truthy outcomes of a final query part are expanded with
+        ``create_subtrees`` and stored in *completed_subtrees* (permanent) or
+        *partial_subtrees* (temporary); outcomes of earlier parts are saved in
+        ``query_creation_dict[result_index][result_part_index][child_index]``.
+        A falsy outcome of a temporary query appends ``None`` to
+        *partial_subtrees*.
+        """
+        for outcome, (result_part_index, result_index, is_permanent) in zip(subtree_outcomes, queries):
+            if outcome:
+                if result_part_index == len(l_all_query_indices[result_index][0]) - 1:
+                    new_results = self.create_subtrees(query_creation_dict, result_index, result_part_index, child_index, outcome)
+                    if is_permanent:
+                        self.add_subtrees(completed_subtrees, new_results)
+                    else:
+                        self.add_subtrees(partial_subtrees, new_results)
+                else:
+                    # save results for later usage
+                    query_creation_dict.setdefault(result_index, {}).setdefault(result_part_index, {})[child_index] = outcome
+            elif not is_permanent:
+                partial_subtrees.append(None)
+
+    def add_partial_results(self, partial_results_dict, result_index, result_part_index, child_index, outcome):
+        """Store *outcome* as a partial result of one query part.
+
+        The outcome is converted with ``create_tuple_from_output`` and saved in
+        ``partial_results_dict[result_index][result_part_index]``; if results
+        are already stored there they are combined via ``add_results_part``.
+        *child_index* is accepted for signature compatibility and not used.
+        """
+        new_part = self.create_tuple_from_output(outcome, [])
+        parts = partial_results_dict.setdefault(result_index, {})
+        if result_part_index in parts:
+            parts[result_part_index] = self.add_results_part(parts[result_part_index], new_part)
+        else:
+            parts[result_part_index] = new_part
+
+    def create_tuple_from_output(self, new_results, combined_results):
+        """Append ``(result, 0)`` for each new result to *combined_results* and return it.
+
+        This method was commented out although ``add_partial_results`` and
+        ``add_results_part`` call it, which raised ``AttributeError``.
+        """
+        for new_result in new_results:
+            combined_results.append((new_result, 0))
+        return combined_results
+
+    def add_results_part(self, previous_results_part, new_results):
+        """Combine stored partial results with *new_results*.
+
+        :return: the ``merge_results`` combinations followed by one
+            ``(result, 0)`` entry per item of *new_results*
+        """
+        combined_results = self.merge_results(previous_results_part, new_results)
+
+        return self.create_tuple_from_output(new_results, combined_results=combined_results)
+
+    def group_results(self, subtree_outcomes, queries, l_all_query_indices, completed_subtrees, partial_results_dict, child_index, partial_subtrees):
+        """Sort the outcomes returned by one child into completed and partial results.
+
+        :param subtree_outcomes: outcomes of the child, aligned with *queries*
+        :param queries: ``(part_index, group_index, is_permanent)`` triples
+        :param l_all_query_indices: list of ``(group, is_permanent)`` pairs
+        :param completed_subtrees: list extended with finished permanent results
+        :param partial_results_dict: storage for non-final parts, see
+            ``add_partial_results``
+        :param child_index: index of the child that produced the outcomes
+        :param partial_subtrees: list extended with finished temporary results;
+            a falsy outcome of a temporary query appends ``None``
+        """
+        for outcome, (result_part_index, result_index, is_permanent) in zip(subtree_outcomes, queries):
+            if not outcome:
+                if not is_permanent:
+                    partial_subtrees.append(None)
+                continue
+
+            if result_part_index != len(l_all_query_indices[result_index][0]) - 1:
+                self.add_partial_results(partial_results_dict, result_index, result_part_index, child_index, outcome)
+                continue
+
+            # final part of the query: join it with the parts matched earlier
+            if result_part_index > 0:
+                new_results = self.merge_results(partial_results_dict[result_index][result_part_index - 1], outcome)
+            else:
+                new_results = outcome
+
+            if is_permanent:
+                self.add_subtrees(completed_subtrees, new_results)
+            else:
+                self.add_subtrees(partial_subtrees, new_results)
+
+    def get_subtrees(self, permanent_query_trees, temporary_query_trees):
+        """Match query trees against this node and, recursively, its left children.
+
+        :param permanent_query_trees: query trees handed unchanged to every
+            recursive call
+        :param temporary_query_trees: query trees this node has to satisfy as
+            a child of a query matched higher up
+        :return: ``(partial_subtrees, completed_subtrees)``
+        """
+        # list of all children queries grouped by parent queries
+        l_all_query_indices = []
+        # TODO: right children are collected but not matched yet
+        r_all_query_indices = []
+
+        for permanent_query_tree in permanent_query_trees:
+            if self.fits_static_requirements(permanent_query_tree):
+                l_all_query_indices.append((permanent_query_tree['l_children'], True))
+                r_all_query_indices.append((permanent_query_tree['r_children'], True))
+
+        # one slot per temporary query; stays None when the query does not match
+        partial_subtrees = [None] * len(temporary_query_trees)
+
+        for i, temporary_query_tree in enumerate(temporary_query_trees):
+            if not self.fits_static_requirements(temporary_query_tree):
+                continue
+            if 'l_children' in temporary_query_tree:
+                l_all_query_indices.append((temporary_query_tree['l_children'], False))
+            if 'r_children' in temporary_query_tree:
+                r_all_query_indices.append((temporary_query_tree['r_children'], False))
+            if 'l_children' not in temporary_query_tree and 'r_children' not in temporary_query_tree:
+                # leaf query: the node itself is the whole match
+                partial_subtrees[i] = [[self.create_output_string()]]
+
+        completed_subtrees = []
+        query_creation_dict = {}
+
+        children_queries_generator = self.generate_children_queries(l_all_query_indices)
+
+        child_index = 0
+        child, child_query, child_group_mapper = next(children_queries_generator)
+        while child:
+            # NOTE(review): this rebinds completed_subtrees to the child's list, so
+            # results gathered from earlier children are dropped - confirm intent.
+            subtree_outcomes, completed_subtrees = child.get_subtrees(permanent_query_trees, child_query)
+
+            self.group_results(subtree_outcomes, child_group_mapper, l_all_query_indices, completed_subtrees, query_creation_dict, child_index, partial_subtrees)
+            # TODO
+            child, child_query, child_group_mapper = children_queries_generator.send(subtree_outcomes)
+            child_index += 1
+
+        return partial_subtrees, completed_subtrees
+
+    @staticmethod
+    def merge_results(old_results, new_results):
+        """Combine every stored result with every new result.
+
+        :param old_results: iterable of ``(result, stage)`` pairs
+        :param new_results: iterable of results that support ``+`` with the
+            stored results
+        :return: list of ``(old_result + new_result, stage + 1)`` pairs; empty
+            when either input is empty
+        """
+        return [(old_result + new_result, old_stage + 1)
+                for old_result, old_stage in old_results
+                for new_result in new_results]
+
+    def create_subtrees(self, query_creation_dict, result_index, result_part_index, child_index, outcome):
+        """Build the results of a finished query from the parts saved per child.
+
+        Called from ``group_results_old``.
+
+        :param query_creation_dict: ``{result_index: {part_index: {child_index: outcome}}}``
+        :param result_index: index of the query group
+        :param result_part_index: index of the final part of the query
+        :param child_index: index of the child that matched the final part
+        :param outcome: outcome of the final part
+        :return: list produced by ``create_subtrees_from_result_connections``
+        """
+        # for every earlier part, the indices of the earlier children that matched it
+        result_connections = {}
+        for i in range(result_part_index):
+            for j in range(child_index):
+                if result_index in query_creation_dict and i in query_creation_dict[result_index] and j in query_creation_dict[result_index][i]:
+                    result_connections.setdefault(i, []).append(j)
+
+        return self.create_subtrees_from_result_connections(0, 0, result_part_index, query_creation_dict, result_connections, result_index, [], outcome)
+
+    def create_subtrees_from_result_connections(self, child_index_i, result_part_index_i, result_part_index_final, query_creation_dict, result_connections,
+                                                result_index, res_array, outcome):
+        """Recursively walk the saved parts of a query and collect merged results.
+
+        Recursion stops when *result_part_index_i* reaches
+        *result_part_index_final*, where *res_array* is merged with *outcome*.
+        """
+        if result_part_index_i == result_part_index_final:
+            return self.merge_results(res_array, outcome)
+
+        results = []
+
+        for child_index in result_connections[result_part_index_i]:
+            if not (result_index in query_creation_dict and result_part_index_i in query_creation_dict[
+                    result_index] and child_index_i in query_creation_dict[result_index][result_part_index_i]):
+                return []
+
+            # NOTE(review): pass_array is computed but the recursion below still
+            # receives res_array - presumably pass_array was meant; confirm.
+            pass_array = self.merge_results(res_array,
+                                            query_creation_dict[result_index][result_part_index_i][child_index])
+            results.extend(self.create_subtrees_from_result_connections(child_index, result_part_index_i + 1, result_part_index_final, query_creation_dict,
+                                                                        result_connections, result_index, res_array, outcome))
+
+        return results
+
+    def create_output_string(self):
+        """Return the word form of this node as a plain string."""
+        return self.form.get_value()
\ No newline at end of file
diff --git a/Value.py b/Value.py
new file mode 100644
index 0000000..a3080ff
--- /dev/null
+++ b/Value.py
@@ -0,0 +1,6 @@
+class Value(object):
+    def __init__(self, value):
+        self.value = value
+
+    def get_value(self):
+        return self.value
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..9c8f759
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,7 @@
+[settings]
+input = /media/luka/Portable Disk/Datasets/dependency_treeparse/ssj500k.conllu/sl_ssj-ud_v2.4.conllu
+internal_saves = ./internal_saves
+output = ./association_rules.tsv
+; analyze_type options: 'lemma', 'word'
+; query = _ > _
+query = _ < (_ > _) < _ > _
\ No newline at end of file
diff --git a/dependency-parsetree.py b/dependency-parsetree.py
new file mode 100644
index 0000000..05d2b22
--- /dev/null
+++ b/dependency-parsetree.py
@@ -0,0 +1,166 @@
+import configparser
+import hashlib
+import os
+import pickle
+import re
+
+import pyconll
+
+from Tree import Tree
+
+# Splits a query on spaces that are not enclosed in parentheses.
+QUERY_TOKEN_PATTERN = re.compile(r'''((?:[^ ()]|\([^(]*\))+)''')
+
+
+def decode_query(orig_query):
+    """Parse a textual query into a nested query-tree dict.
+
+    A query is a parenthesised, space-separated sequence of nodes and
+    operators, e.g. ``(_ < (_ > _) < _ > _)``; ``_`` is an unconstrained node.
+    The node in front of the first operator starting with ``>`` becomes the
+    root (the last node if there is no such operator); the nodes before it are
+    decoded into ``l_children`` and the nodes after it into ``r_children``.
+
+    :param orig_query: query string
+    :return: ``{}`` for ``_`` (optionally parenthesised), otherwise the root
+        dict with its ``l_children`` and ``r_children`` lists
+    :raises Exception: for an unparenthesised query other than ``_``
+    """
+    new_query = False
+
+    # if the command is in parentheses remove them and treat it as a new query
+    if orig_query[0] == '(' and orig_query[-1] == ')':
+        new_query = True
+        orig_query = orig_query[1:-1]
+
+    if orig_query == '_':
+        return {}
+    elif not new_query:
+        raise Exception('Not supported yet!')
+
+    all_orders = QUERY_TOKEN_PATTERN.split(orig_query)[1::2]
+
+    node_actions = all_orders[::2]
+    priority_actions = all_orders[1::2]
+    priority_actions_beginnings = [a[0] for a in priority_actions]
+
+    # find root index
+    try:
+        root_index = priority_actions_beginnings.index('>')
+    except ValueError:
+        root_index = len(priority_actions)
+
+    l_children = []
+    r_children = []
+    root = None
+    for i, node_action in enumerate(node_actions):
+        if i < root_index:
+            l_children.append(decode_query(node_action))
+        elif i > root_index:
+            r_children.append(decode_query(node_action))
+        else:
+            root = decode_query(node_action)
+    root["l_children"] = l_children
+    root["r_children"] = r_children
+    return root
+
+
+def create_trees(config):
+    """Load the dependency trees of the input corpus, cached on disk.
+
+    The parsed trees are pickled to ``<internal_saves>/<sha1 of input path>``
+    and read back from there on later runs.
+
+    :param config: ``ConfigParser`` with ``input`` and ``internal_saves`` in
+        its ``settings`` section
+    :return: ``(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict)``
+    :raises Exception: if a sentence has no root or a token is its own head
+    """
+    internal_saves = config.get('settings', 'internal_saves')
+    input_path = config.get('settings', 'input')
+    hash_object = hashlib.sha1(input_path.encode('utf-8'))
+    hex_dig = hash_object.hexdigest()
+    trees_read_outputfile = os.path.join(internal_saves, hex_dig)
+
+    if not os.path.exists(trees_read_outputfile):
+        train = pyconll.load_from_file(input_path)
+
+        form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict = {}, {}, {}, {}, {}
+
+        all_trees = []
+
+        for sentence in train:
+            root = None
+            token_nodes = []
+            for token in sentence:
+                node = Tree(token.form, token.lemma, token.upos, token.xpos, token.deprel, form_dict,
+                            lemma_dict, upos_dict, xpos_dict, deprel_dict, token.head)
+                token_nodes.append(node)
+                if token.deprel == 'root':
+                    root = node
+
+            for token_id, token in enumerate(token_nodes):
+                # convert before comparing: a string head such as '0' never equals 0,
+                # which attached the root as a child of the last token
+                head_id = int(token.parent)
+                if head_id == 0:
+                    token.set_parent(None)
+                else:
+                    parent_id = head_id - 1
+                    if token_id < parent_id:
+                        token_nodes[parent_id].add_l_child(token)
+                    elif token_id > parent_id:
+                        token_nodes[parent_id].add_r_child(token)
+                    else:
+                        raise Exception('Root element should not be here!')
+                    token.set_parent(token_nodes[parent_id])
+
+            if root is None:
+                raise Exception('No root element in sentence!')
+            all_trees.append(root)
+
+        # internal_saves/ is git-ignored, so it is missing in a fresh checkout
+        if internal_saves:
+            os.makedirs(internal_saves, exist_ok=True)
+        with open(trees_read_outputfile, 'wb') as output:
+            pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict), output)
+    else:
+        print('Reading trees:')
+        # NOTE: pickle can execute code on load; only read caches written by this script
+        with open(trees_read_outputfile, 'rb') as pkl_file:
+            (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = pickle.load(pkl_file)
+        print('Completed')
+
+    return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict
+
+
+def main():
+    """Read ``config.ini``, build the trees and run a hard-coded test query.
+
+    Work in progress: the query from the config is decoded, but the trees are
+    matched against fixed debugging queries instead.
+    """
+    config = configparser.ConfigParser()
+    config.read('config.ini')
+
+    (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
+
+    query_tree = decode_query('(' + config.get('settings', 'query') + ')')
+
+    for tree in all_trees:
+        # _, subtrees = tree.get_subtrees([query_tree], [])
+        # debugging scaffold: prune the tree before matching
+        tree.r_children = []
+        tree.l_children[1].l_children = []
+        _, subtrees = tree.get_subtrees([{"l_children": [{'a1':''}, {'a2':''}], "r_children": []}, {"l_children": [{'b1':''}], "r_children": []}, {"l_children": [{'c1':''}, {'c2':''}, {'c3':''}], "r_children": []}], [])
+        print('here')
+        # NOTE(review): returning here limits the run to the first tree -
+        # presumably deliberate while debugging; confirm.
+        return
+
+
+# Query tree format:
+# {"form": "", "lemma": "", "upos": "", "xpos": "", "l_children": [{}, {}], "r_children": [{}, {}]}
+
+if __name__ == "__main__":
+    main()