# STARK/dependency-parsetree.py

import argparse
import configparser
import copy
import csv
import hashlib
import os
import pickle
import re
import time
import timeit
from multiprocessing import Pool

import pyconll

from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats

# Names of the individual morphological features that can be searched for
# separately (as opposed to matching the whole "feats" string at once).
feats_detailed_list = (
    # lexical features
    ['PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr']
    # inflectional features (nominal)
    + ['Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree']
    # inflectional features (verbal)
    + ['VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity']
    # other
    + ['Variant', 'Number[psor]', 'Gender[psor]', 'NumForm']
)

# One independent (initially empty) dictionary per searchable feature name.
feats_detailed_dict = {feature_name: {} for feature_name in feats_detailed_list}


def decode_query(orig_query, dependency_type):
    """Parse a textual tree query into a nested dictionary of node restrictions.

    Query syntax, as implemented below:

    * ``_`` puts no restriction on the node.
    * A token without spaces is a single node. It consists of ``&``-separated
      parts; each part is either ``key=value`` (keys ``L`` for lemma, ``upos``,
      ``xpos``, ``form``, ``feats`` or a name from ``feats_detailed_list``) or
      a bare string, which is taken as the word form unless the query was
      wrapped in brackets.
    * Otherwise the query is a whitespace-separated sequence alternating
      between nodes and operators (whitespace inside brackets does not split).
      The node right before the first operator starting with ``>`` is the
      root; when there is no such operator the last node is the root. All
      other nodes become children of the root, and the text following the
      first character of the adjacent operator becomes the child's ``deprel``.

    :param orig_query: non-empty query string, optionally wrapped in brackets
    :param dependency_type: dependency label required of this node, or ``''``
        for no restriction
    :return: dictionary with any of the keys ``deprel``, ``lemma``, ``upos``,
        ``xpos``, ``form``, ``feats``, ``feats_detailed`` and ``children``
    :raises Exception: on an unsupported ``key=value`` restriction outside
        brackets, or when the query contains no root node
    """
    new_query = False

    # a query wrapped in brackets is unwrapped and treated as a new query
    if orig_query[0] == '(' and orig_query[-1] == ')':
        new_query = True
        orig_query = orig_query[1:-1]

    decoded_query = {'deprel': dependency_type} if dependency_type != '' else {}

    # '_' is a wildcard - only the dependency label (if any) restricts the node
    if orig_query == '_':
        return decoded_query

    # no spaces in the query -> this is a single node
    if ' ' not in orig_query:
        simple_keys = {'L': 'lemma', 'upos': 'upos', 'xpos': 'xpos', 'form': 'form', 'feats': 'feats'}
        for query_part in orig_query.split('&'):
            key, separator, value = query_part.partition('=')
            if not separator:
                # bare word - matched against the form (ignored inside brackets)
                if not new_query:
                    decoded_query['form'] = query_part
                continue

            if key in simple_keys:
                decoded_query[simple_keys[key]] = value
            elif key in feats_detailed_list:
                # accumulate instead of returning early, so that the remaining
                # '&' parts and further detailed feats are not silently dropped
                decoded_query.setdefault('feats_detailed', {})[key] = value
            elif not new_query:
                raise Exception('Not supported yet!')
            else:
                print('???')
        return decoded_query

    # split over whitespace that is not inside brackets
    all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query)

    # tokens alternate: node, operator, node, operator, ..., node
    node_actions = all_orders[::2]
    priority_actions = all_orders[1::2]

    # the root is the node in front of the first '>' operator; without such an
    # operator it is the last node (startswith also tolerates an empty token)
    root_index = next(
        (i for i, action in enumerate(priority_actions) if action.startswith('>')),
        len(priority_actions))

    children = []
    root = None
    for i, node_action in enumerate(node_actions):
        if i < root_index:
            # node before the root - labelled by the operator that follows it
            children.append(decode_query(node_action, priority_actions[i][1:]))
        elif i > root_index:
            # node after the root - labelled by the operator that precedes it
            children.append(decode_query(node_action, priority_actions[i - 1][1:]))
        else:
            root = decode_query(node_action, dependency_type)

    if root is None:
        # e.g. a query ending with an operator and containing no '>'
        raise Exception('Malformed query: no root node found!')
    if children:
        root["children"] = children
    return root


def create_trees(config):
    """Return the sentence trees of the input treebank, using a pickle cache.

    Reads ``internal_saves`` and ``input`` from the ``settings`` section of
    ``config``. The cache file lives in ``internal_saves`` and is named after
    the SHA-1 of the input *path* (not of the file content), so a changed
    input file under the same path is not detected. When the cache file is
    missing, the input is parsed with pyconll, one ``Tree`` is built per
    numerically-identified token, the trees are linked together and the
    result is written to the cache.

    :param config: ``configparser.ConfigParser`` with a ``settings`` section
    :return: tuple ``(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict,
        deprel_dict)`` where ``all_trees`` holds the root node of every sentence
    :raises Exception: when a sentence has no token with deprel ``root``
    """
    internal_saves = config.get('settings', 'internal_saves')
    input_path = config.get('settings', 'input')
    hex_dig = hashlib.sha1(input_path.encode('utf-8')).hexdigest()
    trees_read_outputfile = os.path.join(internal_saves, hex_dig)

    if os.path.exists(trees_read_outputfile):
        print('Reading trees:')
        print('Completed')
        # NOTE(review): pickle.load executes arbitrary code from the file; this
        # is only safe as long as the internal_saves directory is trusted.
        with open(trees_read_outputfile, 'rb') as pkl_file:
            (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = pickle.load(pkl_file)
        return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict

    train = pyconll.load_from_file(input_path)

    form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict = {}, {}, {}, {}, {}, {}

    all_trees = []

    for sentence in train:
        root = None
        token_nodes = []
        for token in sentence:
            # skip tokens whose id is not a plain number
            if not token.id.isdigit():
                continue

            # TODO check if 5th place is always there for feats
            feats = token._fields[5]
            node = Tree(int(token.id), token.form, token.lemma, token.upos, token.xpos, token.deprel, feats, token.feats, form_dict,
                        lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, token.head)
            token_nodes.append(node)
            if token.deprel == 'root':
                root = node

        # link every node to its parent; node.parent is read as a 1-based
        # position here (presumably the token.head passed to Tree above)
        for token_id, token in enumerate(token_nodes):
            if int(token.parent) == 0:
                token.set_parent(None)
            else:
                parent_id = int(token.parent) - 1
                parent = token_nodes[parent_id]
                # children_split = number of children attached before the
                # first child that stands after the parent in the sentence
                if parent.children_split == -1 and token_id > parent_id:
                    parent.children_split = len(parent.children)
                parent.add_child(token)
                token.set_parent(parent)

        # nodes without any child after them: the split is after the last child
        for token in token_nodes:
            if token.children_split == -1:
                token.children_split = len(token.children)

        if root is None:
            raise Exception('No root element in sentence!')
        all_trees.append(root)

    # make sure the cache directory exists before writing into it
    if internal_saves:
        os.makedirs(internal_saves, exist_ok=True)
    with open(trees_read_outputfile, 'wb') as output:
        pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict), output)

    return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict


# def order_independent_queries(query_tree):
#     all_children = query_tree['l_children'] + query_tree['r_children']
#     if all_children > 0:
#
#     else:
#         return query_tree
#     pass

def printable_answers(query):
    """Flatten a query-like string into the list of its node strings.

    The string is split on whitespace that is not inside brackets; every
    second piece (the nodes, not the operators between them) is kept. A node
    wrapped in brackets is expanded recursively. A string that yields only a
    single node is returned unchanged as a one-element list.

    :param query: string of nodes and operators separated by whitespace
    :return: list of node strings in their original order
    """
    pieces = re.split(r"\s+(?=[^()]*(?:\(|$))", query)
    nodes = pieces[::2]

    if len(nodes) <= 1:
        return [query]

    words = []
    for node in nodes:
        # TODO FIX BRACKETS IN A BETTER WAY
        if node == '':
            words.append('(')
        elif node.startswith('(') and node.endswith(')'):
            # bracketed node - strip the brackets and flatten the inner query
            words += printable_answers(node[1:-1])
        else:
            words.append(node)
    return words


def tree_calculations(input_data):
    """Run the query trees against one sentence tree.

    Takes a single tuple argument so that it can be used with ``Pool.map``.

    :param input_data: tuple ``(tree, query_tree, create_output_string_funct, filters)``
    :return: second element of the pair returned by ``tree.get_subtrees``
    """
    sentence_tree, queries, output_string_funct, node_filters = input_data
    _unused, matches = sentence_tree.get_subtrees(queries, [], output_string_funct, node_filters)
    return matches


def tree_calculations_chunks(input_data):
    """Run the query trees against a chunk of sentence trees and count the hits.

    :param input_data: tuple ``(trees, query_tree, create_output_string_funct, filters)``
        where ``trees`` is an iterable of sentence trees
    :return: dictionary mapping every result to the number of times it occurred
    """
    chunk, queries, output_string_funct, node_filters = input_data

    counts = {}
    for sentence_tree in chunk:
        _unused, matches = sentence_tree.get_subtrees(queries, [], output_string_funct, node_filters)

        for per_query_matches in matches:
            for match in per_query_matches:
                counts[match] = counts.get(match, 0) + 1
    return counts


def chunkify(a, n):
    """Split sequence ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices are one element longer than the rest.

    :param a: sliceable sequence
    :param n: number of slices
    :return: generator yielding the ``n`` slices in order
    """
    base_size, remainder = divmod(len(a), n)

    def boundary(chunk_index):
        # start offset of the given chunk (== end offset of the previous one)
        return chunk_index * base_size + min(chunk_index, remainder)

    return (a[boundary(index):boundary(index + 1)] for index in range(n))


def add_node(tree):
    """Append one new empty child node to ``tree`` in place.

    :param tree: query-tree dictionary; its ``children`` list is created when missing
    """
    tree.setdefault('children', []).append({})


# walk over all nodes in tree and add a node to each possible node
def tree_grow(orig_tree):
    """Return every tree obtained by adding exactly one node to ``orig_tree``.

    The first result has the new node attached directly to the root; the
    remaining results have it attached somewhere inside one of the existing
    children. ``orig_tree`` itself is left untouched (results are deep copies).

    :param orig_tree: query-tree dictionary
    :return: list of grown query-tree dictionaries (may contain duplicates)
    """
    # variant with the new node attached to this node
    widened = copy.deepcopy(orig_tree)
    add_node(widened)
    grown_trees = [widened]

    # variants with the new node attached somewhere below one of the children
    for position, subtree in enumerate(orig_tree.get('children', [])):
        for grown_subtree in tree_grow(subtree):
            candidate = copy.deepcopy(orig_tree)
            candidate['children'][position] = grown_subtree
            grown_trees.append(candidate)

    return grown_trees


def compare_trees(tree1, tree2):
    """Tell whether two query trees have the same shape, ignoring child order.

    Only the ``children`` structure is compared. Every child of ``tree1`` has
    to be paired with a *different* child of ``tree2`` of the same shape.

    :param tree1: query-tree dictionary
    :param tree2: query-tree dictionary
    :return: ``True`` when the trees have the same shape, ``False`` otherwise
    """
    if tree1 == {} and tree2 == {}:
        return True

    if 'children' not in tree1 or 'children' not in tree2 or len(tree1['children']) != len(tree2['children']):
        return False

    # indices of tree2 children that are already paired with a tree1 child
    children2_connections = set()

    for child1 in tree1['children']:
        for child2_i, child2 in enumerate(tree2['children']):
            # a child of tree2 may be used only once; without this skip
            # children [X, X] would wrongly match children [X, Y]
            if child2_i in children2_connections:
                continue
            if compare_trees(child1, child2):
                children2_connections.add(child2_i)
                break
        else:
            # no unused child of tree2 has the shape of child1
            return False

    return True

def create_ngrams_query_trees(n, trees):
    """Grow the given query trees by ``n - 1`` nodes in every possible way.

    In each round every tree is extended by one node with ``tree_grow`` and a
    grown tree is kept only when ``compare_trees`` does not report it equal
    to a tree already kept in that round.

    :param n: target number of nodes when starting from single-node trees
    :param trees: list of starting query-tree dictionaries
    :return: list of grown query-tree dictionaries
    """
    for _ in range(n - 1):
        unique_trees = []
        for tree in trees:
            for candidate in tree_grow(tree):
                # keep the candidate only if no equal tree was kept before
                if not any(compare_trees(candidate, kept) for kept in unique_trees):
                    unique_trees.append(candidate)

        trees = unique_trees
    return trees


def _count_results(subtrees, result_dict):
    """Add the matches found in one sentence tree to the frequency table."""
    for query_results in subtrees:
        for r in query_results:
            if r.key in result_dict:
                result_dict[r.key]['number'] += 1
            else:
                result_dict[r.key] = {'object': r, 'number': 1}


def main():
    """Run the tree search described by a config file and write the results.

    The path of the config file is taken from the required ``--config_file``
    command line argument. Options read from its ``settings`` section:
    ``ngrams`` (a number or ``from-to`` range; a lower bound above 1 switches
    to n-gram mode, otherwise ``query`` is used), ``node_type``,
    ``cpu_cores``, ``node_order``, ``dependency_type``, ``tree_type``,
    ``output`` and the optional ``label_whitelist`` and ``root_whitelist``
    (plus the options read by ``create_trees``).

    The output is a tab-separated file with a header row followed by one row
    per distinct result, sorted by the number of occurrences (descending).

    :return: the string ``"Done"``
    :raises Exception: when ``ngrams`` has more than two ``-`` separated parts
    :raises ValueError: when ``node_type`` is not a supported value
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input config file.")
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    # create queries
    # number of word columns in the output header; stays 0 in query mode
    ngrams = 0
    ngrams_range = [int(r) for r in config.get('settings', 'ngrams').split('-')]

    if ngrams_range[0] > 1:
        if len(ngrams_range) > 2:
            raise Exception('"ngrams" is not set up correctly')
        # a single number N is handled as the range N-N
        ngrams = ngrams_range[-1]
        query_tree = []
        for i in range(ngrams_range[0], ngrams_range[-1] + 1):
            query_tree.extend(create_ngrams_query_trees(i, [{}]))
    else:
        query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')]

    # only the trees are needed here, the dictionaries are not used
    all_trees, *_ = create_trees(config)

    # set filters
    output_string_functs = {
        'deprel': create_output_string_deprel,
        'lemma': create_output_string_lemma,
        'upos': create_output_string_upos,
        'xpos': create_output_string_xpos,
        'feats': create_output_string_feats,
        'form': create_output_string_form,
    }
    node_type = config.get('settings', 'node_type')
    if node_type not in output_string_functs:
        raise ValueError('"node_type" is not set up correctly')
    create_output_string_funct = output_string_functs[node_type]
    cpu_cores = config.getint('settings', 'cpu_cores')

    result_dict = {}
    filters = {}
    filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
    filters['dependency_type'] = config.get('settings', 'dependency_type') == 'labeled'
    if config.has_option('settings', 'label_whitelist'):
        filters['label_whitelist'] = config.get('settings', 'label_whitelist').split('|')
    else:
        filters['label_whitelist'] = []

    # root_whitelist: '|' separates alternatives, '&' separates the
    # attribute=value pairs that make up one alternative
    filters['root_whitelist'] = []
    if config.has_option('settings', 'root_whitelist'):
        for option in config.get('settings', 'root_whitelist').split('|'):
            attribute_dict = {}
            for attribute in option.split('&'):
                value = attribute.split('=')
                attribute_dict[value[0]] = value[1]
            filters['root_whitelist'].append(attribute_dict)

    filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'

    if cpu_cores > 1:
        # the worker pool is only started when it is actually used
        with Pool(cpu_cores) as p:
            start_exe_time = time.time()
            all_subtrees = p.map(tree_calculations,
                                 [(tree, query_tree, create_output_string_funct, filters) for tree in all_trees])
    else:
        start_exe_time = time.time()
        # lazily evaluated in the counting loop below
        all_subtrees = (tree_calculations((tree, query_tree, create_output_string_funct, filters))
                        for tree in all_trees)

    for subtrees in all_subtrees:
        _count_results(subtrees, result_dict)

    print("Execution time:")
    print("--- %s seconds ---" % (time.time() - start_exe_time))

    sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)

    # explicit encoding so that non-ASCII word forms can always be written
    with open(config.get('settings', 'output'), "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter='\t')
        # header - one "Word #" column per node
        if ngrams:
            len_words = ngrams
        else:
            # nodes and operators alternate, so every second token is a node
            len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
        header = ["Structure"] + ["Word #" + str(i) for i in range(1, len_words + 1)] + ['Number of occurences']
        writer.writerow(header)

        # body
        for k, v in sorted_list:
            words_only = printable_answers(k)
            writer.writerow([k] + words_only + [str(v['number'])])

    return "Done"


if __name__ == "__main__":
    # report the wall-clock time of the whole run
    start_time = time.time()
    main()
    print("Total:")
    total_seconds = time.time() - start_time
    print("--- %s seconds ---" % total_seconds)
-												Added parameter for path of config.ini input + Added dependency_relation into queries

											
										
										
											5 years ago
+								import argparse
-												Initial commit

											
										
										
											5 years ago
+								import configparser
-												Added unlimited ngrams

											
										
										
											5 years ago
+								import copy
-												Added ngram=2 calculations + removed some bugs

											
										
										
											5 years ago
+								import csv
-												Initial commit

											
										
										
											5 years ago
+								import hashlib
 								import os
 								import pickle
 								import re
-												Added multiprocessing.

											
										
										
											5 years ago
+								import time
 								import timeit
 								from multiprocessing import Pool
-												Initial commit

											
										
										
											5 years ago
 								import pyconll
-												Added feats in node_type

											
										
										
											5 years ago
+								from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
-												Initial commit

											
										
										
											5 years ago
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								# for separate searches of feats
-												Added root filtering.

											
										
										
											5 years ago
+								feats_detailed_list = [
 								    # lexical features
 								    'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
 								    # Inflectional features (nominal)
 								    'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
 								    # Inflectional features (verbal)
 								    'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
 								    # Other
 								    'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
 								]
 								feats_detailed_dict = {key: {} for key in feats_detailed_list}
-												Added feats to queries

											
										
										
											5 years ago
-												Initial commit

											
										
										
											5 years ago
-												Added parameter for path of config.ini input + Added dependency_relation into queries

											
										
										
											5 years ago
+								def decode_query(orig_query, dependency_type):
-												Initial commit

											
										
										
											5 years ago
+								    new_query = False
 								    # if command in bracelets remove them and treat command as new query
 								    if orig_query[0] == '(' and orig_query[-1] == ')':
 								        new_query = True
 								        orig_query = orig_query[1:-1]
 								    # if orig_query is '_' return {}
-												Added parameter for path of config.ini input + Added dependency_relation into queries

											
										
										
											5 years ago
+								    if dependency_type != '':
 								        decoded_query = {'deprel': dependency_type}
 								    else:
 								        decoded_query = {}
-												Initial commit

											
										
										
											5 years ago
+								    if orig_query == '_':
-												Added parameter for path of config.ini input + Added dependency_relation into queries

											
										
										
											5 years ago
+								        return decoded_query
-												Added some fixes

											
										
										
											5 years ago
+								    # if no spaces in query then this is query node and do this otherwise further split query
 								    elif len(orig_query.split(' ')) == 1:
-												Added filtering with &

											
										
										
											5 years ago
+								        orig_query_split_parts = orig_query.split(' ')[0].split('&')
 								        for orig_query_split_part in orig_query_split_parts:
-												Added fixed node order

											
										
										
											5 years ago
+								            orig_query_split = orig_query_split_part.split('=', 1)
-												Added filtering with &

											
										
										
											5 years ago
+								            if len(orig_query_split) > 1:
 								                if orig_query_split[0] == 'L':
 								                    decoded_query['lemma'] = orig_query_split[1]
 								                    # return decoded_query
 								                elif orig_query_split[0] == 'upos':
 								                    decoded_query['upos'] = orig_query_split[1]
 								                    # return decoded_query
 								                elif orig_query_split[0] == 'xpos':
 								                    decoded_query['xpos'] = orig_query_split[1]
 								                    # return decoded_query
 								                elif orig_query_split[0] == 'form':
 								                    decoded_query['form'] = orig_query_split[1]
 								                    # return decoded_query
 								                elif orig_query_split[0] == 'feats':
 								                    decoded_query['feats'] = orig_query_split[1]
 								                    # return decoded_query
-												Added root filtering.

											
										
										
											5 years ago
+								                elif orig_query_split[0] in feats_detailed_list:
 								                    decoded_query['feats_detailed'] = {}
 								                    decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1]
 								                    return decoded_query
-												Added filtering with &

											
										
										
											5 years ago
+								                elif not new_query:
 								                    raise Exception('Not supported yet!')
 								                else:
 								                    print('???')
-												Added some fixes

											
										
										
											5 years ago
+								            elif not new_query:
-												Added filtering with &

											
										
										
											5 years ago
+								                decoded_query['form'] = orig_query_split_part
 								                # return decoded_query
 								        return decoded_query
-												Initial commit

											
										
										
											5 years ago
 								    # split over spaces if not inside braces
-												Major changes - node_order is unimportant

											
										
										
											5 years ago
+								    # PATTERN = re.compile(r'''((?:[^ ()]|\([^.]*\))+)''')
 								    # all_orders = PATTERN.split(orig_query)
 								    # PATTERN = re.compile(r"(?:[^ ()]|\([^.]*\))+")
 								    # all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", orig_query)
 								    all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query)
-												Initial commit

											
										
										
											5 years ago
 								    # all_orders = orig_query.split()
 								    node_actions = all_orders[::2]
 								    priority_actions = all_orders[1::2]
 								    priority_actions_beginnings = [a[0] for a in priority_actions]
 								    # find root index
 								    try:
 								        root_index = priority_actions_beginnings.index('>')
 								    except ValueError:
 								        root_index = len(priority_actions)
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								    children = []
-												Initial commit

											
										
										
											5 years ago
+								    root = None
 								    for i, node_action in enumerate(node_actions):
 								        if i < root_index:
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								            children.append(decode_query(node_action, priority_actions[i][1:]))
-												Initial commit

											
										
										
											5 years ago
+								        elif i > root_index:
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								            children.append(decode_query(node_action, priority_actions[i - 1][1:]))
-												Initial commit

											
										
										
											5 years ago
+								        else:
-												Added parameter for path of config.ini input + Added dependency_relation into queries

											
										
										
											5 years ago
+								            root = decode_query(node_action, dependency_type)
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								    if children:
 								        root["children"] = children
-												Initial commit

											
										
										
											5 years ago
+								    return root
 								def create_trees(config):
 								    internal_saves = config.get('settings', 'internal_saves')
 								    input_path = config.get('settings', 'input')
 								    hash_object = hashlib.sha1(input_path.encode('utf-8'))
 								    hex_dig = hash_object.hexdigest()
 								    trees_read_outputfile = os.path.join(internal_saves, hex_dig)
 								    if not os.path.exists(trees_read_outputfile):
 								        train = pyconll.load_from_file(input_path)
-												Added root filtering.

											
										
										
											5 years ago
+								        form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict = {}, {}, {}, {}, {}, {}
-												Initial commit

											
										
										
											5 years ago
 								        all_trees = []
 								        for sentence in train:
 								            root = None
 								            root_id = None
 								            token_nodes = []
 								            for token in sentence:
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								                # token_feats = ''
 								                # for k, v in token.feats.items():
 								                #     token_feats += k + next(iter(v)) + '|'
 								                # token_feats = token_feats[:-1]
-												Adding results class to code

											
										
										
											5 years ago
+								                if not token.id.isdigit():
 								                    continue
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								                # TODO check if 5th place is always there for feats
-												Added root filtering.

											
										
										
											5 years ago
+								                feats = token._fields[5]
-												Adding results class to code

											
										
										
											5 years ago
+								                node = Tree(int(token.id), token.form, token.lemma, token.upos, token.xpos, token.deprel, feats, token.feats, form_dict,
-												Added root filtering.

											
										
										
											5 years ago
+								                            lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict, feats_detailed_dict, token.head)
-												Initial commit

											
										
										
											5 years ago
+								                token_nodes.append(node)
 								                if token.deprel == 'root':
 								                    root = node
 								                    root_id = int(token.id)
 								            for token_id, token in enumerate(token_nodes):
-												Base algorithm working for simple queries, complex queries so far dont

											
										
										
											5 years ago
+								                if int(token.parent) == 0:
-												Initial commit

											
										
										
											5 years ago
+								                    token.set_parent(None)
 								                else:
 								                    parent_id = int(token.parent) - 1
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								                    # if token_id < parent_id:
 								                    #     token_nodes[parent_id].add_l_child(token)
 								                    # elif token_id > parent_id:
 								                    #     token_nodes[parent_id].add_r_child(token)
 								                    # else:
 								                    #     raise Exception('Root element should not be here!')
 								                    if token_nodes[parent_id].children_split == -1 and token_id > parent_id:
 								                        token_nodes[parent_id].children_split = len(token_nodes[parent_id].children)
 								                    token_nodes[parent_id].add_child(token)
-												Initial commit

											
										
										
											5 years ago
+								                    token.set_parent(token_nodes[parent_id])
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								            for token in token_nodes:
 								                if token.children_split == -1:
 								                    token.children_split = len(token.children)
-												Initial commit

											
										
										
											5 years ago
+								            if root == None:
 								                raise Exception('No root element in sentence!')
 								            all_trees.append(root)
 								        with open(trees_read_outputfile, 'wb') as output:
 								            pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict), output)
 								    else:
 								        print('Reading trees:')
 								        print('Completed')
 								        with open(trees_read_outputfile, 'rb') as pkl_file:
 								            (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = pickle.load(pkl_file)
 								    return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict
-												Major changes - node_order is unimportant

											
										
										
											5 years ago
 								# def order_independent_queries(query_tree):
 								#     all_children = query_tree['l_children'] + query_tree['r_children']
 								#     if all_children > 0:
 								#
 								#     else:
 								#         return query_tree
 								#     pass
 								def printable_answers(query):
 								    # all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query)
 								    all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query)
 								    # all_orders = orig_query.split()
 								    node_actions = all_orders[::2]
 								    # priority_actions = all_orders[1::2]
 								    if len(node_actions) > 1:
 								        res = []
-												Refactoring original code + partialy added node_order=yes

											
										
										
											5 years ago
+								        # for node_action in node_actions[:-1]:
 								        #     res.extend(printable_answers(node_action[1:-1]))
 								        # res.extend([node_actions[-1]])
 								        for node_action in node_actions:
-												Fixed output - no more bracelets later

											
										
										
											5 years ago
+								            # if command in bracelets remove them and treat command as new query
-												Fixed nesting bug + unworking ngrams + Added dependency_type

											
										
										
											5 years ago
+								            # TODO FIX BRACELETS IN A BETTER WAY
 								            if not node_action:
 								                res.extend(['('])
 								            elif node_action[0] == '(' and node_action[-1] == ')':
-												Fixed output - no more bracelets later

											
										
										
											5 years ago
+								                res.extend(printable_answers(node_action[1:-1]))
 								            else:
 								                res.extend([node_action])
-												Major changes - node_order is unimportant

											
										
										
											5 years ago
+								        return res
 								    else:
 								        return [query]
-												Added multiprocessing.

											
										
										
											5 years ago
 								def tree_calculations(input_data):
 								    tree, query_tree, create_output_string_funct, filters = input_data
-												Fixes on results

											
										
										
											5 years ago
+								    _, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
-												Added multiprocessing.

											
										
										
											5 years ago
+								    return subtrees
 								def tree_calculations_chunks(input_data):
 								    trees, query_tree, create_output_string_funct, filters = input_data
 								    result_dict = {}
 								    for tree in trees:
-												Fixes on results

											
										
										
											5 years ago
+								        _, subtrees = tree.get_subtrees(query_tree, [], create_output_string_funct, filters)
-												Added multiprocessing.

											
										
										
											5 years ago
 								        for query_results in subtrees:
 								            for r in query_results:
 								                if r in result_dict:
 								                    result_dict[r] += 1
 								                else:
 								                    result_dict[r] = 1
 								    return result_dict
 								def chunkify(a, n):
 								    k, m = divmod(len(a), n)
 								    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
-												Added unlimited ngrams

											
										
										
											5 years ago
def add_node(tree):
    """Append a new empty child node to *tree* in place.

    Args:
        tree: a query-tree dict; its ``'children'`` list is created when
            missing.
    """
    tree.setdefault('children', []).append({})
 								# walk over all nodes in tree and add a node to each possible node
 								def tree_grow(orig_tree):
 								    new_trees = []
 								    new_tree = copy.deepcopy(orig_tree)
 								    add_node(new_tree)
 								    new_trees.append(new_tree)
 								    if 'children' in orig_tree:
 								        children = []
 								        for child_tree in orig_tree['children']:
 								            children.append(tree_grow(child_tree))
 								        for i, child in enumerate(children):
 								            for child_res in child:
 								                new_tree = copy.deepcopy(orig_tree)
 								                new_tree['children'][i] = child_res
 								                new_trees.append(new_tree)
 								    return new_trees
 								def compare_trees(tree1, tree2):
 								    if tree1 == {} and tree2 == {}:
 								        return True
 								    if 'children' not in tree1 or 'children' not in tree2 or len(tree1['children']) != len(tree2['children']):
 								        return False
 								    children2_connections = []
 								    for child1_i, child1 in enumerate(tree1['children']):
 								        child_duplicated = False
 								        for child2_i, child2 in enumerate(tree2['children']):
 								            if child2_i in children2_connections:
 								                pass
 								            if compare_trees(child1, child2):
 								                children2_connections.append(child2_i)
 								                child_duplicated = True
 								                break
 								        if not child_duplicated:
 								            return False
 								    return True
 								def create_ngrams_query_trees(n, trees):
 								    for i in range(n - 1):
 								        new_trees = []
 								        for tree in trees:
 								            # append new_tree only if it is not already inside
 								            for new_tree in tree_grow(tree):
 								                duplicate = False
 								                for confirmed_new_tree in new_trees:
 								                    if compare_trees(new_tree, confirmed_new_tree):
 								                        duplicate = True
 								                        break
 								                if not duplicate:
 								                    new_trees.append(new_tree)
 								        trees = new_trees
 								        # delete_duplicates(trees)
 								        # print('here')
 								    # tree_grow(tree)
 								    # tree_grow(tree)
 								    # tree['children'] = [{}]
 								    return trees
-												Initial commit

											
										
										
											5 years ago
def _tally_results(result_dict, subtrees):
    """Count every result in *subtrees* into *result_dict*, keyed by ``result.key``."""
    for query_results in subtrees:
        for r in query_results:
            entry = result_dict.get(r.key)
            if entry is None:
                result_dict[r.key] = {'object': r, 'number': 1}
            else:
                entry['number'] += 1


def main():
    """Command-line entry point: run the configured query and write a TSV report.

    Reads the ``--config_file`` argument and uses these options of its
    ``settings`` section: ``ngrams``, ``query`` (only when not in n-gram
    mode), ``node_type``, ``cpu_cores``, ``node_order``, ``dependency_type``,
    ``tree_type``, ``output`` and the optional ``label_whitelist`` and
    ``root_whitelist`` (``create_trees`` additionally reads ``input`` and
    ``internal_saves``).

    The output file has one row per distinct result, sorted by descending
    frequency: the result key, its individual words and the occurrence count.

    Returns:
        The string ``"Done"``.

    Raises:
        ValueError: if ``ngrams`` is neither a number nor a ``from-to`` range,
            or if ``node_type`` is not one of the supported values.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input config file.")
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    # create queries
    # ``ngrams`` stays 0 in explicit-query mode; in n-gram mode it holds the
    # largest requested tree size and determines the header width below.
    ngrams = 0
    ngrams_range = [int(r) for r in config.get('settings', 'ngrams').split('-')]
    if ngrams_range[0] > 1:
        if len(ngrams_range) == 1:
            query_tree = create_ngrams_query_trees(ngrams_range[0], [{}])
        elif len(ngrams_range) == 2:
            query_tree = []
            for i in range(ngrams_range[0], ngrams_range[1] + 1):
                query_tree.extend(create_ngrams_query_trees(i, [{}]))
        else:
            raise ValueError('"ngrams" must be a single number or a "from-to" range')
        ngrams = max(ngrams_range)
    else:
        query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')]

    # Number of generated query trees noted by the original author:
    # 261 - 9 grams
    # 647 - 10 grams
    # 1622 - 11 grams
    # 4126 - 12 grams
    # 10598 - 13 grams

    (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)

    # set filters
    output_string_functs = {
        'deprel': create_output_string_deprel,
        'lemma': create_output_string_lemma,
        'upos': create_output_string_upos,
        'xpos': create_output_string_xpos,
        'form': create_output_string_form,
        'feats': create_output_string_feats,
    }
    node_type = config.get('settings', 'node_type')
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if node_type not in output_string_functs:
        raise ValueError('"node_type" is not set up correctly')
    create_output_string_funct = output_string_functs[node_type]

    cpu_cores = config.getint('settings', 'cpu_cores')

    filters = {}
    filters['node_order'] = config.get('settings', 'node_order') == 'fixed'
    filters['dependency_type'] = config.get('settings', 'dependency_type') == 'labeled'
    if config.has_option('settings', 'label_whitelist'):
        filters['label_whitelist'] = config.get('settings', 'label_whitelist').split('|')
    else:
        filters['label_whitelist'] = []

    # root_whitelist format: alternatives separated by '|', each a list of
    # 'attribute=value' pairs joined by '&',
    # e.g. [{'upos': 'NOUN', 'Case': 'Nom'}, {'upos': 'ADJ', 'Degree': 'Sup'}]
    filters['root_whitelist'] = []
    if config.has_option('settings', 'root_whitelist'):
        for option in config.get('settings', 'root_whitelist').split('|'):
            attribute_dict = {}
            for attribute in option.split('&'):
                value = attribute.split('=')
                attribute_dict[value[0]] = value[1]
            filters['root_whitelist'].append(attribute_dict)

    filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'

    result_dict = {}
    if cpu_cores > 1:
        # 1.02 s (16 cores)
        with Pool(cpu_cores) as p:
            start_exe_time = time.time()
            all_subtrees = p.map(tree_calculations,
                                 [(tree, query_tree, create_output_string_funct, filters) for tree in all_trees])
            for subtrees in all_subtrees:
                _tally_results(result_dict, subtrees)
            exe_time = time.time() - start_exe_time
    else:
        # 3.65 s (1 core); no worker pool is started for a single core
        start_exe_time = time.time()
        for tree in all_trees:
            subtrees = tree_calculations((tree, query_tree, create_output_string_funct, filters))
            _tally_results(result_dict, subtrees)
        exe_time = time.time() - start_exe_time

    print("Execution time:")
    print("--- %s seconds ---" % exe_time)

    sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)

    with open(config.get('settings', 'output'), "w", newline="") as f:
        writer = csv.writer(f, delimiter='\t')

        # header
        if ngrams:
            len_words = ngrams
        else:
            # every second space-separated piece of the query is a word
            len_words = len(config.get('settings', 'query').split(" ")) // 2 + 1
        header = ["Structure"] + ["Word #" + str(i) for i in range(1, len_words + 1)] + ['Number of occurences']
        writer.writerow(header)

        # body
        for k, v in sorted_list:
            words_only = printable_answers(k)
            writer.writerow([k] + words_only + [str(v['number'])])

    return "Done"
-												Initial commit

											
										
										
											5 years ago
 								if __name__ == "__main__":
-												Added multiprocessing.

											
										
										
											5 years ago
+								    start_time = time.time()
-												Initial commit

											
										
										
											5 years ago
+								    main()
-												Added multiprocessing.

											
										
										
											5 years ago
+								    print("Total:")
 								    print("--- %s seconds ---" % (time.time() - start_time))