Cleaning repo

This commit is contained in:
2019-12-30 11:30:36 +01:00
parent 36223d3a98
commit 421f12cac6
5 changed files with 8 additions and 606 deletions
+4 -148
View File
@@ -23,11 +23,13 @@ import pickle
import re
import string
import time
import timeit
from multiprocessing import Pool
from pathlib import Path
import gzip
import sys
import pyconll
from Tree import Tree
from generic import get_collocabilities, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
sys.setrecursionlimit(25000)
def save_zipped_pickle(obj, filename, protocol=-1):
@@ -39,31 +41,6 @@ def load_zipped_pickle(filename):
loaded_object = pickle.load(f)
return loaded_object
import pyconll
from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
# for separate searches of feats
# feats_detailed_list = [
# # lexical features
# 'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
#
# # Inflectional features (nominal)
# 'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
#
# # Inflectional features (verbal)
# 'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
#
# # Other
# 'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
# ]
# feats_detailed_list = []
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
from generic import get_collocabilities
def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = False
@@ -72,7 +49,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
new_query = True
orig_query = orig_query[1:-1]
# if orig_query is '_' return {}
if dependency_type != '':
decoded_query = {'deprel': dependency_type}
else:
@@ -88,19 +64,14 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
if len(orig_query_split) > 1:
if orig_query_split[0] == 'L':
decoded_query['lemma'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'upos':
decoded_query['upos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'xpos':
decoded_query['xpos'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'form':
decoded_query['form'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] == 'feats':
decoded_query['feats'] = orig_query_split[1]
# return decoded_query
elif orig_query_split[0] in feats_detailed_list:
decoded_query['feats_detailed'] = {}
decoded_query['feats_detailed'][orig_query_split[0]] = orig_query_split[1]
@@ -111,18 +82,11 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
print('???')
elif not new_query:
decoded_query['form'] = orig_query_split_part
# return decoded_query
return decoded_query
# split over spaces if not inside braces
# PATTERN = re.compile(r'''((?:[^ ()]|\([^.]*\))+)''')
# all_orders = PATTERN.split(orig_query)
# PATTERN = re.compile(r"(?:[^ ()]|\([^.]*\))+")
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", orig_query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", orig_query)
# all_orders = orig_query.split()
node_actions = all_orders[::2]
priority_actions = all_orders[1::2]
priority_actions_beginnings = [a[0] for a in priority_actions]
@@ -148,8 +112,6 @@ def decode_query(orig_query, dependency_type, feats_detailed_list):
def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
# internal_saves = filters['internal_saves']
# input_path = filters['input']
hash_object = hashlib.sha1(input_path.encode('utf-8'))
hex_dig = hash_object.hexdigest()
trees_read_outputfile = os.path.join(internal_saves, hex_dig)
@@ -165,13 +127,8 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
for sentence in train:
root = None
root_id = None
token_nodes = []
for token in sentence:
# token_feats = ''
# for k, v in token.feats.items():
# token_feats += k + next(iter(v)) + '|'
# token_feats = token_feats[:-1]
if not token.id.isdigit():
continue
@@ -194,12 +151,6 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.set_parent(None)
else:
parent_id = int(token.parent) - 1
# if token_id < parent_id:
# token_nodes[parent_id].add_l_child(token)
# elif token_id > parent_id:
# token_nodes[parent_id].add_r_child(token)
# else:
# raise Exception('Root element should not be here!')
if token_nodes[parent_id].children_split == -1 and token_id > parent_id:
token_nodes[parent_id].children_split = len(token_nodes[parent_id].children)
token_nodes[parent_id].add_child(token)
@@ -210,35 +161,19 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
token.children_split = len(token.children)
if root == None:
# print(input_path)
print('No root: ' + sentence.id)
continue
# raise Exception('No root element in sentence!')
all_trees.append(root)
if save:
save_zipped_pickle((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), trees_read_outputfile, protocol=2)
# with open(trees_read_outputfile, 'wb') as output:
#
# pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), output)
else:
print('Reading trees:')
print('Completed')
all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict = load_zipped_pickle(trees_read_outputfile)
# with open(trees_read_outputfile, 'rb') as pkl_file:
# (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict) = pickle.load(pkl_file)
return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict
# def order_independent_queries(query_tree):
# all_children = query_tree['l_children'] + query_tree['r_children']
# if all_children > 0:
#
# else:
# return query_tree
# pass
def printable_answers(query):
# all_orders = re.findall(r"(?:[^ ()]|\([^]*\))+", query)
all_orders = re.split(r"\s+(?=[^()]*(?:\(|$))", query)
@@ -293,11 +228,6 @@ def tree_calculations_chunks(input_data):
return result_dict
def chunkify(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
def add_node(tree):
if 'children' in tree:
tree['children'].append({})
@@ -362,30 +292,11 @@ def create_ngrams_query_trees(n, trees):
new_trees.append(new_tree)
trees = new_trees
# delete_duplicates(trees)
# print('here')
# tree_grow(tree)
# tree_grow(tree)
# tree['children'] = [{}]
return trees
def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict):
with Pool(cpu_cores) as p:
# 1.25 s (16 cores)
# chunked_trees = list(chunkify(all_trees, cpu_cores))
# if cpu_cores > 1:
# part_results = p.map(tree_calculations_chunks,
# [(tree, query_tree, create_output_string_funct, filters) for tree in chunked_trees])
#
# for part_result in part_results:
# for r_k, r_v in part_result.items():
# if r_k in result_dict:
# result_dict[r_k] += r_v
# else:
# result_dict[r_k] = r_v
# 1.02 s (16 cores)
if cpu_cores > 1:
# input_data = (tree, query_tree, create_output_string_functs, filters)
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
for unigrams in all_unigrams:
for unigram in unigrams:
@@ -396,24 +307,14 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
all_subtrees = p.map(tree_calculations, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
# for subtrees in all_subtrees:
for tree_i, subtrees in enumerate(all_subtrees):
for query_results in subtrees:
for r in query_results:
# if r.key == '(ne <advmod more >xcomp (se <expl izogniti) >punct .)':
# print('HERE')
# print(tree_i)
if filters['node_order']:
key = r.get_key() + r.order
else:
key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
# if r in result_dict:
# result_dict[r] += 1
# else:
# result_dict[r] = 1
if key in result_dict:
result_dict[key]['number'] += 1
else:
@@ -421,11 +322,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
# 3.65 s (1 core)
else:
# for tree_i, tree in enumerate(all_trees[-5:]):
for tree_i, tree in enumerate(all_trees):
# for tree_i, tree in enumerate(all_trees[852:]):
# for tree_i, tree in enumerate(all_trees[1689:]):
# for tree_i, tree in enumerate(all_trees[1:3]):
input_data = (tree, query_tree, create_output_string_functs, filters)
if filters['association_measures']:
unigrams = get_unigrams(input_data)
@@ -434,10 +331,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
unigrams_dict[unigram] += 1
else:
unigrams_dict[unigram] = 1
# for tree_i, tree in enumerate(all_trees[1:]):
# text = Če pa ostane odrasel otrok doma, se starši le težko sprijaznijo s tem, da je "velik", otrok pa ima ves čas občutek, da se njegovi starši po nepotrebnem vtikajo v njegovo življenje.
# for tree_i, tree in enumerate(all_trees[5170:]):
# for tree in all_trees:
subtrees = tree_calculations(input_data)
for query_results in subtrees:
for r in query_results:
@@ -445,8 +339,6 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
key = r.get_key() + r.order
else:
key = r.get_key()
# if r == '(" < , < je < velik) < tem':
# print(tree_i)
if key in result_dict:
result_dict[key]['number'] += 1
else:
@@ -465,7 +357,6 @@ def read_filters(config, feats_detailed_list):
query_tree.extend(create_ngrams_query_trees(i, [{}]))
else:
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '', feats_detailed_list)]
# order_independent_queries(query_tree)
# set filters
node_types = config.get('settings', 'node_type').split('+')
@@ -506,11 +397,8 @@ def read_filters(config, feats_detailed_list):
attribute_dict = {}
for attribute in option.split('&'):
value = attribute.split('=')
# assert value[0] in ['deprel', 'lemma', 'upos', 'xpos', 'form',
# 'feats'], '"root_whitelist" is not set up correctly'
attribute_dict[value[0]] = value[1]
filters['root_whitelist'].append(attribute_dict)
# filters['root_whitelist'] = [{'upos': 'NOUN', 'Case': 'Nom'}, {'upos': 'ADJ', 'Degree': 'Sup'}]
else:
filters['root_whitelist'] = []
@@ -540,12 +428,6 @@ def main():
internal_saves = config.get('settings', 'internal_saves')
input_path = config.get('settings', 'input')
# a = args.config_file
# config.read('config.ini')
# create queries
if os.path.isdir(input_path):
checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
@@ -572,9 +454,6 @@ def main():
for path in sorted(pathlist):
# because path is object not string
path_str = str(path)
# if Path(path_str).name == 'GF0003946-dedup.conllu':
# break
# print(path_in_str)
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
@@ -593,7 +472,6 @@ def main():
# 15.26
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
# print(1 + 'asd')
save_zipped_pickle(
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
checkpoint_path, protocol=2)
@@ -620,26 +498,6 @@ def main():
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
# test 1 layer queries
# # tree.r_children = []
# # tree.children[1].children = []
# # query = [{'children': [{}]}, {'children': [{}]}]
# # query = [{"children": [{}, {}]}, {"children": [{}]}, {"children": [{}, {}, {}]}]
# query = [{"children": [{'form': 'je'}, {}]}, {"children": [{'form': 'je'}]}, {"children": [{'form': 'je'}, {}, {}]}]
# # query = [{'q1':'', "children": [{'a1':''}, {'a2':''}]}, {'q2':'', "children": [{'b1':''}]}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}]}]
# _, _, subtrees = tree.get_subtrees(query, [], create_output_string_funct)
# # _, subtrees = tree.get_subtrees([{'q1':'', "children": [{'a1':''}, {'a2':''}], "children": []}, {'q2':'', "children": [{'b1':''}], "children": []}, {'q3':'', "children": [{'c1':''}, {'c2':''}, {'c3':''}], "children": []}], [])
# print('HERE!')
# test 2 layer queries
# tree.r_children = [Tree('je', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)]
# tree.l_children[1].l_children = []
# new_tree = Tree('bil', '', '', '', '', form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, None)
# new_tree.l_children = [tree]
# _, subtrees = new_tree.get_subtrees(
# [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}]}]}], [])
# # _, subtrees = new_tree.get_subtrees(
# # [{"l_children":[{"l_children": [{'a1': ''}, {'a2': ''}, {'a3': ''}, {'a4': ''}], "r_children": []}], "r_children": []}], [])
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
with open(config.get('settings', 'output'), "w", newline="") as f:
@@ -660,7 +518,6 @@ def main():
header += ['Root node']
if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
writer.writerow(header)
if filters['lines_threshold']:
@@ -673,7 +530,6 @@ def main():
if filters['frequency_threshold'] and filters['frequency_threshold'] > v['number']:
break
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
# words_only = printable_answers(k)
row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])]
row += ['%.4f' % relative_frequency]
if filters['node_order']: