Sorted input paths + fixed certain errors (tokens without a parent, parallel branch only when cpu_cores > 1)
This commit is contained in:
parent
f1bba0e58f
commit
36223d3a98
|
@ -186,6 +186,10 @@ def create_trees(input_path, internal_saves, feats_detailed_dict={}, save=True):
|
||||||
corpus_size += 1
|
corpus_size += 1
|
||||||
|
|
||||||
for token_id, token in enumerate(token_nodes):
|
for token_id, token in enumerate(token_nodes):
|
||||||
|
if isinstance(token.parent, int) or token.parent == '':
|
||||||
|
root = None
|
||||||
|
print('No parent: ' + sentence.id)
|
||||||
|
break
|
||||||
if int(token.parent) == 0:
|
if int(token.parent) == 0:
|
||||||
token.set_parent(None)
|
token.set_parent(None)
|
||||||
else:
|
else:
|
||||||
|
@ -380,7 +384,7 @@ def count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, f
|
||||||
# else:
|
# else:
|
||||||
# result_dict[r_k] = r_v
|
# result_dict[r_k] = r_v
|
||||||
# 1.02 s (16 cores)
|
# 1.02 s (16 cores)
|
||||||
if cpu_cores > 0:
|
if cpu_cores > 1:
|
||||||
# input_data = (tree, query_tree, create_output_string_functs, filters)
|
# input_data = (tree, query_tree, create_output_string_functs, filters)
|
||||||
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
|
all_unigrams = p.map(get_unigrams, [(tree, query_tree, create_output_string_functs, filters) for tree in all_trees])
|
||||||
for unigrams in all_unigrams:
|
for unigrams in all_unigrams:
|
||||||
|
@ -559,16 +563,17 @@ def main():
|
||||||
already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
|
already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
|
||||||
checkpoint_path)
|
checkpoint_path)
|
||||||
|
|
||||||
for path in os.listdir(input_path):
|
for path in sorted(os.listdir(input_path)):
|
||||||
path_obj = Path(input_path, path)
|
path_obj = Path(input_path, path)
|
||||||
pathlist = path_obj.glob('**/*.conllu')
|
pathlist = path_obj.glob('**/*.conllu')
|
||||||
if path_obj.name in already_processed:
|
if path_obj.name in already_processed:
|
||||||
continue
|
continue
|
||||||
start_exe_time = time.time()
|
start_exe_time = time.time()
|
||||||
for path in pathlist:
|
for path in sorted(pathlist):
|
||||||
# because path is object not string
|
# because path is object not string
|
||||||
path_str = str(path)
|
path_str = str(path)
|
||||||
|
# if Path(path_str).name == 'GF0003946-dedup.conllu':
|
||||||
|
# break
|
||||||
# print(path_in_str)
|
# print(path_in_str)
|
||||||
|
|
||||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
|
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
|
||||||
|
@ -585,9 +590,10 @@ def main():
|
||||||
|
|
||||||
already_processed.add(path_obj.name)
|
already_processed.add(path_obj.name)
|
||||||
|
|
||||||
|
# 15.26
|
||||||
print("Execution time:")
|
print("Execution time:")
|
||||||
print("--- %s seconds ---" % (time.time() - start_exe_time))
|
print("--- %s seconds ---" % (time.time() - start_exe_time))
|
||||||
|
# print(1 + 'asd')
|
||||||
save_zipped_pickle(
|
save_zipped_pickle(
|
||||||
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
|
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
|
||||||
checkpoint_path, protocol=2)
|
checkpoint_path, protocol=2)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user