Added option to compare one corpus results with the other

This commit is contained in:
Luka 2023-01-20 18:20:03 +01:00
parent 91274a36af
commit 91a7ddd84c
2 changed files with 103 additions and 72 deletions

View File

@ -18,6 +18,7 @@ import configparser
import copy import copy
import csv import csv
import hashlib import hashlib
import math
import os import os
import pickle import pickle
import re import re
@ -420,6 +421,94 @@ def read_filters(config, args, feats_detailed_list):
return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types
def process(input_path, internal_saves, config, args):
if os.path.isdir(input_path):
checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
continuation_processing = config.getboolean('settings', 'continuation_processing',
fallback=False) if not args.continuation_processing else args.input
if not checkpoint_path.exists() or not continuation_processing:
already_processed = set()
result_dict = {}
unigrams_dict = {}
corpus_size = 0
feats_detailed_list = {}
if checkpoint_path.exists():
os.remove(checkpoint_path)
else:
already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
checkpoint_path)
for path in sorted(os.listdir(input_path)):
path_obj = Path(input_path, path)
pathlist = path_obj.glob('**/*.conllu')
if path_obj.name in already_processed:
continue
start_exe_time = time.time()
for path in sorted(pathlist):
# because path is object not string
path_str = str(path)
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list,
save=False)
corpus_size += sub_corpus_size
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
config, args, feats_detailed_list)
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
result_dict)
already_processed.add(path_obj.name)
# 15.26
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
save_zipped_pickle(
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
checkpoint_path, protocol=2)
else:
# 261 - 9 grams
# 647 - 10 grams
# 1622 - 11 grams
# 4126 - 12 grams
# 10598 - 13 grams
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size,
feats_detailed_list) = create_trees(input_path, internal_saves)
result_dict = {}
unigrams_dict = {}
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(config,
args,
feats_detailed_list)
start_exe_time = time.time()
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
return result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types
def get_keyness(abs_freq_A, abs_freq_B, count_A, count_B):
E1 = count_A * (abs_freq_A + abs_freq_B) / (count_A + count_B)
E2 = count_B * (abs_freq_A + abs_freq_B) / (count_A + count_B)
LL = 2 * ((abs_freq_A * math.log(abs_freq_A / E1)) + (abs_freq_B * math.log(abs_freq_B / E2))) if abs_freq_B > 0 else 'NaN'
return [LL]
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@ -447,6 +536,7 @@ def main():
parser.add_argument("--print_root", default=None, type=bool, help="Print root.") parser.add_argument("--print_root", default=None, type=bool, help="Print root.")
parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.") parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.")
parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.") parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.")
parser.add_argument("--compare", default=None, type=str, help="Corpus with which we want to compare statistics.")
args = parser.parse_args() args = parser.parse_args()
config = configparser.ConfigParser() config = configparser.ConfigParser()
@ -455,76 +545,12 @@ def main():
internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves
input_path = config.get('settings', 'input') if not args.input else args.input input_path = config.get('settings', 'input') if not args.input else args.input
if os.path.isdir(input_path): result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types = process(input_path, internal_saves, config, args)
checkpoint_path = Path(internal_saves, 'checkpoint.pkl') if args.compare is not None:
continuation_processing = config.getboolean('settings', 'continuation_processing', fallback=False) if not args.continuation_processing else args.input other_input_path = args.compare
other_result_dict, other_tree_size_range, other_filters, other_corpus_size, other_unigrams_dict, other_node_types = process(other_input_path, internal_saves, config, args)
if not checkpoint_path.exists() or not continuation_processing:
already_processed = set()
result_dict = {}
unigrams_dict = {}
corpus_size = 0
feats_detailed_list = {}
if checkpoint_path.exists():
os.remove(checkpoint_path)
else:
already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
checkpoint_path)
for path in sorted(os.listdir(input_path)):
path_obj = Path(input_path, path)
pathlist = path_obj.glob('**/*.conllu')
if path_obj.name in already_processed:
continue
start_exe_time = time.time()
for path in sorted(pathlist):
# because path is object not string
path_str = str(path)
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
corpus_size += sub_corpus_size
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
config, args, feats_detailed_list)
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
result_dict)
already_processed.add(path_obj.name)
# 15.26
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
save_zipped_pickle(
(already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
checkpoint_path, protocol=2)
else:
# 261 - 9 grams
# 647 - 10 grams
# 1622 - 11 grams
# 4126 - 12 grams
# 10598 - 13 grams
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size,
feats_detailed_list) = create_trees(input_path, internal_saves)
result_dict = {}
unigrams_dict = {}
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(config, args, feats_detailed_list)
start_exe_time = time.time()
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
print("Execution time:")
print("--- %s seconds ---" % (time.time() - start_exe_time))
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True) sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
output = config.get('settings', 'output') if not args.output else args.output output = config.get('settings', 'output') if not args.output else args.output
@ -547,6 +573,8 @@ def main():
header += ['Root node'] header += ['Root node']
if filters['association_measures']: if filters['association_measures']:
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL'] header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
if args.compare:
header += ['LL']
writer.writerow(header) writer.writerow(header)
if filters['lines_threshold']: if filters['lines_threshold']:
@ -570,6 +598,9 @@ def main():
row += [v['object'].node.name] row += [v['object'].node.name]
if filters['association_measures']: if filters['association_measures']:
row += get_collocabilities(v, unigrams_dict, corpus_size) row += get_collocabilities(v, unigrams_dict, corpus_size)
if args.compare:
other_abs_freq = other_result_dict[k]['number'] if k in other_result_dict else 0
row += get_keyness(v['number'], other_abs_freq, corpus_size, other_corpus_size)
writer.writerow(row) writer.writerow(row)
return "Done" return "Done"

View File

@ -1,8 +1,9 @@
import os import os
from pathlib import Path from pathlib import Path
input_path = '/home/luka/Development/STARK/data/ud-treebanks-v2.11/' input_path = '/home/lukakrsnik/STARK/data/ud-treebanks-v2.11/'
output_path = '/home/luka/Development/STARK/results/ud-treebanks-v2.11_B/' output_path = '/home/lukakrsnik/STARK/results/ud-treebanks-v2.11_B/'
config_path = '/home/lukakrsnik/STARK/data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini'
for path in sorted(os.listdir(input_path)): for path in sorted(os.listdir(input_path)):
path_obj = Path(input_path, path) path_obj = Path(input_path, path)
@ -13,5 +14,4 @@ for path in sorted(os.listdir(input_path)):
if not os.path.exists(folder_name): if not os.path.exists(folder_name):
os.makedirs(folder_name) os.makedirs(folder_name)
if not os.path.exists(file_name): if not os.path.exists(file_name):
# os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file config.ini --input " + str(path) + " --output " + file_name) os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file " + config_path + " --input " + str(path) + " --output " + file_name)
os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini --input " + str(path) + " --output " + file_name)