From 91a7ddd84c98a54baa824cfa512ba78953e4448b Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 20 Jan 2023 18:20:03 +0100 Subject: [PATCH] Added option to compare one corpus results with the other --- dependency-parsetree.py | 117 +++++++++++++++++++++++++-------------- run-multiple-depparse.py | 8 +-- 2 files changed, 78 insertions(+), 47 deletions(-) diff --git a/dependency-parsetree.py b/dependency-parsetree.py index 7990b5a..8ede84c 100644 --- a/dependency-parsetree.py +++ b/dependency-parsetree.py @@ -18,6 +18,7 @@ import configparser import copy import csv import hashlib +import math import os import pickle import re @@ -420,45 +421,13 @@ def read_filters(config, args, feats_detailed_list): return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--config_file", default=None, type=str, required=True, help="The input config file.") - parser.add_argument("--input", default=None, type=str, help="The input file/folder.") - parser.add_argument("--output", default=None, type=str, help="The output file.") - parser.add_argument("--internal_saves", default=None, type=str, help="Location for internal_saves.") - parser.add_argument("--cpu_cores", default=None, type=int, help="Number of cores used.") - - parser.add_argument("--tree_size", default=None, type=int, help="Size of trees.") - parser.add_argument("--tree_type", default=None, type=str, help="Tree type.") - parser.add_argument("--dependency_type", default=None, type=str, help="Dependency type.") - parser.add_argument("--node_order", default=None, type=str, help="Order of node.") - parser.add_argument("--node_type", default=None, type=str, help="Type of node.") - - parser.add_argument("--label_whitelist", default=None, type=str, help="Label whitelist.") - parser.add_argument("--root_whitelist", default=None, type=str, help="Root whitelist.") - - parser.add_argument("--query", 
default=None, type=str, help="Query.") - - parser.add_argument("--lines_threshold", default=None, type=str, help="Lines treshold.") - parser.add_argument("--frequency_threshold", default=None, type=int, help="Frequency threshold.") - parser.add_argument("--association_measures", default=None, type=bool, help="Association measures.") - parser.add_argument("--print_root", default=None, type=bool, help="Print root.") - parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.") - parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.") - args = parser.parse_args() - - config = configparser.ConfigParser() - config.read(args.config_file) - - internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves - input_path = config.get('settings', 'input') if not args.input else args.input +def process(input_path, internal_saves, config, args): if os.path.isdir(input_path): checkpoint_path = Path(internal_saves, 'checkpoint.pkl') - continuation_processing = config.getboolean('settings', 'continuation_processing', fallback=False) if not args.continuation_processing else args.input + continuation_processing = config.getboolean('settings', 'continuation_processing', + fallback=False) if not args.continuation_processing else args.input if not checkpoint_path.exists() or not continuation_processing: already_processed = set() @@ -483,14 +452,14 @@ def main(): path_str = str(path) (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size, - feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False) + feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, + save=False) corpus_size += sub_corpus_size filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters( config, args, feats_detailed_list) - 
def get_keyness(abs_freq_A, abs_freq_B, count_A, count_B):
    """Return the log-likelihood (LL) keyness of one item across two corpora.

    The observed frequencies are compared with the frequencies that would be
    expected if the item were distributed over both corpora in proportion to
    their sizes:

        E1 = count_A * (abs_freq_A + abs_freq_B) / (count_A + count_B)
        E2 = count_B * (abs_freq_A + abs_freq_B) / (count_A + count_B)
        LL = 2 * (abs_freq_A * ln(abs_freq_A / E1)
                  + abs_freq_B * ln(abs_freq_B / E2))

    Args:
        abs_freq_A: Absolute frequency of the item in corpus A.
        abs_freq_B: Absolute frequency of the item in corpus B.
        count_A: Size of corpus A.
        count_B: Size of corpus B.

    Returns:
        A one-element list, so the value can be appended to an output row
        with ``row += get_keyness(...)``. The element is the LL score as a
        float, or the string ``'NaN'`` when the score cannot be computed
        because a frequency or a corpus size is not strictly positive.
    """
    # Each logarithm and division below needs strictly positive operands.
    # The 'NaN' marker was already used for abs_freq_B == 0; apply it to the
    # remaining undefined inputs too, instead of letting math.log raise
    # ValueError or the divisions raise ZeroDivisionError.
    if min(abs_freq_A, abs_freq_B, count_A, count_B) <= 0:
        return ['NaN']

    total_freq = abs_freq_A + abs_freq_B
    total_count = count_A + count_B

    # Expected frequencies under the "same relative frequency" hypothesis.
    E1 = count_A * total_freq / total_count
    E2 = count_B * total_freq / total_count

    LL = 2 * ((abs_freq_A * math.log(abs_freq_A / E1)) + (abs_freq_B * math.log(abs_freq_B / E2)))

    return [LL]
help="Number of cores used.") + + parser.add_argument("--tree_size", default=None, type=int, help="Size of trees.") + parser.add_argument("--tree_type", default=None, type=str, help="Tree type.") + parser.add_argument("--dependency_type", default=None, type=str, help="Dependency type.") + parser.add_argument("--node_order", default=None, type=str, help="Order of node.") + parser.add_argument("--node_type", default=None, type=str, help="Type of node.") + + parser.add_argument("--label_whitelist", default=None, type=str, help="Label whitelist.") + parser.add_argument("--root_whitelist", default=None, type=str, help="Root whitelist.") + + parser.add_argument("--query", default=None, type=str, help="Query.") + + parser.add_argument("--lines_threshold", default=None, type=str, help="Lines treshold.") + parser.add_argument("--frequency_threshold", default=None, type=int, help="Frequency threshold.") + parser.add_argument("--association_measures", default=None, type=bool, help="Association measures.") + parser.add_argument("--print_root", default=None, type=bool, help="Print root.") + parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.") + parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.") + parser.add_argument("--compare", default=None, type=str, help="Corpus with which we want to compare statistics.") + args = parser.parse_args() + + config = configparser.ConfigParser() + config.read(args.config_file) + + internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves + input_path = config.get('settings', 'input') if not args.input else args.input + + result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types = process(input_path, internal_saves, config, args) + + if args.compare is not None: + other_input_path = args.compare + other_result_dict, other_tree_size_range, other_filters, other_corpus_size, other_unigrams_dict, 
other_node_types = process(other_input_path, internal_saves, config, args) + sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True) output = config.get('settings', 'output') if not args.output else args.output @@ -547,6 +573,8 @@ def main(): header += ['Root node'] if filters['association_measures']: header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL'] + if args.compare: + header += ['LL'] writer.writerow(header) if filters['lines_threshold']: @@ -570,6 +598,9 @@ def main(): row += [v['object'].node.name] if filters['association_measures']: row += get_collocabilities(v, unigrams_dict, corpus_size) + if args.compare: + other_abs_freq = other_result_dict[k]['number'] if k in other_result_dict else 0 + row += get_keyness(v['number'], other_abs_freq, corpus_size, other_corpus_size) writer.writerow(row) return "Done" diff --git a/run-multiple-depparse.py b/run-multiple-depparse.py index c4f19ef..3ea8dc0 100644 --- a/run-multiple-depparse.py +++ b/run-multiple-depparse.py @@ -1,8 +1,9 @@ import os from pathlib import Path -input_path = '/home/luka/Development/STARK/data/ud-treebanks-v2.11/' -output_path = '/home/luka/Development/STARK/results/ud-treebanks-v2.11_B/' +input_path = '/home/lukakrsnik/STARK/data/ud-treebanks-v2.11/' +output_path = '/home/lukakrsnik/STARK/results/ud-treebanks-v2.11_B/' +config_path = '/home/lukakrsnik/STARK/data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini' for path in sorted(os.listdir(input_path)): path_obj = Path(input_path, path) @@ -13,5 +14,4 @@ for path in sorted(os.listdir(input_path)): if not os.path.exists(folder_name): os.makedirs(folder_name) if not os.path.exists(file_name): - # os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file config.ini --input " + str(path) + " --output " + file_name) - os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file 
data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini --input " + str(path) + " --output " + file_name) + os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file " + config_path + " --input " + str(path) + " --output " + file_name)