Added an option to compare the results of one corpus with those of another
This commit is contained in:
parent
91274a36af
commit
91a7ddd84c
|
@ -18,6 +18,7 @@ import configparser
|
||||||
import copy
|
import copy
|
||||||
import csv
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
|
@ -420,45 +421,13 @@ def read_filters(config, args, feats_detailed_list):
|
||||||
|
|
||||||
return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types
|
return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
|
|
||||||
## Required parameters
|
|
||||||
parser.add_argument("--config_file", default=None, type=str, required=True, help="The input config file.")
|
|
||||||
parser.add_argument("--input", default=None, type=str, help="The input file/folder.")
|
|
||||||
parser.add_argument("--output", default=None, type=str, help="The output file.")
|
|
||||||
parser.add_argument("--internal_saves", default=None, type=str, help="Location for internal_saves.")
|
|
||||||
parser.add_argument("--cpu_cores", default=None, type=int, help="Number of cores used.")
|
|
||||||
|
|
||||||
parser.add_argument("--tree_size", default=None, type=int, help="Size of trees.")
|
|
||||||
parser.add_argument("--tree_type", default=None, type=str, help="Tree type.")
|
|
||||||
parser.add_argument("--dependency_type", default=None, type=str, help="Dependency type.")
|
|
||||||
parser.add_argument("--node_order", default=None, type=str, help="Order of node.")
|
|
||||||
parser.add_argument("--node_type", default=None, type=str, help="Type of node.")
|
|
||||||
|
|
||||||
parser.add_argument("--label_whitelist", default=None, type=str, help="Label whitelist.")
|
|
||||||
parser.add_argument("--root_whitelist", default=None, type=str, help="Root whitelist.")
|
|
||||||
|
|
||||||
parser.add_argument("--query", default=None, type=str, help="Query.")
|
|
||||||
|
|
||||||
parser.add_argument("--lines_threshold", default=None, type=str, help="Lines treshold.")
|
|
||||||
parser.add_argument("--frequency_threshold", default=None, type=int, help="Frequency threshold.")
|
|
||||||
parser.add_argument("--association_measures", default=None, type=bool, help="Association measures.")
|
|
||||||
parser.add_argument("--print_root", default=None, type=bool, help="Print root.")
|
|
||||||
parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.")
|
|
||||||
parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
config = configparser.ConfigParser()
|
|
||||||
config.read(args.config_file)
|
|
||||||
|
|
||||||
internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves
|
|
||||||
input_path = config.get('settings', 'input') if not args.input else args.input
|
|
||||||
|
|
||||||
|
def process(input_path, internal_saves, config, args):
|
||||||
if os.path.isdir(input_path):
|
if os.path.isdir(input_path):
|
||||||
|
|
||||||
checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
|
checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
|
||||||
continuation_processing = config.getboolean('settings', 'continuation_processing', fallback=False) if not args.continuation_processing else args.input
|
continuation_processing = config.getboolean('settings', 'continuation_processing',
|
||||||
|
fallback=False) if not args.continuation_processing else args.input
|
||||||
|
|
||||||
if not checkpoint_path.exists() or not continuation_processing:
|
if not checkpoint_path.exists() or not continuation_processing:
|
||||||
already_processed = set()
|
already_processed = set()
|
||||||
|
@ -483,14 +452,14 @@ def main():
|
||||||
path_str = str(path)
|
path_str = str(path)
|
||||||
|
|
||||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
|
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
|
||||||
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
|
feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list,
|
||||||
|
save=False)
|
||||||
|
|
||||||
corpus_size += sub_corpus_size
|
corpus_size += sub_corpus_size
|
||||||
|
|
||||||
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
|
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
|
||||||
config, args, feats_detailed_list)
|
config, args, feats_detailed_list)
|
||||||
|
|
||||||
|
|
||||||
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
|
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
|
||||||
result_dict)
|
result_dict)
|
||||||
|
|
||||||
|
@ -518,13 +487,70 @@ def main():
|
||||||
result_dict = {}
|
result_dict = {}
|
||||||
unigrams_dict = {}
|
unigrams_dict = {}
|
||||||
|
|
||||||
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(config, args, feats_detailed_list)
|
filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(config,
|
||||||
|
args,
|
||||||
|
feats_detailed_list)
|
||||||
|
|
||||||
start_exe_time = time.time()
|
start_exe_time = time.time()
|
||||||
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
|
count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
|
||||||
|
|
||||||
print("Execution time:")
|
print("Execution time:")
|
||||||
print("--- %s seconds ---" % (time.time() - start_exe_time))
|
print("--- %s seconds ---" % (time.time() - start_exe_time))
|
||||||
|
|
||||||
|
return result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types
|
||||||
|
|
||||||
|
|
||||||
|
def get_keyness(abs_freq_A, abs_freq_B, count_A, count_B):
    """Compute the log-likelihood (LL) keyness of an item between two corpora.

    Args:
        abs_freq_A: Absolute frequency of the item in corpus A.
        abs_freq_B: Absolute frequency of the item in corpus B.
        count_A: Size of corpus A (the caller passes its ``corpus_size``).
        count_B: Size of corpus B (the caller passes the other corpus size).

    Returns:
        A one-element list, so the caller can extend an output row with
        ``row += get_keyness(...)``. The element is the LL value as a float,
        or the string ``'NaN'`` when LL is undefined for the given inputs.
    """
    # LL needs log(freq / expected) for both corpora; that is undefined when
    # a frequency is zero (log(0)) or a corpus is empty (expected value 0).
    # The original already reported 'NaN' for abs_freq_B == 0; use the same
    # sentinel for every undefined case instead of raising ValueError or
    # ZeroDivisionError.
    if abs_freq_A <= 0 or abs_freq_B <= 0 or count_A <= 0 or count_B <= 0:
        return ['NaN']

    total_freq = abs_freq_A + abs_freq_B
    total_count = count_A + count_B

    # Expected frequencies if the item were spread evenly across both corpora.
    E1 = count_A * total_freq / total_count
    E2 = count_B * total_freq / total_count

    LL = 2 * ((abs_freq_A * math.log(abs_freq_A / E1)) + (abs_freq_B * math.log(abs_freq_B / E2)))

    return [LL]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
## Required parameters
|
||||||
|
parser.add_argument("--config_file", default=None, type=str, required=True, help="The input config file.")
|
||||||
|
parser.add_argument("--input", default=None, type=str, help="The input file/folder.")
|
||||||
|
parser.add_argument("--output", default=None, type=str, help="The output file.")
|
||||||
|
parser.add_argument("--internal_saves", default=None, type=str, help="Location for internal_saves.")
|
||||||
|
parser.add_argument("--cpu_cores", default=None, type=int, help="Number of cores used.")
|
||||||
|
|
||||||
|
parser.add_argument("--tree_size", default=None, type=int, help="Size of trees.")
|
||||||
|
parser.add_argument("--tree_type", default=None, type=str, help="Tree type.")
|
||||||
|
parser.add_argument("--dependency_type", default=None, type=str, help="Dependency type.")
|
||||||
|
parser.add_argument("--node_order", default=None, type=str, help="Order of node.")
|
||||||
|
parser.add_argument("--node_type", default=None, type=str, help="Type of node.")
|
||||||
|
|
||||||
|
parser.add_argument("--label_whitelist", default=None, type=str, help="Label whitelist.")
|
||||||
|
parser.add_argument("--root_whitelist", default=None, type=str, help="Root whitelist.")
|
||||||
|
|
||||||
|
parser.add_argument("--query", default=None, type=str, help="Query.")
|
||||||
|
|
||||||
|
parser.add_argument("--lines_threshold", default=None, type=str, help="Lines treshold.")
|
||||||
|
parser.add_argument("--frequency_threshold", default=None, type=int, help="Frequency threshold.")
|
||||||
|
parser.add_argument("--association_measures", default=None, type=bool, help="Association measures.")
|
||||||
|
parser.add_argument("--print_root", default=None, type=bool, help="Print root.")
|
||||||
|
parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.")
|
||||||
|
parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.")
|
||||||
|
parser.add_argument("--compare", default=None, type=str, help="Corpus with which we want to compare statistics.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
config.read(args.config_file)
|
||||||
|
|
||||||
|
internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves
|
||||||
|
input_path = config.get('settings', 'input') if not args.input else args.input
|
||||||
|
|
||||||
|
result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types = process(input_path, internal_saves, config, args)
|
||||||
|
|
||||||
|
if args.compare is not None:
|
||||||
|
other_input_path = args.compare
|
||||||
|
other_result_dict, other_tree_size_range, other_filters, other_corpus_size, other_unigrams_dict, other_node_types = process(other_input_path, internal_saves, config, args)
|
||||||
|
|
||||||
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
|
sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)
|
||||||
|
|
||||||
output = config.get('settings', 'output') if not args.output else args.output
|
output = config.get('settings', 'output') if not args.output else args.output
|
||||||
|
@ -547,6 +573,8 @@ def main():
|
||||||
header += ['Root node']
|
header += ['Root node']
|
||||||
if filters['association_measures']:
|
if filters['association_measures']:
|
||||||
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
|
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
|
||||||
|
if args.compare:
|
||||||
|
header += ['LL']
|
||||||
writer.writerow(header)
|
writer.writerow(header)
|
||||||
|
|
||||||
if filters['lines_threshold']:
|
if filters['lines_threshold']:
|
||||||
|
@ -570,6 +598,9 @@ def main():
|
||||||
row += [v['object'].node.name]
|
row += [v['object'].node.name]
|
||||||
if filters['association_measures']:
|
if filters['association_measures']:
|
||||||
row += get_collocabilities(v, unigrams_dict, corpus_size)
|
row += get_collocabilities(v, unigrams_dict, corpus_size)
|
||||||
|
if args.compare:
|
||||||
|
other_abs_freq = other_result_dict[k]['number'] if k in other_result_dict else 0
|
||||||
|
row += get_keyness(v['number'], other_abs_freq, corpus_size, other_corpus_size)
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
return "Done"
|
return "Done"
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
input_path = '/home/luka/Development/STARK/data/ud-treebanks-v2.11/'
|
input_path = '/home/lukakrsnik/STARK/data/ud-treebanks-v2.11/'
|
||||||
output_path = '/home/luka/Development/STARK/results/ud-treebanks-v2.11_B/'
|
output_path = '/home/lukakrsnik/STARK/results/ud-treebanks-v2.11_B/'
|
||||||
|
config_path = '/home/lukakrsnik/STARK/data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini'
|
||||||
|
|
||||||
for path in sorted(os.listdir(input_path)):
|
for path in sorted(os.listdir(input_path)):
|
||||||
path_obj = Path(input_path, path)
|
path_obj = Path(input_path, path)
|
||||||
|
@ -13,5 +14,4 @@ for path in sorted(os.listdir(input_path)):
|
||||||
if not os.path.exists(folder_name):
|
if not os.path.exists(folder_name):
|
||||||
os.makedirs(folder_name)
|
os.makedirs(folder_name)
|
||||||
if not os.path.exists(file_name):
|
if not os.path.exists(file_name):
|
||||||
# os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file config.ini --input " + str(path) + " --output " + file_name)
|
os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file " + config_path + " --input " + str(path) + " --output " + file_name)
|
||||||
os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini --input " + str(path) + " --output " + file_name)
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user