Added an option to compare one corpus's results with those of another corpus

2023-01-20 18:20:03 +01:00 · 2023-01-20 18:20:03 +01:00 · 91a7ddd84c
commit 91a7ddd84c
parent 91274a36af
2 changed files with 103 additions and 72 deletions
--- a/dependency-parsetree.py
+++ b/dependency-parsetree.py
@ -18,6 +18,7 @@ import configparser
 import copy
 import csv
 import hashlib
+import math
 import os
 import pickle
 import re
@ -420,6 +421,94 @@ def read_filters(config, args, feats_detailed_list):

    return filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types

+
+def process(input_path, internal_saves, config, args):
+    """Count query-tree matches in one corpus and return the raw statistics.
+
+    If ``input_path`` is a directory, each of its entries is searched
+    recursively for ``*.conllu`` files. After every entry the accumulated
+    state is written to ``checkpoint.pkl`` inside ``internal_saves`` so that
+    an interrupted run can be resumed when continuation processing is on.
+    Otherwise ``input_path`` is passed to ``create_trees`` as is.
+
+    :param input_path: directory or single file to process
+    :param internal_saves: directory used for the checkpoint file and passed
+        on to ``create_trees``
+    :param config: parsed ``ConfigParser`` with a ``settings`` section
+    :param args: parsed command-line arguments
+    :return: tuple ``(result_dict, tree_size_range, filters, corpus_size,
+        unigrams_dict, node_types)``
+    """
+    # Bound up front so the final return cannot hit an unbound name when the
+    # directory branch has nothing left to process.
+    filters = tree_size_range = node_types = None
+
+    if os.path.isdir(input_path):
+        # NOTE(review): the checkpoint name does not depend on input_path, so
+        # two corpora processed with the same internal_saves (e.g. --compare)
+        # share one checkpoint file - confirm this is intended.
+        checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
+
+        # The command-line flag takes precedence over the config file.
+        # (Previously this fell back to args.input, which is None whenever the
+        # input path comes from the config file, silently disabling the flag.)
+        if args.continuation_processing:
+            continuation_processing = args.continuation_processing
+        else:
+            continuation_processing = config.getboolean('settings', 'continuation_processing',
+                                                        fallback=False)
+
+        if checkpoint_path.exists() and continuation_processing:
+            # Resume from the state saved after the last completed entry.
+            already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
+                checkpoint_path)
+        else:
+            # Fresh start: drop any stale checkpoint.
+            already_processed = set()
+            result_dict = {}
+            unigrams_dict = {}
+            corpus_size = 0
+            feats_detailed_list = {}
+            if checkpoint_path.exists():
+                os.remove(checkpoint_path)
+
+        for entry_name in sorted(os.listdir(input_path)):
+            path_obj = Path(input_path, entry_name)
+            if path_obj.name in already_processed:
+                continue
+            start_exe_time = time.time()
+            for conllu_path in sorted(path_obj.glob('**/*.conllu')):
+                # create_trees is given a string, not a Path object
+                (all_trees, _, _, _, _, _, sub_corpus_size,
+                 feats_detailed_list) = create_trees(str(conllu_path), internal_saves,
+                                                     feats_detailed_dict=feats_detailed_list, save=False)
+
+                corpus_size += sub_corpus_size
+
+                # Re-read per file because feats_detailed_list may have changed.
+                filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
+                    config, args, feats_detailed_list)
+
+                count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
+                            result_dict)
+
+            already_processed.add(path_obj.name)
+
+            # 15.26
+            print("Execution time:")
+            print("--- %s seconds ---" % (time.time() - start_exe_time))
+            save_zipped_pickle(
+                (already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
+                checkpoint_path, protocol=2)
+
+        if filters is None:
+            # Nothing was processed in this run (every entry was already in
+            # the checkpoint, or no .conllu file was found); the caller still
+            # needs the filter settings.
+            filters, _, _, _, tree_size_range, node_types = read_filters(config, args, feats_detailed_list)
+
+    else:
+        # 261 - 9 grams
+        # 647 - 10 grams
+        # 1622 - 11 grams
+        # 4126 - 12 grams
+        # 10598 - 13 grams
+        (all_trees, _, _, _, _, _, corpus_size,
+         feats_detailed_list) = create_trees(input_path, internal_saves)
+
+        result_dict = {}
+        unigrams_dict = {}
+
+        filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
+            config, args, feats_detailed_list)
+
+        start_exe_time = time.time()
+        count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
+
+        print("Execution time:")
+        print("--- %s seconds ---" % (time.time() - start_exe_time))
+
+    return result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types
+
+
+def get_keyness(abs_freq_A, abs_freq_B, count_A, count_B):
+    """Return the log-likelihood keyness of an item between two corpora.
+
+    :param abs_freq_A: absolute frequency of the item in corpus A
+    :param abs_freq_B: absolute frequency of the item in corpus B
+    :param count_A: size of corpus A
+    :param count_B: size of corpus B
+    :return: one-element list ``[LL]``; ``LL`` is the string ``'NaN'`` when
+        the item does not occur in corpus B or when either corpus is empty,
+        otherwise a float
+    """
+    # Absent from B keeps the existing 'NaN' result; an empty corpus would
+    # otherwise divide by zero below.
+    if abs_freq_B <= 0 or count_A <= 0 or count_B <= 0:
+        return ['NaN']
+
+    pooled_freq = abs_freq_A + abs_freq_B
+    total_count = count_A + count_B
+    # Expected frequencies if the item were spread evenly over both corpora.
+    E1 = count_A * pooled_freq / total_count
+    E2 = count_B * pooled_freq / total_count
+
+    # A zero observed frequency contributes nothing (0 * ln 0 is taken as 0),
+    # which avoids math.log raising ValueError on 0.
+    term_A = abs_freq_A * math.log(abs_freq_A / E1) if abs_freq_A > 0 else 0.0
+    term_B = abs_freq_B * math.log(abs_freq_B / E2)
+
+    LL = 2 * (term_A + term_B)
+
+    return [LL]
+
+
 def main():
    parser = argparse.ArgumentParser()

@ -447,6 +536,7 @@ def main():
    parser.add_argument("--print_root", default=None, type=bool, help="Print root.")
    parser.add_argument("--nodes_number", default=None, type=bool, help="Nodes number.")
    parser.add_argument("--continuation_processing", default=None, type=bool, help="Nodes number.")
+    parser.add_argument("--compare", default=None, type=str, help="Corpus with which we want to compare statistics.")
    args = parser.parse_args()

    config = configparser.ConfigParser()
@ -455,76 +545,12 @@ def main():
    internal_saves = config.get('settings', 'internal_saves') if not args.internal_saves else args.internal_saves
    input_path = config.get('settings', 'input') if not args.input else args.input

-    if os.path.isdir(input_path):
+    result_dict, tree_size_range, filters, corpus_size, unigrams_dict, node_types = process(input_path, internal_saves, config, args)

-        checkpoint_path = Path(internal_saves, 'checkpoint.pkl')
-        continuation_processing = config.getboolean('settings', 'continuation_processing', fallback=False) if not args.continuation_processing else args.input
+    if args.compare is not None:
+        other_input_path = args.compare
+        other_result_dict, other_tree_size_range, other_filters, other_corpus_size, other_unigrams_dict, other_node_types = process(other_input_path, internal_saves, config, args)

-        if not checkpoint_path.exists() or not continuation_processing:
-            already_processed = set()
-            result_dict = {}
-            unigrams_dict = {}
-            corpus_size = 0
-            feats_detailed_list = {}
-            if checkpoint_path.exists():
-                os.remove(checkpoint_path)
-        else:
-            already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list = load_zipped_pickle(
-                checkpoint_path)
-
-        for path in sorted(os.listdir(input_path)):
-            path_obj = Path(input_path, path)
-            pathlist = path_obj.glob('**/*.conllu')
-            if path_obj.name in already_processed:
-                continue
-            start_exe_time = time.time()
-            for path in sorted(pathlist):
-                # because path is object not string
-                path_str = str(path)
-
-                (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, sub_corpus_size,
-                 feats_detailed_list) = create_trees(path_str, internal_saves, feats_detailed_dict=feats_detailed_list, save=False)
-
-                corpus_size += sub_corpus_size
-
-                filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(
-                    config, args, feats_detailed_list)
-
-
-                count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict,
-                            result_dict)
-
-            already_processed.add(path_obj.name)
-
-            # 15.26
-            print("Execution time:")
-            print("--- %s seconds ---" % (time.time() - start_exe_time))
-            save_zipped_pickle(
-                (already_processed, result_dict, unigrams_dict, corpus_size, feats_detailed_list),
-                checkpoint_path, protocol=2)
-
-
-
-
-    else:
-    # 261 - 9 grams
-    # 647 - 10 grams
-    # 1622 - 11 grams
-    # 4126 - 12 grams
-    # 10598 - 13 grams
-        (all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size,
-         feats_detailed_list) = create_trees(input_path, internal_saves)
-
-        result_dict = {}
-        unigrams_dict = {}
-
-        filters, query_tree, create_output_string_functs, cpu_cores, tree_size_range, node_types = read_filters(config, args, feats_detailed_list)
-
-        start_exe_time = time.time()
-        count_trees(cpu_cores, all_trees, query_tree, create_output_string_functs, filters, unigrams_dict, result_dict)
-
-        print("Execution time:")
-        print("--- %s seconds ---" % (time.time() - start_exe_time))
    sorted_list = sorted(result_dict.items(), key=lambda x: x[1]['number'], reverse=True)

    output = config.get('settings', 'output') if not args.output else args.output
@ -547,6 +573,8 @@ def main():
            header += ['Root node']
        if filters['association_measures']:
            header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
+        if args.compare:
+            header += ['LL']
        writer.writerow(header)

        if filters['lines_threshold']:
@ -570,6 +598,9 @@ def main():
                row += [v['object'].node.name]
            if filters['association_measures']:
                row += get_collocabilities(v, unigrams_dict, corpus_size)
+            if args.compare:
+                other_abs_freq = other_result_dict[k]['number'] if k in other_result_dict else 0
+                row += get_keyness(v['number'], other_abs_freq, corpus_size, other_corpus_size)
            writer.writerow(row)

    return "Done"
--- a/run-multiple-depparse.py
+++ b/run-multiple-depparse.py
@ -1,8 +1,9 @@
 import os
 from pathlib import Path

-input_path = '/home/luka/Development/STARK/data/ud-treebanks-v2.11/'
-output_path = '/home/luka/Development/STARK/results/ud-treebanks-v2.11_B/'
+input_path = '/home/lukakrsnik/STARK/data/ud-treebanks-v2.11/'
+output_path = '/home/lukakrsnik/STARK/results/ud-treebanks-v2.11_B/'
+config_path = '/home/lukakrsnik/STARK/data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini'

 for path in sorted(os.listdir(input_path)):
    path_obj = Path(input_path, path)
@ -13,5 +14,4 @@ for path in sorted(os.listdir(input_path)):
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        if not os.path.exists(file_name):
-            # os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file config.ini --input " + str(path) + " --output " + file_name)
-            os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file data/B_test-all-treebanks_3_completed_unlabeled_fixed_form_root=NOUN_5.ini --input " + str(path) + " --output " + file_name)
+            os.system("python /home/luka/Development/STARK/dependency-parsetree.py --config_file " + config_path + " --input " + str(path) + " --output " + file_name)