diff --git a/.gitignore b/.gitignore index d2a5e97..249a9db 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ nohup.out data/kres_out/* data/kres_example/ +venv/ +.idea/ +data/ diff --git a/dockerfiles/python-java/Makefile b/dockerfiles/python-java/Makefile index 82b9f15..b30e0cd 100644 --- a/dockerfiles/python-java/Makefile +++ b/dockerfiles/python-java/Makefile @@ -15,6 +15,6 @@ run: -v /etc/group:/etc/group \ -v $(shell pwd)/../../:/cjvt-srl-tagging \ -w /cjvt-srl-tagging \ - -v /home/kristjan/kres_mount:/kres_mount:ro \ + -v /home/luka/Development/srl/data:/kres_mount:ro \ python-java \ /bin/bash diff --git a/tools/check_all_files_existence.py b/tools/check_all_files_existence.py new file mode 100644 index 0000000..54c484e --- /dev/null +++ b/tools/check_all_files_existence.py @@ -0,0 +1,19 @@ +import os + +# INPATH = Path(config["tools"]["giga_srl"]) +# infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()])) +SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files' +from shutil import copyfile + +INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl' +OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv' +for i in range(100000): + # print(os.path.join(INPATH, 'giga.%07d.tsv' % i)) + # if not os.path.exists(os.path.join(INPATH, 'giga.%07d.tsv' % i)): + # print('giga.%07d.tsv' % i) + if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)): + copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i), os.path.join(OUTPATH, 'giga%07d.tsv' % i)) + print('giga%07d.srl.tsv' % i) + + if i % 1000 == 0: + print(i) diff --git a/tools/fillpred_model.srl.tsv b/tools/fillpred_model.srl.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tools/find_diff_sentence_ids.py b/tools/find_diff_sentence_ids.py new file mode 100644 index 0000000..11a7dda --- /dev/null +++ b/tools/find_diff_sentence_ids.py @@ -0,0 +1,192 @@ +import pickle + +from parser.parser import Parser +import os +from os.path import join, dirname +from pathlib import Path +import re +import sys +import cProfile +import configparser +import logging +from multiprocessing import Pool + +SSJ500K_2_1 = 27829 # number of sentences +par = Parser() + +# path to data +config = configparser.ConfigParser() +config.read("tools.cfg") +analysis = '' +if 'kres_orig' in config["tools"]: + analysis = 'kres' + INDIR = Path(config["tools"]["kres_orig"]) + OUTDIR = Path(config["tools"]["kres_tsv"]) +elif 'giga_orig' in config["tools"]: + # analysis = 'gigafida' + analysis = 'giga' + INDIR_GIGA = Path(config["tools"]["giga_orig"]) + INDIR_GIGA_ORIG = Path(config["tools"]["giga"]) + INDIR_JOS = Path(config["tools"]["giga_jos"]) + OUTDIR = Path(config["tools"]["giga_tsv"]) + GIGA_PARTS = int(config["tools"]["giga_parts"]) + INTERNAL_DATA = config["tools"]["internal_data"] + +CPU_CORES = int(config["tools"]["cpu_cores"]) + +LOGFILE = Path(config["tools"]["logfile"]).absolute() +LOGFILE.touch(exist_ok=True) +LOGFILE.resolve() + +logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) + +origfiles = [] +for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG): + for file in files: + origfiles.append(Path(os.path.join(subdir, file))) +origfiles=list(enumerate(sorted(origfiles))) + +def giga_orig_sentence_generator(): + with open(INDIR_GIGA, 'r') as gof: + previous_new_line = False + sentence_words = [] + for l_gof in gof: + if l_gof == '\n': + yield ' '.join(sentence_words) + sentence_words = [] + else: + sentence_words.append(l_gof.split('\t')[0]) + # yield 
l_gof + +sentence_generator = giga_orig_sentence_generator() + +sentence_ids = [] +for origfile in origfiles: + split_file_sentences = par.parse_tei(origfile[1]) + for k, v in split_file_sentences.items(): + one_file_sentence = next(sentence_generator) + if one_file_sentence == v['text']: + sentence_ids.append(v['sid']) + else: + print('----------------') + print('ERROR') + print(v['sid']) + print(one_file_sentence) + print(v['text']) + print(origfile[0]) + +# count sentences in orig (if not counted before) +# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')) +if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')): + os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')) + +with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output: + pickle.dump(sentence_ids, output) + +# def giga_orig_generator(): +# with open(INDIR_GIGA, 'r') as gof: +# previous_new_line = False +# for l_gof in gof: +# if l_gof == '\n': +# if previous_new_line: +# continue +# previous_new_line = True +# elif previous_new_line: +# previous_new_line = False +# yield l_gof + +# import time +# def handle_giga_file(ran): +# """ +# File that splits big text file into more minor files. Only split on empty lines. +# """ +# # with open(INDIR_GIGA, 'r') as gof: +# # with open(INDIR_JOS, 'r') as gjf: +# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): +# # pass +# # num_lines = i + 1 +# # print(num_lines) +# num_lines = 1393184026 +# # 1393184026 +# # 1393184033 +# # return +# num_lines_per_part = num_lines / GIGA_PARTS +# curr_part = 0 +# gof_generator = giga_orig_generator() +# +# diff_files = set() +# # with open(INDIR_GIGA, 'r') as gof: +# with open(INDIR_GIGA_OLD, 'r') as gjf: +# # sentence = {} +# # sentence['tokens'] = [] +# # sentence['links'] = {} +# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])): +# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])) +# +# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a') +# +# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): +# for i, l_gjf in enumerate(gjf): +# l_gof = next(gof_generator) +# if curr_part < ran[0]: +# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' : +# if curr_part < ran[0]: +# print(curr_part) +# curr_part += 1 +# continue +# else: +# continue +# +# l_gof_split = l_gof.split('\t') +# l_gjf_split = l_gjf.split('\t') +# +# # if punctuation +# if l_gof != '\n': +# if l_gof_split != l_gjf_split: +# print(curr_part) +# diff_files.add(curr_part) +# l_gof = next(gof_generator) +# +# +# # if l_gof == '\n': +# else: +# # wf.flush() +# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': +# if i > num_lines_per_part * (curr_part + 1): +# curr_part += 1 +# # if wf doesn't exist (first one) +# # wf.close() +# if curr_part >= ran[1]: +# break +# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)): +# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)) +# +# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') +# +# curr_part += 1 +# return diff_files +# # wf.close() +# +# with Pool(CPU_CORES) as p: +# final_range = [0, 100000] +# # final_range = [0, 150] +# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES +# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)] +# # ranges = [] +# # ps = None +# # for i in range(CPU_CORES): +# # s = int(final_range[0] + size_per_proc * i) +# # ns = int(final_range[0] + size_per_proc * (i + 1)) +# # ranges.append([s, ns]) +# # # 
ranges = [[0, 1]] +# # res = p.map(handle_giga_file, ranges) +# +# res = handle_giga_file(final_range) +# res = sorted(list(res)) +# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')): +# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')) +# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file: +# pickle.dump(res, pkl_file) +# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file: +# # mydict2 = pickle.load(pkl_file) +# print('test') \ No newline at end of file diff --git a/tools/gen_json.kres.py b/tools/gen_json.kres.py new file mode 100644 index 0000000..ff1d2a4 --- /dev/null +++ b/tools/gen_json.kres.py @@ -0,0 +1,114 @@ +from pathlib import Path +from parser.parser import Parser +import configparser +import json +import sys +import logging +from multiprocessing import Pool + +# parse config +config = configparser.ConfigParser() +config.read("tools.cfg") +# ORIGPATH = Path(config["tools"]["kres_orig"]) +INPATH = Path(config["tools"]["giga_srl"]) +OUTPATH = Path(config["tools"]["kres_json"]) +DEBUG = config["tools"]["debug"] == "True" +CPU_CORES = int(config["tools"]["cpu_cores"]) + +LOGFILE = Path(config["tools"]["logfile"]).absolute() +LOGFILE.touch(exist_ok=True) +LOGFILE.resolve() + +logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) + +def get_origfile(filename): + for origfile in ORIGPATH.iterdir(): + if filename.name.split('.')[0] == origfile.name.split('.')[0]: + return origfile + raise FileNotFoundError + +def extract_sentences(line_reader): + acc = [] + # last char in line is \n, remove it + for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]: + if len(line) == 1: # empty line + tmp = acc + acc = [] + yield tmp + else: + acc.append(line) + +def to_sentence(sentence_arr): + return " ".join([token[1] for token in sentence_arr]) + +def match_sentence_id(sentence, orig_dict): + for k, e in orig_dict.items(): + orig_sentence = " ".join(token[2] for token in e["tokens"]) + if sentence == orig_sentence: + return k + raise KeyError + +def get_dep_rel(token): + logging.debug(token) + for i, field in enumerate(token[14:]): + if field != "_": + return { + "arg": field, + "from": i, # i-th predicate in sentence + "dep": token[0], + } + return None + +def handle_file(infile_tpl): + i = infile_tpl[0] + infile = infile_tpl[1] + outfile = (OUTPATH / infile.name).with_suffix(".json") + origfile = get_origfile(infile) + orig_dict = par.parse_tei(origfile) + + with infile.open("rb") as fp: + outdata = {} + for sentence_arr in extract_sentences(fp.readlines()): + # tsv dropped sentence ids, match the ID, using original data + sid = match_sentence_id(to_sentence(sentence_arr), orig_dict) + + outdata[sid] = [] + + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx + + deprel = get_dep_rel(token) + if deprel is not None: + outdata[sid].append(deprel) + + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in outdata[sid]: + deprel["from"] = predicates[deprel["from"]] + + if DEBUG: + print(to_sentence(sentence_arr)) + print(outdata[sid]) + print(sid) + print() + print() + + with outfile.open("w") as fp: + json.dump(outdata, fp) + logging.info("SRL relations written to: {}".format(outfile)) + + +# main +par = Parser() +OUTPATH.mkdir(exist_ok=True) + +infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()])) 
+logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles))) + +with Pool(CPU_CORES) as p: + p.map(handle_file, infiles) + +logging.info("Finished generating .json files.") diff --git a/tools/gen_json.py b/tools/gen_json.py index 628f597..1e8f821 100644 --- a/tools/gen_json.py +++ b/tools/gen_json.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import pickle from pathlib import Path from parser.parser import Parser import configparser @@ -9,9 +14,10 @@ from multiprocessing import Pool # parse config config = configparser.ConfigParser() config.read("tools.cfg") -ORIGPATH = Path(config["tools"]["kres_orig"]) -INPATH = Path(config["tools"]["kres_srl"]) -OUTPATH = Path(config["tools"]["kres_json"]) +ORIGPATH = Path(config["tools"]["giga"]) +INPATH = Path(config["tools"]["giga_srl"]) +OUTPATH = Path(config["tools"]["giga_json"]) +INTERNAL_DATA = Path(config["tools"]["internal_data"]) DEBUG = config["tools"]["debug"] == "True" CPU_CORES = int(config["tools"]["cpu_cores"]) @@ -48,6 +54,13 @@ def match_sentence_id(sentence, orig_dict): return k raise KeyError +def match_sentence_id_giga(sentence, orig_dict): + for k, e in orig_dict.items(): + # orig_sentence = " ".join(token[2] for token in e["tokens"]) + if sentence == e["text"]: + return k + raise KeyError + def get_dep_rel(token): logging.debug(token) for i, field in enumerate(token[14:]): @@ -59,7 +72,7 @@ def get_dep_rel(token): } return None -def handle_file(infile_tpl): +def handle_file_old(infile_tpl): i = infile_tpl[0] infile = infile_tpl[1] outfile = (OUTPATH / infile.name).with_suffix(".json") @@ -101,14 +114,283 @@ def handle_file(infile_tpl): logging.info("SRL relations written to: {}".format(outfile)) +def handle_file(whole_input): + # sentence_id = whole_input[0][3] + # orig_infile = whole_input[0][1] + sentence_id = whole_input[3] + orig_infile = whole_input[1] + + # origfile = origfiles[0][1] + # infile_tpl = infile_tpl[0] + + # i = infile_tpl[0] + # infile = infile_tpl[1] + outfile = (OUTPATH / orig_infile.name).with_suffix(".json") + + if outfile.exists(): + return + # origfile = get_origfile() + orig_dict = par.parse_tei(orig_infile) + outdata = {} + + gen = srl_multiple_files_sentences_generator(sentence_id) + # gen = srl_multiple_files_sentences_generator(whole_input[1]) + + mismatch_sentences = 0 + + for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()): + if orig_id == 'GF0014802.2685.7': + print('PAUSE') + + # look at neighbouring sentences if they are correct + for i in range(100): + sentence, sentence_arr = next(gen) + # orig_sentence = " ".join(token[2] for token in e["tokens"]) + if sentence == orig_val["text"]: + # if i != 10 and i != 0: + # print('OK!') + sid = orig_id + + outdata[sid] = [] + + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx + + deprel = get_dep_rel(token) + if deprel is not None: + outdata[sid].append(deprel) + + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in outdata[sid]: + deprel["from"] = predicates[deprel["from"]] + + if DEBUG: + print(to_sentence(sentence_arr)) + print(outdata[sid]) + print(sid) + print() + print() + break + else: + if i == 99: + mismatch_sentences += 1 + sid = orig_id + outdata[sid] = [] + gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i) + + if mismatch_sentences > 0: + if mismatch_sentences / len(orig_dict.items()) < 0.1: + print('Slight 
mismatch - %d' % sentence_id) + print(whole_input) + print('ABS mitigated %d' % mismatch_sentences) + print('------------------------------------------------') + else: + print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR') + print('Big mismatch - %d' % sentence_id) + print(whole_input) + print('ABS mitigated errors:') + print(mismatch_sentences) + print('------------------------------------------------') + + + with outfile.open("w") as fp: + json.dump(outdata, fp) + logging.info("SRL relations written to: {}".format(outfile)) + +def count_orig_file_sentences(filename): + + if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)): + return + print(filename[0]) + orig_dict = par.parse_tei(filename[1]) + # return filename[0], filename[1], len(orig_dict) + with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output: + pickle.dump((filename[0], filename[1], len(orig_dict)), output) + + +def count_srl_file_sentences(filename): + if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)): + return + + print(filename[0]) + num_sentences = 0 + with filename[1].open("r") as fp: + for line in fp: + if line == '\n': + num_sentences += 1 + + # return filename[0], filename[1], num_sentences + with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output: + pickle.dump((filename[0], filename[1], num_sentences), output) + +def srl_sentences_generator(infile, curr_index, sen_start_index): + with infile.open("rb") as fp: + outdata = {} + for sentence_arr in extract_sentences(fp.readlines()): + if curr_index < sen_start_index: + curr_index += 1 + else: + yield to_sentence(sentence_arr), sentence_arr + yield None + + +def srl_multiple_files_sentences_generator(sentence_id): # srl_files): + sentence_id = max(0, sentence_id - 10) + for i, srl_file in enumerate(srl_file_sizes): + if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]: + srl_files = srl_file_sizes[i:] + break + + for file_info in srl_files: + # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4]) + srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id) + el = next(srl_gen) + while el is not None: + yield el + el = next(srl_gen) + + yield None + + # main par = Parser() OUTPATH.mkdir(exist_ok=True) -infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()])) +infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()])) logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles))) +origfiles = [] +for subdir, dirs, files in os.walk(ORIGPATH): + for file in files: + origfiles.append(Path(os.path.join(subdir, file))) +origfiles=list(enumerate(sorted(origfiles))) +##### REMOVE ############ +# origfiles = origfiles[:3] + +# count sentences in orig (if not counted before) +# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')) +if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')): + # srl_file_sizes = {} + if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')): + os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks')) + # with Pool(CPU_CORES) as p: + # # p.map(handle_file, infiles) + # p.map(count_orig_file_sentences, origfiles) + for i in range(len(origfiles)): + count_orig_file_sentences(origfiles[i]) + orig_file_sizes = [] + for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())): + print(x.name) + if x.is_file(): + with x.open('rb') as pkl_small_file: + 
orig_file_sizes.append(pickle.load(pkl_small_file)) + # orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()])) + print("Sorting orig files") + orig_file_sizes = sorted(orig_file_sizes) + total_size = 0 + orig_file_sizes_final = [] + print("Calculating orig files size") + for n, pa, si in orig_file_sizes: + orig_file_sizes_final.append((n, pa, si, total_size)) + total_size += si + orig_file_sizes = orig_file_sizes_final + print("Saving orig files size") + with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output: + pickle.dump(orig_file_sizes, output) + print("Orig files saved") +else: + with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file: + orig_file_sizes = pickle.load(pkl_file) + + +# count sentences in srl (if not counted before) +# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')) +if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')): + # srl_file_sizes = {} + if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')): + os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks')) + # with Pool(CPU_CORES) as p: + # # p.map(handle_file, infiles) + # p.map(count_srl_file_sentences, infiles) + + for i in range(len(infiles)): + count_srl_file_sentences(infiles[i]) + + srl_file_sizes = [] + for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())): + print(x.name) + if x.is_file(): + with x.open('rb') as pkl_small_file: + srl_file_sizes.append(pickle.load(pkl_small_file)) + print("Sorting srl files") + srl_file_sizes = sorted(srl_file_sizes) + total_size = 0 + srl_file_sizes_final = [] + print("Calculating srl files size") + for n, pa, si in srl_file_sizes: + srl_file_sizes_final.append((n, pa, si, total_size)) + total_size += si + srl_file_sizes = srl_file_sizes_final + print("Saving srl files size") + with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output: + pickle.dump(srl_file_sizes, output) + print("Srl files saved") +else: + with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file: + srl_file_sizes = pickle.load(pkl_file) + + +# print(len(orig_file_sizes)) +# print('asd' + 2) + +# inputs = [] +# srl_i = 0 +# srl_file = srl_file_sizes[srl_i] +# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes: +# interesting_srl_files = [] +# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk +# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \ +# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size: +# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i: +# # if beginning of file is in +# if srl_file[3] > orig_first_sent_i: +# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3])) +# # print('if %d' % srl_file[3]) +# else: +# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i)) +# # print('else %d' % orig_first_sent_i) +# +# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]: +# srl_i += 1 +# if srl_i < len(srl_file_sizes): +# srl_file = srl_file_sizes[srl_i] +# else: +# break +# # print(srl_i) +# # print('a ' + 2) +# else: +# break +# +# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files]) + # print(inputs[-1]) + + + +# 
srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533) +# a = next(srl_gen) +# b = next(srl_gen) +# c = next(srl_gen) + +print('beginning processing') with Pool(CPU_CORES) as p: - p.map(handle_file, infiles) + # p.map(handle_file, inputs) + p.map(handle_file, orig_file_sizes) + +# for of in orig_file_sizes: +# handle_file(of) logging.info("Finished generating .json files.") diff --git a/tools/gen_json_fix_errors.py b/tools/gen_json_fix_errors.py new file mode 100644 index 0000000..3510eed --- /dev/null +++ b/tools/gen_json_fix_errors.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import pickle +from pathlib import Path +from parser.parser import Parser +import configparser +import json +import sys +import logging +from multiprocessing import Pool + +# parse config +config = configparser.ConfigParser() +config.read("tools.cfg") +ORIGPATH = Path(config["tools"]["giga"]) +INPATH = Path(config["tools"]["giga_srl_errors"]) +OUTPATH = Path(config["tools"]["giga_json"]) +INTERNAL_DATA = Path(config["tools"]["internal_data"]) +DEBUG = config["tools"]["debug"] == "True" +CPU_CORES = int(config["tools"]["cpu_cores"]) + +LOGFILE = Path(config["tools"]["logfile"]).absolute() +LOGFILE.touch(exist_ok=True) +LOGFILE.resolve() + +logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) +error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))] + + + + +def get_origfile(filename): + for origfile in ORIGPATH.iterdir(): + if filename.name.split('.')[0] == origfile.name.split('.')[0]: + return origfile + raise FileNotFoundError + +def extract_sentences(line_reader): + acc = [] + # last char in line is \n, remove it + for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]: + if len(line) == 1: # empty line + tmp = acc + acc = [] + yield tmp + else: + acc.append(line) + +def to_sentence(sentence_arr): + return " ".join([token[1] for token in sentence_arr]) + +def match_sentence_id(sentence, orig_dict): + for k, e in orig_dict.items(): + orig_sentence = " ".join(token[2] for token in e["tokens"]) + if sentence == orig_sentence: + return k + raise KeyError + +def match_sentence_id_giga(sentence, orig_dict): + for k, e in orig_dict.items(): + # orig_sentence = " ".join(token[2] for token in e["tokens"]) + if sentence == e["text"]: + return k + raise KeyError + +def get_dep_rel(token): + logging.debug(token) + for i, field in enumerate(token[14:]): + if field != "_": + return { + "arg": field, + "from": i, # i-th predicate in sentence + "dep": token[0], + } + return None + +def handle_file_old(infile_tpl): + i = infile_tpl[0] + infile = infile_tpl[1] + outfile = (OUTPATH / infile.name).with_suffix(".json") + origfile = get_origfile(infile) + orig_dict = par.parse_tei(origfile) + + with infile.open("rb") as fp: + outdata = {} + for sentence_arr in extract_sentences(fp.readlines()): + # tsv dropped sentence ids, match the ID, using original data + sid = match_sentence_id(to_sentence(sentence_arr), orig_dict) + + outdata[sid] = [] + + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx + + deprel = get_dep_rel(token) + if deprel is not None: + outdata[sid].append(deprel) + + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in outdata[sid]: + deprel["from"] = predicates[deprel["from"]] + + if DEBUG: + print(to_sentence(sentence_arr)) + 
print(outdata[sid]) + print(sid) + print() + print() + + with outfile.open("w") as fp: + json.dump(outdata, fp) + logging.info("SRL relations written to: {}".format(outfile)) + + +def fix_json(srl_gen, error_sentence, orig_json_data): + # sentence_id = whole_input[0][3] + # orig_infile = whole_input[0][1] + # sentence_id = whole_input[3] + # orig_infile = whole_input[1] + + # origfile = origfiles[0][1] + # infile_tpl = infile_tpl[0] + + # i = infile_tpl[0] + # infile = infile_tpl[1] + # outfile = (OUTPATH / orig_infile.name).with_suffix(".json") + + # if outfile.exists(): + # return + # origfile = get_origfile() + # orig_dict = par.parse_tei(orig_infile) + # outdata = {} + + # gen = srl_multiple_files_sentences_generator(sentence_id) + # gen = srl_multiple_files_sentences_generator(whole_input[1]) + + # mismatch_sentences = 0 + + # look at neighbouring sentences if they are correct + sentence, sentence_arr = next(srl_gen) + # orig_sentence = " ".join(token[2] for token in e["tokens"]) + sid = error_sentence + # a = orig_json_data[sid] + if orig_json_data[sid] != []: + # print('POSSIBLE ERROR:') + # print(orig_json_data[sid]) + orig_json_data[sid] = [] + + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx + + deprel = get_dep_rel(token) + if deprel is not None: + orig_json_data[sid].append(deprel) + + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in orig_json_data[sid]: + deprel["from"] = predicates[deprel["from"]] + + if DEBUG: + print(to_sentence(sentence_arr)) + print(orig_json_data[sid]) + print(sid) + print() + print() + # a = orig_json_data[sid] + return orig_json_data + +def count_orig_file_sentences(filename): + + if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)): + return + print(filename[0]) + orig_dict = par.parse_tei(filename[1]) + # return filename[0], filename[1], len(orig_dict) + with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output: + pickle.dump((filename[0], filename[1], len(orig_dict)), output) + + +def count_srl_file_sentences(filename): + if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)): + return + + print(filename[0]) + num_sentences = 0 + with filename[1].open("r") as fp: + for line in fp: + if line == '\n': + num_sentences += 1 + + # return filename[0], filename[1], num_sentences + with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output: + pickle.dump((filename[0], filename[1], num_sentences), output) + +def srl_error_fix_generator(infile): + with infile.open("rb") as fp: + for sentence_arr in extract_sentences(fp.readlines()): + yield to_sentence(sentence_arr), sentence_arr + yield None + +def srl_sentences_generator(infile, curr_index, sen_start_index): + with infile.open("rb") as fp: + outdata = {} + for sentence_arr in extract_sentences(fp.readlines()): + if curr_index < sen_start_index: + curr_index += 1 + else: + yield to_sentence(sentence_arr), sentence_arr + yield None + + +def srl_multiple_files_sentences_generator(sentence_id): # srl_files): + sentence_id = max(0, sentence_id - 10) + for i, srl_file in enumerate(srl_file_sizes): + if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]: + srl_files = srl_file_sizes[i:] + break + + for file_info in srl_files: + # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4]) + srl_gen = 
srl_sentences_generator(file_info[1], file_info[3], sentence_id) + el = next(srl_gen) + while el is not None: + yield el + el = next(srl_gen) + + yield None + +error_sentences_grouped = [] +group = False +prev_name = '' +# group sentences by their files +for name in error_sentences: + if name[:9] == prev_name: + group.append(name) + else: + prev_name = name[:9] + if group: + error_sentences_grouped.append(group) + group = [name] +error_sentences_grouped.append(group) + +srl_gen = srl_error_fix_generator(INPATH) + +# find errors in json files: +# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output: +# sentence_ids = pickle.load(output) +# +# +# +# origfiles = [] +# for subdir, dirs, files in os.walk(OUTPATH): +# for file in files: +# origfiles.append(Path(os.path.join(subdir, file))) +# origfiles=sorted(origfiles) +# +# +# +# for sent in origfiles: +# # for sent in sentence_ids: +# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json') +# outfile = sent +# +# try: +# with outfile.open() as json_file: +# json.load(json_file) +# pass +# except: +# print(outfile.name) +# +# +# raise Exception('test') +# iterate over all wronged sentences and fix them +for errors_in_file in error_sentences_grouped: + outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json') + with outfile.open() as json_file: + print(outfile.name) + orig_json_data = json.load(json_file) + for error_sentence in errors_in_file: + orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data) + + with outfile.open('w') as json_file: + json.dump(orig_json_data, json_file) + logging.info("SRL relations written to: {}".format(outfile)) diff --git a/tools/parse_all.py b/tools/parse_all.py index 3b56a18..86c3caf 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -1,3 +1,5 @@ +import pickle + from parser.parser import Parser import os from os.path import join, dirname @@ -15,8 +17,21 @@ par = Parser() # path to data config = configparser.ConfigParser() config.read("tools.cfg") -INDIR = Path(config["tools"]["kres_orig"]) -OUTDIR = Path(config["tools"]["kres_tsv"]) +analysis = '' +if 'kres_orig' in config["tools"]: + analysis = 'kres' + INDIR = Path(config["tools"]["kres_orig"]) + OUTDIR = Path(config["tools"]["kres_tsv"]) +elif 'giga_orig' in config["tools"]: + # analysis = 'gigafida' + analysis = 'giga' + INDIR_GIGA_ORIG = Path(config["tools"]["giga"]) + INDIR_GIGA = Path(config["tools"]["giga_orig"]) + INDIR_JOS = Path(config["tools"]["giga_jos"]) + OUTDIR = Path(config["tools"]["giga_tsv"]) + GIGA_PARTS = int(config["tools"]["giga_parts"]) + INTERNAL_DATA = config["tools"]["internal_data"] + CPU_CORES = int(config["tools"]["cpu_cores"]) LOGFILE = Path(config["tools"]["logfile"]).absolute() @@ -36,8 +51,9 @@ print("end parsing ssj") # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" OUTDIR.mkdir(exist_ok=True) -infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) -logging.info("Parsing kres: {} files.".format(len(infiles))) +if analysis == 'kres': + infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) + logging.info("Parsing kres: {} files.".format(len(infiles))) def handle_file(infile): i = infile[0] @@ -65,10 +81,297 @@ def handle_file(infile): return True return False -with Pool(CPU_CORES) as p: - p.map(handle_file, infiles) +def giga_orig_generator(): + with open(INDIR_GIGA, 'r') as gof: + previous_new_line = False + for l_gof in gof: + if l_gof == '\n': + if previous_new_line: + continue + previous_new_line = True + elif previous_new_line: + 
previous_new_line = False + yield l_gof -logging.info("end parsing kres") +def handle_gigafida_file(): + """ + File that splits big text file into more minor files. Only split on empty lines. + """ + # with open(INDIR_GIGA, 'r') as gof: + # with open(INDIR_JOS, 'r') as gjf: + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + # pass + # num_lines = i + 1 + # print(num_lines) + num_lines = 1393184026 + # 1393184026 + # 1393184033 + # return + num_lines_per_part = num_lines / GIGA_PARTS + curr_part = 0 + gof_generator = giga_orig_generator() + # with open(INDIR_GIGA, 'r') as gof: + with open(INDIR_JOS, 'r') as gjf: + sentence = {} + sentence['tokens'] = [] + sentence['links'] = {} + if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)): + ignore_lines = True + wf = False + else: + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + ignore_lines = False + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + for i, l_gjf in enumerate(gjf): + l_gof = next(gof_generator) + if ignore_lines: + if i > num_lines_per_part * curr_part and l_gof == '\n': + if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))): + ignore_lines = False + # delete last file (probably not whole) + os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1))) + if ignore_lines: + print(curr_part) + curr_part += 1 + continue + else: + continue + l_gof_split = l_gof.split('\t') + l_gjf_split = l_gjf.split('\t') + + # if punctuation + if l_gof != '\n': + if l_gof_split[1][-1] == 'u': + # print(l_gjf_split) + sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) + else: + sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) + + sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) + + # if l_gof == '\n': + else: + if wf: + # print(i) + wf.write(par.to_conll_2009_SRL(sentence)) + sentence['tokens'] = [] + sentence['links'] = {} + # wf.flush() + # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': + if i > num_lines_per_part * (curr_part + 1): + curr_part += 1 + # if wf doesn't exist (first one) + if wf: + wf.close() + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + curr_part += 1 + wf.close() + +import time +def handle_giga_file(ran): + """ + File that splits big text file into more minor files. Only split on empty lines. 
+ """ + # with open(INDIR_GIGA, 'r') as gof: + # with open(INDIR_JOS, 'r') as gjf: + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + # pass + # num_lines = i + 1 + # print(num_lines) + num_lines = 1393184026 + # 1393184026 + # 1393184033 + # return + num_lines_per_part = num_lines / GIGA_PARTS + curr_part = 0 + gof_generator = giga_orig_generator() + # with open(INDIR_GIGA, 'r') as gof: + with open(INDIR_JOS, 'r') as gjf: + sentence = {} + sentence['tokens'] = [] + sentence['links'] = {} + wf = None + if curr_part in file_indices: + if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])): + os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])) + + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a') + + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + for i, l_gjf in enumerate(gjf): + l_gof = next(gof_generator) + if curr_part < ran[0]: + if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' : + if curr_part < ran[0]: + print(curr_part) + curr_part += 1 + continue + else: + continue + + l_gof_split = l_gof.split('\t') + l_gjf_split = l_gjf.split('\t') + + # if punctuation + if l_gof != '\n': + if curr_part not in file_indices: + continue + if l_gof_split[1][-1] == 'u': + # print(l_gjf_split) + sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) + else: + sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) + + sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) + + # if l_gof == '\n': + else: + if curr_part in file_indices: + wf.write(par.to_conll_2009_SRL(sentence)) + sentence['tokens'] = [] + sentence['links'] = {} + # wf.flush() + # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': + if i > num_lines_per_part * (curr_part + 1): + curr_part += 1 + # if wf doesn't exist (first one) + if curr_part in file_indices and wf: + wf.close() + if curr_part >= ran[1]: + break + if curr_part in file_indices: + if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)): + os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)) + + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + + curr_part += 1 + wf.close() + +def handle_giga_file_selected_sentences(error_sentences): + """ + File that splits big text file into more minor files. Only split on empty lines. 
+ """ + # with open(INDIR_GIGA, 'r') as gof: + # with open(INDIR_JOS, 'r') as gjf: + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + # pass + # num_lines = i + 1 + # print(num_lines) + # print('num_lines' + 3) + # num_lines = 1393184026 + num_lines = 1393222523 + # 1393184026 + # 1393184033 + # return + # num_lines_per_part = num_lines / GIGA_PARTS + # curr_part = 0 + gof_generator = giga_orig_generator() + # with open(INDIR_GIGA, 'r') as gof: + with open(INDIR_JOS, 'r') as gjf: + sentence = {} + sentence['tokens'] = [] + sentence['links'] = {} + wf = None + if os.path.exists(os.path.join(OUTDIR, 'giga_errors')): + os.remove(os.path.join(OUTDIR, 'giga_errors')) + + wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a') + with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file: + sentence_ids_list = pickle.load(pkl_file) + sentence_id = 0 + skip_sentence = not sentence_ids_list[sentence_id] in error_sentences + + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + for i, l_gjf in enumerate(gjf): + l_gof = next(gof_generator) + + + if l_gjf == '\n': + if not skip_sentence: + wf.write(par.to_conll_2009_SRL(sentence)) + sentence['tokens'] = [] + sentence['links'] = {} + sentence_id += 1 + if sentence_ids_list[sentence_id] in error_sentences: + print(sentence_ids_list[sentence_id]) + skip_sentence = False + else: + skip_sentence = True + + if skip_sentence: + continue + + + # if curr_part < ran[0]: + # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' : + # if curr_part < ran[0]: + # print(curr_part) + # curr_part += 1 + # continue + # else: + # continue + + l_gof_split = l_gof.split('\t') + l_gjf_split = l_gjf.split('\t') + + # if punctuation + if l_gof != '\n': + if l_gof_split[1][-1] == 'u': + # print(l_gjf_split) + sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) + else: + sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) + + sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) + + # if l_gof == '\n': + # wf.flush() + # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': + # if i > num_lines_per_part * (curr_part + 1): + # curr_part += 1 + # # if wf doesn't exist (first one) + # if curr_part in file_indices and wf: + # wf.close() + # if curr_part >= ran[1]: + # break + # if curr_part in file_indices: + # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)): + # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)) + # + # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + + # curr_part += 1 + wf.close() + +file_indices = set(range(0, 100000)) +with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file: + file_indices = set(pickle.load(pkl_file)) + +with Pool(CPU_CORES) as p: + if analysis == 'kres': + p.map(handle_file, infiles) + elif analysis == 'gigafida': + handle_gigafida_file() + elif analysis == 'giga': + final_range = [0, 100000] + size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES + # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)] + ranges = [] + ps = None + for i in range(CPU_CORES): + s = int(final_range[0] + size_per_proc * i) + ns = int(final_range[0] + size_per_proc * (i + 1)) + ranges.append([s, ns]) + # ranges = [[0, 1]] + + # p.map(handle_giga_file, ranges) + # p.map(handle_giga_file, ranges) + error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))] + 
handle_giga_file_selected_sentences(set(error_sentences)) + + +logging.info("end parsing kres") diff --git a/tools/parser/parser.py b/tools/parser/parser.py index d37b49c..80e353f 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -57,7 +57,10 @@ class Parser: divs = [] # in ssj, there are divs, in Kres, there are separate files if "id" in root.keys(): # Kres files start with - guess_corpus = "KRES" + if root.get("id")[0:2] == 'GF': + guess_corpus = "GIGA" + else: + guess_corpus = "KRES" divs = [root] else: guess_corpus = "SSJ" @@ -65,7 +68,10 @@ class Parser: # parse divs for div in divs: - f_id = div.get("id") + f_id = div.get("id")[:-6] + + if guess_corpus == "GIGA": + div = div.findall(".//body")[0] # parse paragraphs for p in div.findall(".//p"): @@ -75,46 +81,62 @@ class Parser: for s in p.findall(".//s"): s_id = s.get("id").split(".")[-1] sentence_text = "" + sentence_list = [] sentence_tokens = [] # parse tokens for el in s.iter(): if el.tag in self.W_TAGS: - el_id = el.get("id").split(".")[-1] - if el_id[0] == 't': - el_id = el_id[1:] # ssj W_TAG ids start with t - sentence_text += el.text - sentence_tokens += [( - "w", - int(el_id), - el.text, - el.get("lemma"), - (el.get("msd") if guess_corpus == "KRES" - else el.get("ana").split(":")[-1]), - )] + if guess_corpus != "GIGA": + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + sentence_tokens += [( + "w", + int(el_id), + el.text, + el.get("lemma"), + (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" + else el.get("ana").split(":")[-1]), + )] + else: + sentence_list.append(el.text) elif el.tag in self.C_TAGS: # only Kres' C_TAGS have ids - el_id = el.get("id") or "none" - el_id = el_id.split(".")[-1] - sentence_text += el.text - sentence_tokens += [("c", el_id, el.text,)] + if guess_corpus != "GIGA": + el_id = el.get("id") or "none" + el_id = el_id.split(".")[-1] + sentence_text += el.text + sentence_tokens += [("c", el_id, el.text,)] elif el.tag in self.S_TAGS: # Kres' doesn't contain .text - sentence_text += " " + if guess_corpus == "GIGA": + sentence_list.append(el.text) + else: + sentence_text += " " else: # pass links and linkGroups pass sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) if sentence_id in res_dict: raise KeyError("duplicated id: {}".format(sentence_id)) - res_dict[sentence_id] = { - "sid": sentence_id, - "text": sentence_text, - "tokens": sentence_tokens, - "links": ( - parse_links(s) if guess_corpus == "KRES" else None - ) - } + if guess_corpus == "GIGA": + res_dict[sentence_id] = { + "sid": sentence_id, + "text": ' '.join(sentence_list), + "tokens": None, + "links": None + } + else: + res_dict[sentence_id] = { + "sid": sentence_id, + "text": sentence_text, + "tokens": sentence_tokens, + "links": ( + parse_links(s) if guess_corpus == "KRES" else None + ) + } fp.close() return res_dict @@ -123,7 +145,7 @@ class Parser: def fillpred(tsv_row): mrow = build_model_row(tsv_row) - x = mrow[:-1] + x = mrow[:-1] y = self.fillpred_model.predict([x]) return y[0] # bool diff --git a/tools/srl-20131216/scripts/parse_srl_only_mod.sh b/tools/srl-20131216/scripts/parse_srl_only_mod.sh index a374127..7ed46b2 100755 --- a/tools/srl-20131216/scripts/parse_srl_only_mod.sh +++ b/tools/srl-20131216/scripts/parse_srl_only_mod.sh @@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM" NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST. 
-CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT" -echo "Executing: $CMD" +$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT" +# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'" +# echo "Executing: $CMD" -$CMD +# $CMD diff --git a/tools/srl-20131216/tag_all.gigafida.sh b/tools/srl-20131216/tag_all.gigafida.sh new file mode 100755 index 0000000..0a9f45b --- /dev/null +++ b/tools/srl-20131216/tag_all.gigafida.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# parsing tools.cfg values +IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)" +echo "input folder: $IN_FOLDER" +OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)" +echo "output folder: $OUT_FOLDER" + +SUFFIX="srl.tsv" + +mkdir -p "$OUT_FOLDER" +rm "$OUT_FOLDER"/*${SUFFIX} &> /dev/null + +for infile in "$IN_FOLDER"/*; do + echo "Tagging: ${infile}" + base=$(basename "$infile" | cut -d'.' -f1) + outfile="${OUT_FOLDER}/${base}.${SUFFIX}" + + # mate-tools tagger + ./scripts/parse_srl_only_mod.sh "$infile" "$outfile" + + if [ $? -eq 0 ]; then + echo "Saved as ${outfile}" + else + echo "ERR" + exit 1 + fi +done + diff --git a/tools/srl-20131216/tag_all.kres.sh b/tools/srl-20131216/tag_all.kres.sh new file mode 100755 index 0000000..ec6cc5c --- /dev/null +++ b/tools/srl-20131216/tag_all.kres.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# parsing tools.cfg values +IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)" +echo "input folder: $IN_FOLDER" +OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)" +echo "output folder: $OUT_FOLDER" + +SUFFIX="srl.tsv" + +mkdir -p $OUT_FOLDER +rm $OUT_FOLDER/*${SUFFIX} &> /dev/null + +for infile in $IN_FOLDER/*; do + echo "Tagging: ${infile}" + base=$(basename $infile | cut -d'.' -f1) + outfile=${OUT_FOLDER}/${base}.${SUFFIX} + + # mate-tools tagger + ./scripts/parse_srl_only_mod.sh $infile $outfile + + if [ $? 
-eq 0 ]; then + echo "Saved as ${outfile}" + else + echo "ERR" + exit 1 + fi +done + diff --git a/tools/srl-20131216/tag_all.sh b/tools/srl-20131216/tag_all.sh index ec6cc5c..06df810 100755 --- a/tools/srl-20131216/tag_all.sh +++ b/tools/srl-20131216/tag_all.sh @@ -1,15 +1,16 @@ #!/bin/bash # parsing tools.cfg values -IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)" +IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)" +IN_FOLDER=$IN_FOLDER$1 echo "input folder: $IN_FOLDER" -OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)" +OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)" echo "output folder: $OUT_FOLDER" SUFFIX="srl.tsv" mkdir -p $OUT_FOLDER -rm $OUT_FOLDER/*${SUFFIX} &> /dev/null +# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null for infile in $IN_FOLDER/*; do echo "Tagging: ${infile}" diff --git a/tools/tools.cfg b/tools/tools.cfg index 2751d43..fb538df 100644 --- a/tools/tools.cfg +++ b/tools/tools.cfg @@ -1,8 +1,18 @@ [tools] -kres_orig = /kres_mount/kres_parsed/tei -kres_tsv = ../data/kres_out/1_tsv -kres_srl = ../data/kres_out/2_srl -kres_json = ../data/kres_out/final_json +giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig +giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001 +; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup +giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001 +giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part +; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP +; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy +; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP +giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl +giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv +; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP +giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json +internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data +giga_parts = 100000 logfile = ../progress.log -cpu_cores = 5 +cpu_cores = 16 debug = False diff --git a/tools/tools.cfg.gigafida b/tools/tools.cfg.gigafida new file mode 100644 index 0000000..79b02b0 --- /dev/null +++ b/tools/tools.cfg.gigafida @@ -0,0 +1,16 @@ +[tools] +giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig +giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001 +; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup +giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001 +giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part +; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP +; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy +; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP +giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl +giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json +internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data +giga_parts = 100000 +logfile = ../progress.log +cpu_cores = 1 +debug = False diff --git a/tools/tools.cfg.kres b/tools/tools.cfg.kres new file mode 100644 index 0000000..af64ec1 --- /dev/null +++ b/tools/tools.cfg.kres @@ -0,0 +1,8 @@ +[tools] +kres_orig = /home/luka/Development/srl/data/kres_parsed/tei 
+kres_tsv = ../data/kres_out/1_tsv +kres_srl = ../data/kres_out/2_srl +kres_json = ../data/kres_out/final_json +logfile = ../progress.log +cpu_cores = 5 +debug = False diff --git a/tools/tools.cfg.kres_new b/tools/tools.cfg.kres_new new file mode 100644 index 0000000..803d939 --- /dev/null +++ b/tools/tools.cfg.kres_new @@ -0,0 +1,8 @@ +[tools] +kres_orig = /home/luka/Development/srl/data/kres_parsed/tei +giga_tsv = ../data/giga_out/1_tsv +giga_srl = ../data/giga_out/2_srl +kres_json = ../data/giga_out/final_json +logfile = ../progress.log +cpu_cores = 5 +debug = False
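A minimal consumer sketch for the final_json files written by gen_json.py / gen_json_fix_errors.py above. The file name below is hypothetical (it follows the <sid prefix>-dedup.json pattern used in gen_json_fix_errors.py); the layout it assumes is what handle_file dumps: a map from sentence ID (e.g. GF0014802.2685.7) to a list of {"arg", "from", "dep"} records, where "from" is the predicate's token index and "dep" the argument's token index.

import json
from pathlib import Path

def load_srl_relations(json_path):
    # One final_json file: {sentence_id: [{"arg": ..., "from": ..., "dep": ...}, ...]}
    with Path(json_path).open() as fp:
        return json.load(fp)

if __name__ == "__main__":
    # hypothetical file name, for illustration only
    relations = load_srl_relations("GF0014802-dedup.json")
    for sid, rels in relations.items():
        for rel in rels:
            # rel["arg"]: SRL role label; rel["from"]: predicate token index; rel["dep"]: argument token index
            print(sid, rel["arg"], rel["from"], rel["dep"])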