#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# parse config
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
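
# For reference, a minimal sketch of what "tools.cfg.ssj500k2.3" is expected to
# contain, inferred only from the keys read above; the values are hypothetical
# placeholders, not the project's actual paths:
#
#   [tools]
#   ssj500k_orig_folder = /data/ssj500k/orig
#   ssj500k_srl = /data/ssj500k/srl_tsv
#   ssj500k_json = /data/ssj500k/json
#   internal_data = /data/internal
#   debug = False
#   cpu_cores = 4
#   logfile = tools.log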


def get_origfile(filename):
    # Match an SRL tsv file to its original file in ORIGPATH by the shared
    # basename (the part of the filename before the first dot).
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError(filename.name)


def extract_sentences(line_reader):
    # Group tab-split token rows into sentences; sentences are separated by
    # empty lines, and the input is assumed to end with an empty line.
    acc = []
    # last char in line is \n, remove it
    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
        if len(line) == 1:  # empty line
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
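
# Usage sketch for extract_sentences (hypothetical byte rows; two one-token
# sentences separated by empty lines, as in the tsv input):
#
#   rows = [b"1\tPrvi\n", b"\n", b"2\tDrugi\n", b"\n"]
#   list(extract_sentences(rows))
#   # -> [[["1", "Prvi"]], [["2", "Drugi"]]]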


def to_sentence(sentence_arr):
    # Column 1 of each token row holds the surface form.
    return " ".join([token[1] for token in sentence_arr])


def match_sentence_id(sentence, orig_dict):
    # The tsv files carry no sentence ids, so recover the id by comparing the
    # detokenized sentence against the original tokens (column 2).
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)


def match_sentence_id_giga(sentence, orig_dict):
    # Variant of match_sentence_id for inputs whose entries store the raw
    # sentence text directly under "text".
    for k, e in orig_dict.items():
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == e["text"]:
            return k
    raise KeyError(sentence)


def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
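
# Column layout the code above relies on (only these indices are used; the
# remaining tsv columns are ignored):
#
#   token[0]   token index within the sentence
#   token[1]   surface form (used by to_sentence)
#   token[12]  "Y" if the token is a predicate
#   token[14:] one argument-label column per predicate, "_" if none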


def handle_file_old(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to the n-th predicate,
            # replace it with the predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def handle_file(whole_input):
    # sentence_id = whole_input[0][3]
    # orig_infile = whole_input[0][1]
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]

    # origfile = origfiles[0][1]
    # infile_tpl = infile_tpl[0]
    # i = infile_tpl[0]
    # infile = infile_tpl[1]
    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")

    if outfile.exists():
        return
    # origfile = get_origfile()
    orig_dict = par.parse_tei(orig_infile)
    outdata = {}

    gen = srl_multiple_files_sentences_generator(sentence_id)
    # gen = srl_multiple_files_sentences_generator(whole_input[1])

    # Note: nothing below increments this counter yet; the mismatch report at
    # the end only triggers if that changes.
    mismatch_sentences = 0

    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
        if orig_id == 'GF0014802.2685.7':
            print('PAUSE')

        # look at neighbouring sentences if they are correct
        sentence, sentence_arr = next(gen)
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        assert sentence.replace(' ', '') == orig_val['text']
        # if i != 10 and i != 0:
        # print('OK!')
        sid = orig_id

        outdata[sid] = []

        # find all predicate indices in the sentence
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates += [token[0]]  # idx

            deprel = get_dep_rel(token)
            if deprel is not None:
                outdata[sid].append(deprel)

        # deprel["from"] points to the n-th predicate,
        # replace it with the predicate's token index
        for deprel in outdata[sid]:
            deprel["from"] = predicates[deprel["from"]]

        if DEBUG:
            print(to_sentence(sentence_arr))
            print(outdata[sid])
            print(sid)
            print()
            print()

    if mismatch_sentences > 0:
        if mismatch_sentences / len(orig_dict.items()) < 0.1:
            print('Slight mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated %d' % mismatch_sentences)
            print('------------------------------------------------')
        else:
            print('ERROR')
            print('Big mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated errors:')
            print(mismatch_sentences)
            print('------------------------------------------------')

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def count_orig_file_sentences(filename):
    # filename is a (index, Path) tuple; cache the sentence count per file in
    # INTERNAL_DATA/orig_chunks so reruns can skip the expensive TEI parse.
    if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
        return
    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    # return filename[0], filename[1], len(orig_dict)
    with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)


def count_srl_file_sentences(filename):
    # filename is a (index, Path) tuple; count sentences by counting the empty
    # separator lines and cache the result in INTERNAL_DATA/srl_chunks.
    if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
        return

    print(filename[0])
    num_sentences = 0
    with filename[1].open("r") as fp:
        for line in fp:
            if line == '\n':
                num_sentences += 1

    # return filename[0], filename[1], num_sentences
    with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)


def srl_sentences_generator(infile, curr_index, sen_start_index):
    # Yield (detokenized sentence, token rows) pairs from one SRL tsv file,
    # skipping sentences until sen_start_index is reached; a trailing None
    # signals that the file is exhausted.
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None


def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    # Walk the module-level srl_file_sizes list of (index, path, n_sentences,
    # cumulative_offset) tuples, starting a little before sentence_id, and
    # chain the per-file generators; a final None signals exhaustion.
    sentence_id = max(0, sentence_id - 10)
    for i, srl_file in enumerate(srl_file_sizes):
        if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)

    yield None
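
# Consumption sketch: handle_file() pulls sentences with next() and stops once
# the original file's sentences are exhausted, e.g. (variable name illustrative):
#
#   gen = srl_multiple_files_sentences_generator(first_sentence_offset)
#   sentence, sentence_arr = next(gen)   # None only after all files are read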


# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)

infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))

##### REMOVE ############
# origfiles = origfiles[:3]

# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_orig_file_sentences, origfiles)
    for i in range(len(origfiles)):
        count_orig_file_sentences(origfiles[i])

    orig_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                orig_file_sizes.append(pickle.load(pkl_small_file))
    # orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))

    print("Sorting orig files")
    orig_file_sizes = sorted(orig_file_sizes)

    total_size = 0
    orig_file_sizes_final = []
    print("Calculating orig files size")
    for n, pa, si in orig_file_sizes:
        orig_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    orig_file_sizes = orig_file_sizes_final

    print("Saving orig files size")
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(orig_file_sizes, output)
    print("Orig files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
        orig_file_sizes = pickle.load(pkl_file)
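
# At this point orig_file_sizes is a list of
# (file_index, path, sentence_count, cumulative_sentence_offset) tuples; the
# offset is what handle_file receives as whole_input[3] and what the srl
# generators use to line the two sentence streams up.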

# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_srl_file_sentences, infiles)
    for i in range(len(infiles)):
        count_srl_file_sentences(infiles[i])

    srl_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                srl_file_sizes.append(pickle.load(pkl_small_file))

    print("Sorting srl files")
    srl_file_sizes = sorted(srl_file_sizes)

    total_size = 0
    srl_file_sizes_final = []
    print("Calculating srl files size")
    for n, pa, si in srl_file_sizes:
        srl_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    srl_file_sizes = srl_file_sizes_final

    print("Saving srl files size")
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(srl_file_sizes, output)
    print("Srl files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
        srl_file_sizes = pickle.load(pkl_file)

# print(len(orig_file_sizes))
# print('asd' + 2)

# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
#     interesting_srl_files = []
#     # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
#     # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
#     #         srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
#     while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
#         # if beginning of file is in
#         if srl_file[3] > orig_first_sent_i:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
#             # print('if %d' % srl_file[3])
#         else:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
#             # print('else %d' % orig_first_sent_i)
#
#         if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
#             srl_i += 1
#             if srl_i < len(srl_file_sizes):
#                 srl_file = srl_file_sizes[srl_i]
#             else:
#                 break
#             # print(srl_i)
#             # print('a ' + 2)
#         else:
#             break
#
#     inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
#     print(inputs[-1])


# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)

print('beginning processing')
with Pool(CPU_CORES) as p:
    # p.map(handle_file, inputs)
    p.map(handle_file, orig_file_sizes)

# for of in orig_file_sizes:
#     handle_file(of)

logging.info("Finished generating .json files.")