#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# parse config
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE = LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
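
# For reference, a minimal sketch of the expected "tools.cfg.ssj500k2.3" file, based
# only on the keys read above. The paths and values below are hypothetical examples:
#
#   [tools]
#   ssj500k_orig_folder = /data/ssj500k/orig
#   ssj500k_srl = /data/ssj500k/srl_tsv
#   ssj500k_json = /data/ssj500k/srl_json
#   internal_data = /data/ssj500k/internal
#   debug = False
#   cpu_cores = 4
#   logfile = gen_json.log
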

def get_origfile(filename):
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError


def extract_sentences(line_reader):
    acc = []
    # the last char in each line is \n, remove it before splitting
    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
        if len(line) == 1:  # empty line ends a sentence
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)


def to_sentence(sentence_arr):
    return " ".join([token[1] for token in sentence_arr])


def match_sentence_id(sentence, orig_dict):
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError


def match_sentence_id_giga(sentence, orig_dict):
    for k, e in orig_dict.items():
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == e["text"]:
            return k
    raise KeyError


def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in the sentence
                "dep": token[0],
            }
    return None


def handle_file_old(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # the tsv dropped sentence ids; recover the ID by matching against the original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to the n-th predicate;
            # replace it with the predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def handle_file(whole_input):
    # sentence_id = whole_input[0][3]
    # orig_infile = whole_input[0][1]
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]
    # origfile = origfiles[0][1]
    # infile_tpl = infile_tpl[0]
    # i = infile_tpl[0]
    # infile = infile_tpl[1]
    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
    if outfile.exists():
        return
    # origfile = get_origfile()
    orig_dict = par.parse_tei(orig_infile)
    outdata = {}

    gen = srl_multiple_files_sentences_generator(sentence_id)
    # gen = srl_multiple_files_sentences_generator(whole_input[1])

    mismatch_sentences = 0

    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
        if orig_id == 'GF0014802.2685.7':
            print('PAUSE')  # debugging hook: look at the neighbouring sentences to check whether they are correct

        sentence, sentence_arr = next(gen)
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        # compare space-insensitively (spaces stripped on both sides), so spacing
        # differences between the tsv tokens and the original text do not break the check
        assert sentence.replace(' ', '') == orig_val['text'].replace(' ', '')
        # if i != 10 and i != 0:
        #     print('OK!')
        sid = orig_id
        outdata[sid] = []

        # find all predicate indices in the sentence
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates += [token[0]]  # idx

            deprel = get_dep_rel(token)
            if deprel is not None:
                outdata[sid].append(deprel)

        # deprel["from"] points to the n-th predicate;
        # replace it with the predicate's token index
        for deprel in outdata[sid]:
            deprel["from"] = predicates[deprel["from"]]

        if DEBUG:
            print(to_sentence(sentence_arr))
            print(outdata[sid])
            print(sid)
            print()
            print()

    # NOTE: mismatch_sentences is never incremented above, so this report only fires
    # if mismatch handling is added.
    if mismatch_sentences > 0:
        if mismatch_sentences / len(orig_dict.items()) < 0.1:
            print('Slight mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated %d' % mismatch_sentences)
            print('------------------------------------------------')
        else:
            print('ERROR')
            print('Big mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated errors:')
            print(mismatch_sentences)
            print('------------------------------------------------')

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def count_orig_file_sentences(filename):
    if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
        return
    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    # return filename[0], filename[1], len(orig_dict)
    with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)


def count_srl_file_sentences(filename):
    if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
        return
    print(filename[0])
    num_sentences = 0
    with filename[1].open("r") as fp:
        for line in fp:
            if line == '\n':
                num_sentences += 1
    # return filename[0], filename[1], num_sentences
    with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)


def srl_sentences_generator(infile, curr_index, sen_start_index):
    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None


def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    sentence_id = max(0, sentence_id - 10)
    srl_files = []  # stays empty if sentence_id lies past the last SRL chunk
    for i, srl_file in enumerate(srl_file_sizes):
        if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None

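
# Data layout assumed by the functions above (inferred from the indices used, not from a spec):
# the SRL tsv rows appear to follow a CoNLL-2009-like column order, i.e. token[0] is the token ID,
# token[1] the form, token[12] the fill-predicate flag ("Y" marks a predicate) and token[14:] one
# argument column per predicate; sentences are separated by blank lines. Each output JSON file maps
#   sentence_id -> [{"arg": <role label>, "from": <predicate token ID>, "dep": <argument token ID>}, ...]
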

# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)

infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))

##### REMOVE ############
# origfiles = origfiles[:3]

# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))

    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_orig_file_sentences, origfiles)
    for i in range(len(origfiles)):
        count_orig_file_sentences(origfiles[i])

    orig_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                orig_file_sizes.append(pickle.load(pkl_small_file))

    # orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
    print("Sorting orig files")
    orig_file_sizes = sorted(orig_file_sizes)
    total_size = 0
    orig_file_sizes_final = []
    print("Calculating orig files size")
    for n, pa, si in orig_file_sizes:
        orig_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    orig_file_sizes = orig_file_sizes_final
    print("Saving orig files size")
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(orig_file_sizes, output)
    print("Orig files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
        orig_file_sizes = pickle.load(pkl_file)

# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))

    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_srl_file_sentences, infiles)
    for i in range(len(infiles)):
        count_srl_file_sentences(infiles[i])

    srl_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                srl_file_sizes.append(pickle.load(pkl_small_file))

    print("Sorting srl files")
    srl_file_sizes = sorted(srl_file_sizes)
    total_size = 0
    srl_file_sizes_final = []
    print("Calculating srl files size")
    for n, pa, si in srl_file_sizes:
        srl_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    srl_file_sizes = srl_file_sizes_final
    print("Saving srl files size")
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(srl_file_sizes, output)
    print("Srl files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
        srl_file_sizes = pickle.load(pkl_file)
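
# At this point orig_file_sizes and srl_file_sizes each hold one tuple per file:
#   (file_index, path, num_sentences, first_sentence_offset)
# where first_sentence_offset is the cumulative sentence count of all preceding files.
# handle_file() and srl_multiple_files_sentences_generator() rely on elements [1], [2]
# and [3] of these tuples to align the SRL chunks with the original documents.
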
# print(len(orig_file_sizes))
# print('asd' + 2)

# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
#     interesting_srl_files = []
#     # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
#     # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
#     #         srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
#     while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
#         # if beginning of file is in range
#         if srl_file[3] > orig_first_sent_i:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
#             # print('if %d' % srl_file[3])
#         else:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
#             # print('else %d' % orig_first_sent_i)
#
#         if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
#             srl_i += 1
#             if srl_i < len(srl_file_sizes):
#                 srl_file = srl_file_sizes[srl_i]
#             else:
#                 break
#             # print(srl_i)
#             # print('a ' + 2)
#         else:
#             break
#
#     inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
#     print(inputs[-1])

# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)

print('beginning processing')
with Pool(CPU_CORES) as p:
    # p.map(handle_file, inputs)
    p.map(handle_file, orig_file_sizes)

# for of in orig_file_sizes:
#     handle_file(of)

logging.info("Finished generating .json files.")