"""Convert SRL .tsv files (Kres corpus) to per-file .json dependency data.

Sentence ids were dropped from the .tsv export, so each sentence is matched
back against the parsed original TEI corpus to recover its id.
"""
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# --- configuration -------------------------------------------------------
# All paths and knobs come from tools.cfg; a missing key raises KeyError early.
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])   # original (TEI) corpus files
INPATH = Path(config["tools"]["kres_srl"])      # SRL .tsv input files
OUTPATH = Path(config["tools"]["kres_json"])    # .json output directory
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)


def get_origfile(filename):
    """Return the file in ORIGPATH whose name stem matches *filename*'s stem.

    Raises FileNotFoundError when no original file matches.
    """
    stem = filename.name.split(".")[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split(".")[0] == stem:
            return origfile
    raise FileNotFoundError("no original file found for {}".format(filename))


def extract_sentences(line_reader):
    """Yield sentences (lists of tab-split token rows) from raw byte lines.

    Sentences are separated by empty lines.  Fixes over the original:
    a trailing sentence with no final blank line is still yielded, a final
    line without "\\n" is no longer truncated by one character, and
    consecutive blank lines no longer produce empty sentences.
    """
    acc = []
    for raw in line_reader:
        fields = raw.decode("utf-8").rstrip("\n").split("\t")
        if len(fields) == 1:
            # blank line terminates the current sentence
            if acc:
                yield acc
                acc = []
        else:
            acc.append(fields)
    if acc:
        # file ended without a trailing blank line — don't drop the sentence
        yield acc


def to_sentence(sentence_arr):
    """Join the surface forms (column 1) of a sentence's token rows."""
    return " ".join([token[1] for token in sentence_arr])


def match_sentence_id(sentence, orig_dict):
    """Return the sentence id from *orig_dict* whose token forms equal *sentence*.

    The .tsv export dropped sentence ids, so we linearly search the parsed
    original corpus.  Raises KeyError when no sentence matches.
    """
    for sid, entry in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in entry["tokens"])
        if sentence == orig_sentence:
            return sid
    raise KeyError("no sentence id found for: {}".format(sentence))


def get_dep_rel(token):
    """Return the SRL relation encoded in *token*, or None if it has none.

    Columns 14+ hold one column per predicate in the sentence; the first
    non-"_" cell is this token's argument label for the i-th predicate.
    """
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate; resolved to a token index later
                "dep": token[0],
            }
    return None


def handle_file(infile_tpl):
    """Convert one SRL .tsv file to a .json file in OUTPATH.

    *infile_tpl* is an (index, Path) pair as produced by enumerate(); the
    index is unused but kept for interface compatibility with Pool.map.
    """
    _, infile = infile_tpl
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        # iterate the file directly instead of materializing readlines()
        for sentence_arr in extract_sentences(fp):
            # tsv dropped sentence ids, match the ID using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []

            predicates = []  # token indices of predicates, in sentence order
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to the n-th predicate — replace it with
            # that predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


# --- main ----------------------------------------------------------------
# NOTE(review): worker processes read the module-level `par` global, which
# relies on fork-style process start; spawn-based platforms (Windows/macOS)
# would need an `if __name__ == "__main__":` guard plus a Pool initializer
# creating `par` per worker — confirm target platform before restructuring.
par = Parser()
OUTPATH.mkdir(exist_ok=True)

infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)

logging.info("Finished generating .json files.")