"""Convert tab-separated, per-token annotation files into JSON.

For every regular file in INPATH the script looks up the file with the same
base name in ORIGPATH, parses it with ``Parser.parse_tei`` to recover the
sentence IDs (the TSV files do not carry them), and writes a
``{sentence_id: [relation, ...]}`` mapping to a ``.json`` file of the same
name in OUTPATH.
"""
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys

# defaults, used when tools.cfg (or one of its options) is missing
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
DEBUG = False

# parse config; config.read() ignores a missing file, and every lookup
# falls back to the default defined above instead of raising KeyError
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config.get("tools", "kres_orig", fallback=str(ORIGPATH)))
INPATH = Path(config.get("tools", "kres_srl", fallback=str(INPATH)))
OUTPATH = Path(config.get("tools", "kres_json", fallback=str(OUTPATH)))
DEBUG = config.get("tools", "debug", fallback=str(DEBUG)) == "True"


def get_origfile(filename):
    """Return the entry of ORIGPATH whose base name matches *filename*.

    The base name is the part of the file name before the first dot.

    Args:
        filename: path object of the input file (only ``.name`` is used).

    Raises:
        FileNotFoundError: if ORIGPATH has no entry with that base name.
    """
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching {} in {}".format(filename.name, ORIGPATH)
    )


def extract_sentences(line_reader):
    """Yield sentences from an iterable of UTF-8 encoded TSV lines.

    Each yielded sentence is a list of tokens and each token is the list of
    tab-separated fields of one line. A line without any tab acts as a
    sentence separator.

    Args:
        line_reader: iterable of ``bytes`` lines (e.g. a file opened in
            binary mode, or the result of ``readlines()``).

    Yields:
        Non-empty lists of token field lists, in input order.
    """
    acc = []
    for raw_line in line_reader:
        # Strip only the line terminator; a final line without a trailing
        # newline must not lose its last character.
        fields = raw_line.decode("utf-8").rstrip("\r\n").split('\t')
        if len(fields) == 1:  # separator (empty) line
            if acc:  # ignore repeated separators
                yield acc
                acc = []
        else:
            acc.append(fields)
    # The input may end without a closing separator line; do not drop the
    # sentence that is still being accumulated.
    if acc:
        yield acc


def to_sentence(sentence_arr):
    """Join the second field of every token with single spaces."""
    return " ".join(token[1] for token in sentence_arr)


def match_sentence_id(sentence, orig_dict):
    """Return the key of the first entry in *orig_dict* spelling *sentence*.

    An entry matches when joining the third element of each of its
    ``"tokens"`` with single spaces equals *sentence*.

    Raises:
        KeyError: if no entry matches.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)


def get_dep_rel(token):
    """Return the relation described by *token*, or ``None`` if it has none.

    Fields from index 14 onwards are scanned; the first one that is not
    ``"_"`` produces a dict with the field value (``"arg"``), its position
    among those fields (``"from"``) and the token's first field (``"dep"``).

    NOTE(review): only the first non-"_" field is reported; a token filling
    several of these columns yields a single relation - confirm intended.
    """
    if DEBUG:
        print(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None


def _sentence_deprels(sentence_arr):
    """Collect the relations of one sentence with "from" resolved to an idx."""
    predicates = []  # first field (idx) of every token flagged "Y" in field 12
    deprels = []
    for token in sentence_arr:
        if token[12] == "Y":
            predicates.append(token[0])
        deprel = get_dep_rel(token)
        if deprel is not None:
            deprels.append(deprel)

    # deprel["from"] points to n-th predicate
    # replace with predicate's token index
    for deprel in deprels:
        deprel["from"] = predicates[deprel["from"]]
    return deprels


par = Parser()
# parents=True: the configured output directory may be nested in
# directories that do not exist yet
OUTPATH.mkdir(parents=True, exist_ok=True)

for infile in [x for x in INPATH.iterdir() if x.is_file()]:
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        # iterate the file lazily instead of loading every line up front
        for sentence_arr in extract_sentences(fp):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = _sentence_deprels(sentence_arr)

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    outfile = (OUTPATH / infile.name).with_suffix(".json")
    with outfile.open("w") as out_fp:
        json.dump(outdata, out_fp)