"""Look up the sentence ID of every sentence found in the SRL-tagged files.

For each file in ``INPATH`` the file with the same base name is located in
``ORIGPATH`` and parsed with ``Parser.parse_tei``; every sentence read from
the SRL file is then matched, by its space-joined tokens, against the
parsed entries to recover the sentence ID.
"""
import configparser
from pathlib import Path

from parser.parser import Parser

# parse config; the literal paths below are the defaults used when
# tools.cfg, its [tools] section, or an individual key is missing
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(
    config.get("tools", "kres_orig", fallback="../data/kres_example")
)  # we need the IDs
INPATH = Path(
    config.get("tools", "kres_srl", fallback="../data/kres_example_srl")
)
OUTPATH = Path(
    config.get("tools", "kres_json", fallback="../data/kres_example_json")
)


def get_origfile(filename):
    """Return the file in ORIGPATH whose base name matches *filename*.

    The base name is the part of the file name before the first ``'.'``.

    Args:
        filename: Path of the file to find the counterpart for.

    Returns:
        The first matching path yielded by ``ORIGPATH.iterdir()``.

    Raises:
        FileNotFoundError: If no file in ORIGPATH has the same base name.
    """
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no file matching {!r} in {}".format(wanted, ORIGPATH)
    )


def extract_sentences(line_reader):
    """Yield sentences from an iterable of UTF-8 encoded, tab-separated lines.

    A blank line terminates the current sentence. Each sentence is a list of
    token rows, and each row is the list of tab-separated fields of one line
    (the last field keeps its trailing newline).

    Args:
        line_reader: Iterable of ``bytes`` lines, e.g. a file opened in
            binary mode.

    Yields:
        One list of token rows per sentence.
    """
    acc = []
    for raw in line_reader:
        line = raw.decode("utf-8").split('\t')
        if line[0] in ('\n', '\r\n'):
            yield acc
            acc = []
        else:
            acc.append(line)
    # Do not lose the final sentence when the input lacks a trailing
    # blank line.
    if acc:
        yield acc


def match_sentence_id(string, rd):
    """Return the key of the entry in *rd* whose tokens match the sentence.

    Both sides are compared as space-joined strings: field 1 of every token
    row in *string* against field 2 of every token in ``entry["tokens"]``.

    Args:
        string: Sentence as produced by ``extract_sentences`` (a list of
            token rows).
        rd: Mapping of sentence ID to an entry with a ``"tokens"`` sequence
            (presumably the result of ``Parser.parse_tei`` -- confirm
            against the parser).

    Returns:
        The key of the first matching entry.

    Raises:
        KeyError: If no entry matches the sentence.
    """
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    raise KeyError("no sentence id found for {!r}".format(str1))


def main():
    """Print the sentence ID of every sentence in every file under INPATH."""
    par = Parser()
    for infile in INPATH.iterdir():
        if not infile.is_file():
            continue
        origfile = get_origfile(infile)
        rd = par.parse_tei(origfile)

        # The context manager closes the file even if matching fails.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp):
                sid = match_sentence_id(sentence_arr, rd)
                print(sid)
                # OK, we got the sentence id, now generate the predicate map!

        # TODO: outfile is computed but nothing is written to it yet.
        outfile = (OUTPATH / infile.name).with_suffix(".json")


if __name__ == "__main__":
    main()