"""Convert Kres corpus files to TSV in parallel.

Settings are read from the ``[tools]`` section of ``tools.cfg``:

* ``kres_orig`` -- directory holding the input files,
* ``kres_tsv``  -- directory the ``.tsv`` output files are written to,
* ``cpu_cores`` -- number of worker processes,
* ``logfile``   -- file that receives the log output.

Every regular file in the input directory is parsed with
``Parser.parse_tei`` and each resulting sentence is formatted with
``Parser.to_conll_2009_SRL``. Files whose output already exists are skipped.
"""
import cProfile
import configparser
import logging
import os
import re
import sys
from multiprocessing import Pool
from os.path import join, dirname
from pathlib import Path

from parser.parser import Parser

SSJ500K_2_1 = 27829  # number of sentences

par = Parser()

# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled: parsing of the ssj500k sample.
# print("parsing ssj")
# ssj_file = "../data/ssj500k-sl.sample.xml"
# ssj_dict = par.parse_tei(ssj_file)
# # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
# print("end parsing ssj")

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)

# Kept at module level (not inside main()) because handle_file reads
# len(infiles) and worker processes started with the "spawn" method
# re-import this module instead of inheriting the parent's globals.
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))


def handle_file(infile):
    """Convert one input file to a ``.tsv`` file in ``OUTDIR``.

    Args:
        infile: ``(index, path)`` pair as produced by ``enumerate`` over the
            input files; the index is only used for the progress message.

    Returns:
        ``True`` if the output file already existed or was written,
        ``False`` if parsing or formatting raised an exception.
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception:
        # Best effort: one bad file must not abort the whole batch, but keep
        # the traceback in the log so the failure can be diagnosed.
        logging.exception("Failed processing file: %s", kres_file)
        return False

    # Written as bytes so newlines are not translated by the platform.
    outfile.write_bytes(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True


def main():
    """Process all input files with a pool of ``CPU_CORES`` workers."""
    logging.info("Parsing kres: {} files.".format(len(infiles)))

    with Pool(CPU_CORES) as p:
        results = p.map(handle_file, infiles)

    failed = results.count(False)
    if failed:
        logging.warning("Failed to process %d of %d files.", failed, len(infiles))

    logging.info("end parsing kres")


# The guard is required for multiprocessing start methods that re-import the
# main module in each worker ("spawn" / "forkserver"); without it every
# worker would try to start its own pool.
if __name__ == "__main__":
    main()