diff --git a/tools/parse_all.py b/tools/parse_all.py index 7415113..5d4410b 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -17,6 +17,7 @@ config = configparser.ConfigParser() config.read("tools.cfg") INDIR = Path(config["tools"]["kres_orig"]) OUTDIR = Path(config["tools"]["kres_tsv"]) +CPU_CORES = int(config["tools"]["cpu_cores"]) LOGFILE = Path(config["tools"]["logfile"]).absolute() LOGFILE.touch(exist_ok=True) @@ -32,27 +33,38 @@ ssj_dict = par.parse_tei(ssj_file) print("end parsing ssj") """ -logging.info("parsing kres") # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" OUTDIR.mkdir(exist_ok=True) infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) +logging.info("Parsing kres: {} files.".format(len(infiles))) def handle_file(infile): i = infile[0] kres_file = infile[1] + outfile = (OUTDIR / kres_file.name).with_suffix(".tsv") - logging.info("Processing file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) - res_dict = par.parse_tei(kres_file) - kres_out_str = "" + if outfile.is_file(): + logging.info("Skipping existing file: {}.".format(str(kres_file))) + return True - for _, sentence in res_dict.items(): - kres_out_str += par.to_conll_2009_SRL(sentence) + try: + res_dict = par.parse_tei(kres_file) + kres_out_str = "" + for _, sentence in res_dict.items(): + kres_out_str += par.to_conll_2009_SRL(sentence) + except: + logging.info("Failed processing file: {}".format(str(kres_file))) + return False - with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp: + + with outfile.open("wb+") as fp: fp.write(kres_out_str.encode("utf-8")) + logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) + return True + return False -with Pool(3) as p: +with Pool(CPU_CORES) as p: p.map(handle_file, infiles) diff --git a/tools/tools.cfg b/tools/tools.cfg index 9f32788..2751d43 100644 --- a/tools/tools.cfg +++ b/tools/tools.cfg @@ -4,4 +4,5 @@ kres_tsv = ../data/kres_out/1_tsv kres_srl = ../data/kres_out/2_srl kres_json = ../data/kres_out/final_json logfile = ../progress.log +cpu_cores = 5 debug = False