diff --git a/tools/parse_all.py b/tools/parse_all.py index 82f414b..7415113 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -7,6 +7,7 @@ import sys import cProfile import configparser import logging +from multiprocessing import Pool SSJ500K_2_1 = 27829 # number of sentences par = Parser() @@ -16,7 +17,10 @@ config = configparser.ConfigParser() config.read("tools.cfg") INDIR = Path(config["tools"]["kres_orig"]) OUTDIR = Path(config["tools"]["kres_tsv"]) -LOGFILE = Path(config["tools"]["logfile"]).absolute().resolve() + +LOGFILE = Path(config["tools"]["logfile"]).absolute() +LOGFILE.touch(exist_ok=True) +LOGFILE.resolve() logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) @@ -32,8 +36,11 @@ logging.info("parsing kres") # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" OUTDIR.mkdir(exist_ok=True) -infiles = [x for x in INDIR.iterdir() if x.is_file()] -for i, kres_file in enumerate(infiles): +infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) + +def handle_file(infile): + i = infile[0] + kres_file = infile[1] logging.info("Processing file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) res_dict = par.parse_tei(kres_file) @@ -44,4 +51,11 @@ for i, kres_file in enumerate(infiles): with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp: fp.write(kres_out_str.encode("utf-8")) + +with Pool(3) as p: + p.map(handle_file, infiles) + + logging.info("end parsing kres") + +