added parallel json output creation

master
voje 5 years ago
parent 406e88ade8
commit 044fae2001

@ -1,15 +1,15 @@
.PHONY: tsv_files srl_tagged_files json_files env clean
all: json_files
all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
json_files: srl_tagged_files
json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: tsv_files
srl_tagged_files: # tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
tsv_files: tools/fillpred_model/model.pickle
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py
tools/fillpred_model/model.pickle:

@ -5,8 +5,9 @@ RUN apt-get install -y \
vim \
default-jdk \
python3 \
python3-pip \
sshfs
python3-pip
RUN apt-get install -y sshfs
RUN pip3 install lxml pandas sklearn

@ -4,6 +4,7 @@ import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
@ -12,10 +13,13 @@ ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
LOGFILE = Path(config["tools"]["logfile"]).absolute().resolve()
CPU_CORES = int(config["tools"]["cpu_cores"])
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# Configure file logging.
# The log file is created before its path is resolved.
# NOTE(review): presumably because Path.resolve() raises on a missing file
# on older Python 3 versions -- confirm against the deployed interpreter.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# Path.resolve() returns a new path instead of modifying the object in place,
# so the result has to be assigned back for it to have any effect.
LOGFILE = LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
@ -55,12 +59,10 @@ def get_dep_rel(token):
}
return None
par = Parser()
OUTPATH.mkdir(exist_ok=True)
logging.info("Start generating .josn files.")
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
@ -94,8 +96,19 @@ for infile in [x for x in INPATH.iterdir() if x.is_file()]:
print()
print()
outfile = (OUTPATH / infile.name).with_suffix(".json")
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: ", outfile)
logging.info("SRL relations written to: {}".format(outfile))
# main
# Module-level parser instance; handle_file uses it through the name `par`.
par = Parser()
OUTPATH.mkdir(exist_ok=True)

# handle_file takes (index, path) tuples, so every regular file found in
# INPATH is paired with its position in the listing.
srl_paths = [entry for entry in INPATH.iterdir() if entry.is_file()]
infiles = list(enumerate(srl_paths))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

# Distribute the files over CPU_CORES worker processes; map() blocks until
# every file has been handled.
with Pool(CPU_CORES) as pool:
    pool.map(handle_file, infiles)
logging.info("Finished generating .json files.")

@ -4,5 +4,5 @@ kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
cpu_cores = 1
debug = False

Loading…
Cancel
Save