added parallel json output creation

Branch: master
voje committed 5 years ago
parent 406e88ade8 · commit 044fae2001
Makefile
@@ -1,15 +1,15 @@
 .PHONY: tsv_files srl_tagged_files json_files env clean

-all: json_files
+all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files

-json_files: srl_tagged_files
+json_files: # srl_tagged_files
 	cd tools; python3 gen_json.py

-srl_tagged_files: tsv_files
+srl_tagged_files: # tsv_files
 	# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
 	cd tools/srl-20131216; ./tag_all.sh

-tsv_files: tools/fillpred_model/model.pickle
+tsv_files: # tools/fillpred_model/model.pickle
 	cd tools; python3 parse_all.py

 tools/fillpred_model/model.pickle:

Dockerfile
@@ -5,8 +5,9 @@ RUN apt-get install -y \
 	vim \
 	default-jdk \
 	python3 \
-	python3-pip \
-	sshfs
+	python3-pip
+
+RUN apt-get install -y sshfs

 RUN pip3 install lxml pandas sklearn

tools/gen_json.py
@@ -4,6 +4,7 @@ import configparser
 import json
 import sys
 import logging
+from multiprocessing import Pool

 # parse config
 config = configparser.ConfigParser()
@@ -12,10 +13,13 @@ ORIGPATH = Path(config["tools"]["kres_orig"])
 INPATH = Path(config["tools"]["kres_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
 DEBUG = config["tools"]["debug"] == "True"
-LOGFILE = Path(config["tools"]["logfile"]).absolute().resolve()
-logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+CPU_CORES = int(config["tools"]["cpu_cores"])
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

 def get_origfile(filename):
     for origfile in ORIGPATH.iterdir():
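A note on the new logging block above: `Path.resolve()` returns a resolved copy of the path rather than modifying it in place, so the bare `LOGFILE.resolve()` call on its own line has no effect. A minimal sketch of a setup that keeps the resolved path (touching the file first, since strict `resolve()` can fail on a missing file in older Python 3 versions; the log path matches the `logfile` config value below):

    import logging
    from pathlib import Path

    LOGFILE = Path("../progress.log").absolute()
    LOGFILE.touch(exist_ok=True)   # make sure the file exists before resolving
    LOGFILE = LOGFILE.resolve()    # resolve() returns a new Path; keep the result
    logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)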
@@ -55,12 +59,10 @@ def get_dep_rel(token):
         }
     return None

-par = Parser()
-OUTPATH.mkdir(exist_ok=True)
-
-logging.info("Start generating .josn files.")
-for infile in [x for x in INPATH.iterdir() if x.is_file()]:
+def handle_file(infile_tpl):
+    i = infile_tpl[0]
+    infile = infile_tpl[1]
+    outfile = (OUTPATH / infile.name).with_suffix(".json")

     origfile = get_origfile(infile)
     orig_dict = par.parse_tei(origfile)
@@ -94,8 +96,19 @@ for infile in [x for x in INPATH.iterdir() if x.is_file()]:
             print()
             print()

-    outfile = (OUTPATH / infile.name).with_suffix(".json")
     with outfile.open("w") as fp:
         json.dump(outdata, fp)
-    logging.info("SRL relations written to: ", outfile)
+    logging.info("SRL relations written to: {}".format(outfile))
+
+# main
+par = Parser()
+OUTPATH.mkdir(exist_ok=True)
+
+infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
+
+with Pool(CPU_CORES) as p:
+    p.map(handle_file, infiles)
+
 logging.info("Finished generating .json files.")

@@ -4,5 +4,5 @@ kres_tsv = ../data/kres_out/1_tsv
 kres_srl = ../data/kres_out/2_srl
 kres_json = ../data/kres_out/final_json
 logfile = ../progress.log
-cpu_cores = 5
+cpu_cores = 1
 debug = False
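For reference, this is how `gen_json.py` consumes the keys above via `configparser` (the `[tools]` section name comes from the script; the file name here is a placeholder, since the config file's path is not shown in this view):

    import configparser

    config = configparser.ConfigParser()
    config.read("tools.cfg")  # placeholder name; actual path not shown here

    CPU_CORES = int(config["tools"]["cpu_cores"])  # INI values come back as strings
    DEBUG = config["tools"]["debug"] == "True"     # hence the explicit comparison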
