From 044fae2001f9971029e6ab12f0cecaf097d5364c Mon Sep 17 00:00:00 2001
From: voje
Date: Thu, 28 Feb 2019 23:37:47 +0100
Subject: [PATCH] added parallel json output creation

---
 Makefile                           |   8 +-
 dockerfiles/python-java/Dockerfile |   5 +-
 tools/gen_json.py                  | 155 ++++++++++++++++-----------
 tools/tools.cfg                    |   2 +-
 4 files changed, 92 insertions(+), 78 deletions(-)

diff --git a/Makefile b/Makefile
index c15183e..69ae834 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 .PHONY: tsv_files srl_tagged_files json_files env clean
 
-all: json_files
+all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
 
-json_files: srl_tagged_files
+json_files: # srl_tagged_files
 	cd tools; python3 gen_json.py
 
-srl_tagged_files: tsv_files
+srl_tagged_files: # tsv_files
 	# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
 	cd tools/srl-20131216; ./tag_all.sh
 
-tsv_files: tools/fillpred_model/model.pickle
+tsv_files: # tools/fillpred_model/model.pickle
 	cd tools; python3 parse_all.py
 
 tools/fillpred_model/model.pickle:
diff --git a/dockerfiles/python-java/Dockerfile b/dockerfiles/python-java/Dockerfile
index e2b3926..ab4d578 100644
--- a/dockerfiles/python-java/Dockerfile
+++ b/dockerfiles/python-java/Dockerfile
@@ -5,8 +5,9 @@ RUN apt-get install -y \
 vim \
 default-jdk \
 python3 \
-python3-pip \
-sshfs
+python3-pip
+
+RUN apt-get install -y sshfs
 
 RUN pip3 install lxml pandas sklearn
 
diff --git a/tools/gen_json.py b/tools/gen_json.py
index ad6aa30..628f597 100644
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@@ -4,6 +4,7 @@ import configparser
 import json
 import sys
 import logging
+from multiprocessing import Pool
 
 # parse config
 config = configparser.ConfigParser()
@@ -12,90 +13,102 @@ ORIGPATH = Path(config["tools"]["kres_orig"])
 INPATH = Path(config["tools"]["kres_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
 DEBUG = config["tools"]["debug"] == "True"
-LOGFILE = Path(config["tools"]["logfile"]).absolute().resolve()
+CPU_CORES = int(config["tools"]["cpu_cores"])
 
-logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
 
 def get_origfile(filename):
-	for origfile in ORIGPATH.iterdir():
-		if filename.name.split('.')[0] == origfile.name.split('.')[0]:
-			return origfile
-	raise FileNotFoundError
+    for origfile in ORIGPATH.iterdir():
+        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
+            return origfile
+    raise FileNotFoundError
 
 def extract_sentences(line_reader):
-	acc = []
-	# last char in line is \n, remove it
-	for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
-		if len(line) == 1:  # empty line
-			tmp = acc
-			acc = []
-			yield tmp
-		else:
-			acc.append(line)
+    acc = []
+    # last char in line is \n, remove it
+    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
+        if len(line) == 1:  # empty line
+            tmp = acc
+            acc = []
+            yield tmp
+        else:
+            acc.append(line)
 
 def to_sentence(sentence_arr):
-	return " ".join([token[1] for token in sentence_arr])
+    return " ".join([token[1] for token in sentence_arr])
 
 def match_sentence_id(sentence, orig_dict):
-	for k, e in orig_dict.items():
-		orig_sentence = " ".join(token[2] for token in e["tokens"])
-		if sentence == orig_sentence:
-			return k
-	raise KeyError
+    for k, e in orig_dict.items():
+        orig_sentence = " ".join(token[2] for token in e["tokens"])
+        if sentence == orig_sentence:
+            return k
+    raise KeyError
 
 def get_dep_rel(token):
-	logging.debug(token)
-	for i, field in enumerate(token[14:]):
-		if field != "_":
-			return {
-				"arg": field,
-				"from": i,  # i-th predicate in sentence
-				"dep": token[0],
-			}
-	return None
-
-
+    logging.debug(token)
+    for i, field in enumerate(token[14:]):
+        if field != "_":
+            return {
+                "arg": field,
+                "from": i,  # i-th predicate in sentence
+                "dep": token[0],
+            }
+    return None
+
+def handle_file(infile_tpl):
+    i = infile_tpl[0]
+    infile = infile_tpl[1]
+    outfile = (OUTPATH / infile.name).with_suffix(".json")
+    origfile = get_origfile(infile)
+    orig_dict = par.parse_tei(origfile)
+
+    with infile.open("rb") as fp:
+        outdata = {}
+        for sentence_arr in extract_sentences(fp.readlines()):
+            # tsv dropped sentence ids; match the ID using the original data
+            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
+
+            outdata[sid] = []
+
+            # find all predicate indices in the sentence
+            predicates = []
+            for token in sentence_arr:
+                if token[12] == "Y":
+                    predicates += [token[0]]  # idx
+
+                deprel = get_dep_rel(token)
+                if deprel is not None:
+                    outdata[sid].append(deprel)
+
+            # deprel["from"] points to the n-th predicate;
+            # replace it with the predicate's token index
+            for deprel in outdata[sid]:
+                deprel["from"] = predicates[deprel["from"]]
+
+            if DEBUG:
+                print(to_sentence(sentence_arr))
+                print(outdata[sid])
+                print(sid)
+                print()
+                print()
+
+    with outfile.open("w") as fp:
+        json.dump(outdata, fp)
+    logging.info("SRL relations written to: {}".format(outfile))
+
+
+# main
 par = Parser()
 OUTPATH.mkdir(exist_ok=True)
 
-logging.info("Start generating .josn files.")
-for infile in [x for x in INPATH.iterdir() if x.is_file()]:
-	origfile = get_origfile(infile)
-	orig_dict = par.parse_tei(origfile)
-
-	with infile.open("rb") as fp:
-		outdata = {}
-		for sentence_arr in extract_sentences(fp.readlines()):
-			# tsv dropped sentence ids, match the ID, using original data
-			sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
-
-			outdata[sid] = []
-
-			# find all predicate indices in the sentence
-			predicates = []
-			for token in sentence_arr:
-				if token[12] == "Y":
-					predicates += [token[0]]  # idx
-
-				deprel = get_dep_rel(token)
-				if deprel is not None:
-					outdata[sid].append(deprel)
-
-			# deprel["from"] points to n-th predicate
-			# replace with predicate's token index
-			for deprel in outdata[sid]:
-				deprel["from"] = predicates[deprel["from"]]
-
-			if DEBUG:
-				print(to_sentence(sentence_arr))
-				print(outdata[sid])
-				print(sid)
-				print()
-				print()
-
-	outfile = (OUTPATH / infile.name).with_suffix(".json")
-	with outfile.open("w") as fp:
-		json.dump(outdata, fp)
-	logging.info("SRL relations written to: ", outfile)
+infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
+
+with Pool(CPU_CORES) as p:
+    p.map(handle_file, infiles)
+    logging.info("Finished generating .json files.")
diff --git a/tools/tools.cfg b/tools/tools.cfg
index 33373c1..2e1c839 100644
--- a/tools/tools.cfg
+++ b/tools/tools.cfg
@@ -4,5 +4,5 @@ kres_tsv = ../data/kres_out/1_tsv
 kres_srl = ../data/kres_out/2_srl
 kres_json = ../data/kres_out/final_json
 logfile = ../progress.log
-cpu_cores = 5
+cpu_cores = 1
 debug = False
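
Reviewer note: the core change above replaces the sequential per-file loop with a
multiprocessing.Pool that maps handle_file over (index, path) tuples, so each
.tsv-to-.json conversion runs in its own worker process. Below is a minimal,
self-contained sketch of that fan-out pattern; the names convert_one and
work_dir are illustrative only, and CPU_CORES stands in for the tools.cfg value.

    # Minimal sketch of the Pool fan-out gen_json.py uses after this patch.
    # convert_one and work_dir are hypothetical names, not from this repo.
    from multiprocessing import Pool
    from pathlib import Path

    CPU_CORES = 4  # in the patch this is read from tools.cfg

    def convert_one(tpl):
        # Workers receive (index, path) tuples, mirroring handle_file's argument.
        i, infile = tpl
        # Stand-in for the real work (parse the input, emit a JSON file).
        return infile.with_suffix(".json").name

    if __name__ == "__main__":
        work_dir = Path(".")
        infiles = list(enumerate(p for p in work_dir.iterdir() if p.is_file()))
        with Pool(CPU_CORES) as pool:
            # map() blocks until every worker finishes and preserves input order.
            print(pool.map(convert_one, infiles))

Two properties of this design worth keeping in mind: handle_file reads the
module-level par = Parser(), which workers inherit under the default fork start
method on Linux but re-create by re-importing the module on platforms that
default to spawn (Windows, newer macOS Pythons), so module-level setup must be
import-safe; and several processes appending to the same logfile through
logging can interleave lines, which is tolerable for progress logging.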