added parallel json output creation

parent 406e88ade8
commit 044fae2001
Makefile (8 changed lines):

@@ -1,15 +1,15 @@
 .PHONY: tsv_files srl_tagged_files json_files env clean

-all: json_files
+all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files

-json_files: srl_tagged_files
+json_files: # srl_tagged_files
 	cd tools; python3 gen_json.py

-srl_tagged_files: tsv_files
+srl_tagged_files: # tsv_files
 	# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
 	cd tools/srl-20131216; ./tag_all.sh

-tsv_files: tools/fillpred_model/model.pickle
+tsv_files: # tools/fillpred_model/model.pickle
 	cd tools; python3 parse_all.py

 tools/fillpred_model/model.pickle:
Dockerfile:

@@ -5,8 +5,9 @@ RUN apt-get install -y \
 	vim \
 	default-jdk \
 	python3 \
-	python3-pip \
-	sshfs
+	python3-pip
+
+RUN apt-get install -y sshfs

 RUN pip3 install lxml pandas sklearn

tools/gen_json.py:

@@ -4,6 +4,7 @@ import configparser
 import json
 import sys
 import logging
+from multiprocessing import Pool

 # parse config
 config = configparser.ConfigParser()
@@ -12,90 +13,102 @@ ORIGPATH = Path(config["tools"]["kres_orig"])
 INPATH = Path(config["tools"]["kres_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
 DEBUG = config["tools"]["debug"] == "True"
-LOGFILE = Path(config["tools"]["logfile"]).absolute().resolve()
+CPU_CORES = int(config["tools"]["cpu_cores"])
+
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()

 logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)


 def get_origfile(filename):
     for origfile in ORIGPATH.iterdir():
         if filename.name.split('.')[0] == origfile.name.split('.')[0]:
             return origfile
     raise FileNotFoundError

 def extract_sentences(line_reader):
     acc = []
     # last char in line is \n, remove it
     for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
         if len(line) == 1:  # empty line
             tmp = acc
             acc = []
             yield tmp
         else:
             acc.append(line)

 def to_sentence(sentence_arr):
     return " ".join([token[1] for token in sentence_arr])

 def match_sentence_id(sentence, orig_dict):
     for k, e in orig_dict.items():
         orig_sentence = " ".join(token[2] for token in e["tokens"])
         if sentence == orig_sentence:
             return k
     raise KeyError

 def get_dep_rel(token):
     logging.debug(token)
     for i, field in enumerate(token[14:]):
         if field != "_":
             return {
                 "arg": field,
                 "from": i,  # i-th predicate in sentence
                 "dep": token[0],
             }
     return None

+def handle_file(infile_tpl):
+    i = infile_tpl[0]
+    infile = infile_tpl[1]
+    outfile = (OUTPATH / infile.name).with_suffix(".json")
+    origfile = get_origfile(infile)
+    orig_dict = par.parse_tei(origfile)
+
+    with infile.open("rb") as fp:
+        outdata = {}
+        for sentence_arr in extract_sentences(fp.readlines()):
+            # tsv dropped sentence ids, match the ID, using original data
+            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
+
+            outdata[sid] = []
+
+            # find all predicate indices in the sentence
+            predicates = []
+            for token in sentence_arr:
+                if token[12] == "Y":
+                    predicates += [token[0]]  # idx
+
+                deprel = get_dep_rel(token)
+                if deprel is not None:
+                    outdata[sid].append(deprel)
+
+            # deprel["from"] points to n-th predicate
+            # replace with predicate's token index
+            for deprel in outdata[sid]:
+                deprel["from"] = predicates[deprel["from"]]
+
+            if DEBUG:
+                print(to_sentence(sentence_arr))
+                print(outdata[sid])
+                print(sid)
+                print()
+                print()
+
+    with outfile.open("w") as fp:
+        json.dump(outdata, fp)
+    logging.info("SRL relations written to: {}".format(outfile))
+

 # main
 par = Parser()
 OUTPATH.mkdir(exist_ok=True)

-logging.info("Start generating .josn files.")
-for infile in [x for x in INPATH.iterdir() if x.is_file()]:
-    origfile = get_origfile(infile)
-    orig_dict = par.parse_tei(origfile)
-
-    with infile.open("rb") as fp:
-        outdata = {}
-        for sentence_arr in extract_sentences(fp.readlines()):
-            # tsv dropped sentence ids, match the ID, using original data
-            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
-
-            outdata[sid] = []
-
-            # find all predicate indices in the sentence
-            predicates = []
-            for token in sentence_arr:
-                if token[12] == "Y":
-                    predicates += [token[0]]  # idx
-
-                deprel = get_dep_rel(token)
-                if deprel is not None:
-                    outdata[sid].append(deprel)
-
-            # deprel["from"] points to n-th predicate
-            # replace with predicate's token index
-            for deprel in outdata[sid]:
-                deprel["from"] = predicates[deprel["from"]]
-
-            if DEBUG:
-                print(to_sentence(sentence_arr))
-                print(outdata[sid])
-                print(sid)
-                print()
-                print()
-
-    outfile = (OUTPATH / infile.name).with_suffix(".json")
-    with outfile.open("w") as fp:
-        json.dump(outdata, fp)
-    logging.info("SRL relations written to: ", outfile)
+infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
+
+with Pool(CPU_CORES) as p:
+    p.map(handle_file, infiles)
+
+logging.info("Finished generating .json files.")
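The heart of the commit is the switch from a sequential per-file loop to a multiprocessing.Pool fan-out, with handle_file as the worker. A minimal self-contained sketch of the same pattern, where work and inputs are hypothetical stand-ins for handle_file and the enumerated tsv file list:

# Sketch of the Pool fan-out pattern used by the new main block.
# `work` and `inputs` are placeholders, not code from this commit.
from multiprocessing import Pool

def work(item):
    i, name = item  # enumerate() yields (index, value) tuples
    return "{}: processed {}".format(i, name)

if __name__ == "__main__":
    inputs = list(enumerate(["a.tsv", "b.tsv", "c.tsv"]))
    with Pool(2) as p:  # worker count, cf. CPU_CORES in gen_json.py
        print(p.map(work, inputs))  # blocks until every item is done

One caveat: handle_file reads module-level state (par, ORIGPATH, OUTPATH), which worker processes inherit under the fork start method, the Linux default; under a spawn start method the script would also need an if __name__ == "__main__": guard around the pool setup.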
config file:

@@ -4,5 +4,5 @@ kres_tsv = ../data/kres_out/1_tsv
 kres_srl = ../data/kres_out/2_srl
 kres_json = ../data/kres_out/final_json
 logfile = ../progress.log
-cpu_cores = 5
+cpu_cores = 1
 debug = False
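For reference, the new cpu_cores key is what gen_json.py turns into CPU_CORES. A minimal sketch of that lookup, assuming the keys above live under the [tools] section that gen_json.py indexes; the file name "tools.cfg" is a placeholder, since this view does not show the config file's actual name:

# Sketch of the config lookup in gen_json.py; "tools.cfg" is a
# placeholder name, the [tools] section matches config["tools"] usage.
import configparser

config = configparser.ConfigParser()
config.read("tools.cfg")
CPU_CORES = int(config["tools"]["cpu_cores"])  # "1" -> 1
DEBUG = config["tools"]["debug"] == "True"     # values are strings, not booleans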