cjvt-srl-tagging/tools/gen_json.py

from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# Parse the tool configuration. "tools.cfg" is a relative path, so it is
# looked up in the current working directory; every setting below comes from
# its [tools] section. ConfigParser.read() skips files it cannot open, in
# which case the first config["tools"] lookup below raises KeyError.
config = configparser.ConfigParser()
config.read("tools.cfg")
# Directory searched by get_origfile() for the original files handed to the parser.
ORIGPATH = Path(config["tools"]["kres_orig"])
# Directory whose regular files are the tsv inputs converted by handle_file().
INPATH = Path(config["tools"]["kres_srl"])
# Directory that receives one .json file per input file.
OUTPATH = Path(config["tools"]["kres_json"])
# Debug printing is enabled only by the exact, case-sensitive string "True".
DEBUG = config["tools"]["debug"] == "True"
# Number of worker processes given to the multiprocessing Pool.
CPU_CORES = int(config["tools"]["cpu_cores"])

# Make the log path absolute and ensure the file exists before logging starts.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# NOTE(review): resolve() returns a new Path and the result is discarded here,
# so this line does not change LOGFILE.
LOGFILE.resolve()

# Root logger writes to LOGFILE at INFO level, so logging.debug() calls are dropped.
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

def get_origfile(filename):
    """Return the entry of ORIGPATH that shares its base name with *filename*.

    The base name is the part of the file name before its first dot, so all
    extensions are ignored on both sides of the comparison.

    Args:
        filename: Path of the input file whose original counterpart is wanted.

    Returns:
        The first matching Path yielded by ``ORIGPATH.iterdir()``.

    Raises:
        FileNotFoundError: if no entry in ORIGPATH has the same base name.
    """
    # The wanted stem does not change while scanning, so compute it once.
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    # Name the missing stem and the searched directory so the failure can be
    # diagnosed from a worker-process traceback.
    raise FileNotFoundError(
        "No original file with base name '{}' found in {}".format(wanted, ORIGPATH)
    )

def extract_sentences(line_reader):
    """Group tab-separated token lines into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded ``bytes`` lines.

    Yields:
        One list per sentence; each element is the list of tab-separated
        fields of one token line. A line that contains no tab (normally an
        empty line) ends the current sentence. A sentence still open when
        the input ends is yielded as well, so a file without a trailing
        blank line does not lose its last sentence.
    """
    acc = []
    # Decode lazily, line by line, instead of materialising the whole input.
    for raw_line in line_reader:
        # Strip only the line terminator. Slicing off the last character
        # would eat real data on a final line that has no newline and would
        # leave a stray '\r' on CRLF input.
        fields = raw_line.decode("utf-8").rstrip("\r\n").split('\t')
        if len(fields) == 1:  # no tab: sentence separator
            yield acc
            acc = []
        else:
            acc.append(fields)
    if acc:
        # Input ended without a terminating blank line.
        yield acc

def to_sentence(sentence_arr):
    """Build the sentence text from a list of token rows.

    Takes the field at index 1 of every row (presumably the word form --
    confirm against the tsv layout) and joins them with single spaces.
    An empty list gives an empty string.
    """
    forms = (row[1] for row in sentence_arr)
    return " ".join(forms)

def match_sentence_id(sentence, orig_dict):
    """Find the id of the original sentence whose text equals *sentence*.

    Args:
        sentence: space-joined sentence text to look for.
        orig_dict: mapping of sentence id to an entry whose ``"tokens"``
            value is a sequence of token records; index 2 of each record is
            joined with spaces to rebuild the original sentence text.

    Returns:
        The key of the first entry (in dict iteration order) whose rebuilt
        text is equal to *sentence*.

    Raises:
        KeyError: if no entry matches; the message contains the sentence so
            the failing input can be identified from the traceback.
    """
    for sid, entry in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in entry["tokens"])
        if orig_sentence == sentence:
            return sid
    raise KeyError(
        "No sentence in the original data matches: {!r}".format(sentence)
    )

def get_dep_rel(token):
    """Return the first argument label attached to a token row, if any.

    Fields from index 14 onward are scanned left to right; "_" marks an
    empty slot. For the first non-empty slot a dict is returned with:
        "arg":  the label found in that slot,
        "from": the zero-based position of the slot (i-th predicate in the
                sentence),
        "dep":  the token's own index (field 0).
    Only the first non-empty slot is reported. Returns None when every slot
    is "_" or the row has no fields past index 13.
    """
    logging.debug(token)
    predicate_no = 0
    for label in token[14:]:
        if label != "_":
            return {"arg": label, "from": predicate_no, "dep": token[0]}
        predicate_no += 1
    return None

def handle_file(infile_tpl):
    """Convert one SRL tsv file into a JSON file of per-sentence relations.

    infile_tpl is an (index, path) pair; only the path (element 1) is used.
    The matching original file is parsed with the module-level parser to
    recover sentence ids, because the tsv itself carries none. The result,
    a dict mapping sentence id to a list of {"arg", "from", "dep"} dicts,
    is written to OUTPATH under the input's name with a ".json" suffix.

    NOTE(review): sentence ids are recovered by text equality, so two
    sentences with identical text in one file appear to resolve to the same
    id, and the later one would overwrite the earlier -- confirm.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    orig_dict = par.parse_tei(get_origfile(infile))

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            sentence_text = to_sentence(sentence_arr)
            # recover the sentence id by matching text against the original
            sid = match_sentence_id(sentence_text, orig_dict)

            relations = outdata[sid] = []
            predicate_ids = []
            for token in sentence_arr:
                # field 12 flags predicate tokens; remember their token index
                if token[12] == "Y":
                    predicate_ids.append(token[0])

                relation = get_dep_rel(token)
                if relation is not None:
                    relations.append(relation)

            # get_dep_rel stored "from" as the ordinal of the predicate;
            # swap it for that predicate's token index
            for relation in relations:
                relation["from"] = predicate_ids[relation["from"]]

            if DEBUG:
                print(sentence_text)
                print(relations)
                print(sid)
                print("\n")  # two blank lines between sentences

    with outfile.open("w") as out_fp:
        json.dump(outdata, out_fp)
        logging.info("SRL relations written to: {}".format(outfile))


# main
# The parser stays at module level (outside the guard below) because
# handle_file, which executes inside the worker processes, looks it up as a
# module global; under the spawn start method each worker re-imports this
# module and must get its own instance.
par = Parser()

# Guard the entry point: with the spawn/forkserver start methods every worker
# re-imports the main module, and an unguarded Pool creation at import time
# would make each worker try to start a pool of its own.
if __name__ == "__main__":
    OUTPATH.mkdir(exist_ok=True)

    # Each work item is an (index, path) pair; only regular files are taken.
    infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
    logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

    with Pool(CPU_CORES) as p:
        p.map(handle_file, infiles)

    logging.info("Finished generating .json files.")
gen_json.py needs a bit more work 5 years ago			`from pathlib import Path`
			`from parser.parser import Parser`
added tools.cfg for configurable paths 5 years ago			`import configparser`
fixed paths 5 years ago			`import json`
			`import sys`
added logger 5 years ago			`import logging`
added parallel json output creation 5 years ago			`from multiprocessing import Pool`
gen_json.py needs a bit more work 5 years ago
added tools.cfg for configurable paths 5 years ago			`# parse config`
			`config = configparser.ConfigParser()`
			`config.read("tools.cfg")`
			`ORIGPATH = Path(config["tools"]["kres_orig"])`
			`INPATH = Path(config["tools"]["kres_srl"])`
			`OUTPATH = Path(config["tools"]["kres_json"])`
fixed paths 5 years ago			`DEBUG = config["tools"]["debug"] == "True"`
added parallel json output creation 5 years ago			`CPU_CORES = int(config["tools"]["cpu_cores"])`
added logging; paralelize the first part now 5 years ago
added parallel json output creation 5 years ago			`LOGFILE = Path(config["tools"]["logfile"]).absolute()`
			`LOGFILE.touch(exist_ok=True)`
			`LOGFILE.resolve()`
added tools.cfg for configurable paths 5 years ago
added parallel json output creation 5 years ago			`logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)`
added logger 5 years ago
gen_json.py needs a bit more work 5 years ago			`def get_origfile(filename):`
added parallel json output creation 5 years ago			`for origfile in ORIGPATH.iterdir():`
			`if filename.name.split('.')[0] == origfile.name.split('.')[0]:`
			`return origfile`
			`raise FileNotFoundError`
gen_json.py needs a bit more work 5 years ago
			`def extract_sentences(line_reader):`
added parallel json output creation 5 years ago			`acc = []`
			`# last char in line is \n, remove it`
			`for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:`
			`if len(line) == 1: # empty line`
			`tmp = acc`
			`acc = []`
			`yield tmp`
			`else:`
			`acc.append(line)`
gen_json.py needs a bit more work 5 years ago
tmp 5 years ago			`def to_sentence(sentence_arr):`
added parallel json output creation 5 years ago			`return " ".join([token[1] for token in sentence_arr])`
tmp 5 years ago
			`def match_sentence_id(sentence, orig_dict):`
added parallel json output creation 5 years ago			`for k, e in orig_dict.items():`
			`orig_sentence = " ".join(token[2] for token in e["tokens"])`
			`if sentence == orig_sentence:`
			`return k`
			`raise KeyError`
gen_json.py needs a bit more work 5 years ago
tmp 5 years ago			`def get_dep_rel(token):`
added parallel json output creation 5 years ago			`logging.debug(token)`
			`for i, field in enumerate(token[14:]):`
			`if field != "_":`
			`return {`
			`"arg": field,`
			`"from": i, # i-th predicate in sentence`
			`"dep": token[0],`
			`}`
			`return None`

			`def handle_file(infile_tpl):`
			`i = infile_tpl[0]`
			`infile = infile_tpl[1]`
			`outfile = (OUTPATH / infile.name).with_suffix(".json")`
			`origfile = get_origfile(infile)`
			`orig_dict = par.parse_tei(origfile)`

			`with infile.open("rb") as fp:`
			`outdata = {}`
			`for sentence_arr in extract_sentences(fp.readlines()):`
			`# tsv dropped sentence ids, match the ID, using original data`
			`sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)`

			`outdata[sid] = []`

			`# find all predicate indices in the sentence`
			`predicates = []`
			`for token in sentence_arr:`
			`if token[12] == "Y":`
			`predicates += [token[0]] # idx`

			`deprel = get_dep_rel(token)`
			`if deprel is not None:`
			`outdata[sid].append(deprel)`

			`# deprel["from"] points to n-th predicate`
			`# replace with predicate's token index`
			`for deprel in outdata[sid]:`
			`deprel["from"] = predicates[deprel["from"]]`

			`if DEBUG:`
			`print(to_sentence(sentence_arr))`
			`print(outdata[sid])`
			`print(sid)`
			`print()`
			`print()`

			`with outfile.open("w") as fp:`
			`json.dump(outdata, fp)`
			`logging.info("SRL relations written to: {}".format(outfile))`


			`# main`
testing new config 5 years ago			`par = Parser()`
			`OUTPATH.mkdir(exist_ok=True)`
gen_json.py needs a bit more work 5 years ago
added parallel json output creation 5 years ago			`infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))`
			`logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))`

			`with Pool(CPU_CORES) as p:`
			`p.map(handle_file, infiles)`

added logger 5 years ago			`logging.info("Finished generating .json files.")`