cjvt-srl-tagging/tools/gen_json.py
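
"""Convert SRL tagger output (tab-separated CoNLL-style files) into JSON.

The tagger's TSV output carries no sentence IDs, so each sentence is matched
back to its ID in the original KRES TEI files; each output file maps sentence
IDs to lists of {"arg", "from", "dep"} semantic-role links.
"""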

from pathlib import Path
from parser.parser import Parser  # project-local TEI parser package
import configparser
import json

# defaults
ORIGPATH = Path("../data/kres_example")  # we need the sentence IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
DEBUG = False

# parse config (overrides the defaults above)
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
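
# tools.cfg is expected to contain a [tools] section with these keys
# (the values below are illustrative, matching the defaults above):
#
#   [tools]
#   kres_orig = ../data/kres_example
#   kres_srl = ../data/kres_example_srl
#   kres_json = ../data/kres_example_json
#   debug = False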


def get_origfile(filename):
    # find the original KRES file that shares this SRL file's base name
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
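

# Group TSV rows into sentences: token rows accumulate until a blank line,
# which marks the end of a sentence and yields the accumulated rows.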
def extract_sentences(line_reader):
    acc = []
    # the last char in each line is '\n'; remove it
    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
        if len(line) == 1:  # empty line ends a sentence
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)


def to_sentence(sentence_arr):
    return " ".join([token[1] for token in sentence_arr])
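

# Recover the sentence ID by reconstructing the surface text and looking
# for an exact match among the sentences parsed from the original TEI file.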
def match_sentence_id(sentence, orig_dict):
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError
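

# Columns 15 onward (index 14+) appear to follow the CoNLL-2009 layout: one
# argument column per predicate in the sentence. Note that only the first
# non-"_" field is returned, so a token filling roles for several predicates
# keeps just the first one.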
def get_dep_rel(token):
    if DEBUG:
        print(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in the sentence
                "dep": token[0],
            }
    return None
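

# Main loop: for each SRL-tagged file, parse the matching original TEI file,
# recover sentence IDs, and write the per-sentence role links to a .json file.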
par = Parser()
OUTPATH.mkdir(exist_ok=True)

print("Start generating .json files.")
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # the TSV output dropped sentence IDs; recover them via the original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # token index

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to the n-th predicate in the sentence;
            # replace it with that predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    outfile = (OUTPATH / infile.name).with_suffix(".json")
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    print("SRL relations written to:", outfile)

print("Finished generating .json files.")