# cjvt-srl-tagging/tools/gen_json.py

from pathlib import Path
from parser.parser import Parser
import configparser

# defaults; used whenever tools.cfg or one of its [tools] keys is absent
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")

# parse config
# ConfigParser.read() silently skips a file it cannot find, so indexing
# config["tools"] would raise KeyError in that case. Each lookup therefore
# passes the default defined above as its fallback.
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config.get("tools", "kres_orig", fallback=str(ORIGPATH)))
INPATH = Path(config.get("tools", "kres_srl", fallback=str(INPATH)))
OUTPATH = Path(config.get("tools", "kres_json", fallback=str(OUTPATH)))

def get_origfile(filename):
	"""Find the entry in ORIGPATH that corresponds to *filename*.

	Two files correspond when the part of their names before the first '.'
	is identical.

	Args:
		filename: a path object; only its ``name`` attribute is used.

	Returns:
		The first matching entry produced by ``ORIGPATH.iterdir()``.

	Raises:
		FileNotFoundError: if no entry in ORIGPATH shares the base name.
	"""
	# Loop-invariant, so compute the wanted base name only once.
	wanted = filename.name.split('.')[0]
	for origfile in ORIGPATH.iterdir():
		if origfile.name.split('.')[0] == wanted:
			return origfile
	# Name the missing file so a failure can be diagnosed from the traceback.
	raise FileNotFoundError(
		"no file with base name {!r} found in {}".format(wanted, ORIGPATH)
	)

def extract_sentences(line_reader):
	"""Yield sentences from tab-separated, blank-line-delimited input.

	Args:
		line_reader: iterable of UTF-8 encoded ``bytes`` lines, for example
			a file opened in "rb" mode or the result of ``readlines()``.

	Yields:
		One list per sentence. Each element is the list of tab-separated
		fields of one input line; the line terminator stays on the last
		field. A blank line closes the current sentence.
	"""
	acc = []
	# Decode line by line instead of materialising the whole input first.
	for raw in line_reader:
		line = raw.decode("utf-8").split('\t')
		# Lines arrive as raw bytes (no newline translation), so a blank
		# line may be terminated by either LF or CRLF.
		if line[0] in ('\n', '\r\n'):
			tmp = acc
			acc = []
			yield tmp
		else:
			acc.append(line)
	# Input that does not end with a blank line still has a pending
	# sentence; emit it instead of silently dropping it.
	if acc:
		yield acc

def match_sentence_id(string, rd):
	"""Return the key of the *rd* entry whose tokens spell the same sentence.

	Args:
		string: the sentence as a sequence of token rows (despite the name,
			not a ``str``); element 1 of every row is compared.
		rd: mapping of sentence id to an entry whose ``"tokens"`` value is a
			sequence of tokens; element 2 of every token is compared.

	Returns:
		The first key of *rd* for which the space-joined element 2 of its
		tokens equals the space-joined element 1 of the rows in *string*.

	Raises:
		KeyError: if no entry of *rd* matches.
	"""
	# Use the parameter itself; the body previously reached for a global
	# named ``sentence_arr`` and an undefined ``dict_entry``.
	str1 = " ".join(token[1] for token in string)
	for k, e in rd.items():
		str2 = " ".join(token[2] for token in e["tokens"])
		if str1 == str2:
			return k
	raise KeyError("no sentence id matches: {!r}".format(str1))


if __name__ == "__main__":
	# For every file in INPATH, look up its counterpart in ORIGPATH, parse
	# that counterpart and print the matched sentence id of each sentence.

	par = Parser()

	for infile in INPATH.iterdir():
		if not infile.is_file():
			continue
		origfile = get_origfile(infile)
		rd = par.parse_tei(origfile)

		# The context manager closes the input file even when matching
		# raises; the handle was previously left open.
		with infile.open("rb") as fp:
			for sentence_arr in extract_sentences(fp):
				sid = match_sentence_id(sentence_arr, rd)
				print(sid)
				# OK, we got the sentence id, now generate the predicate map!

		# TODO: the output path is computed but nothing is written to it yet.
		outfile = (OUTPATH / infile.name).with_suffix(".json")
gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00			`from pathlib import Path`
			`from parser.parser import Parser`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`import configparser`
gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`# defaults`
gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00			`ORIGPATH = Path("../data/kres_example") # we need the IDs`
			`INPATH = Path("../data/kres_example_srl")`
			`OUTPATH = Path("../data/kres_example_json")`

added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`# parse config`
			`config = configparser.ConfigParser()`
			`config.read("tools.cfg")`
			`ORIGPATH = Path(config["tools"]["kres_orig"])`
			`INPATH = Path(config["tools"]["kres_srl"])`
			`OUTPATH = Path(config["tools"]["kres_json"])`

gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00			`def get_origfile(filename):`
			`for origfile in ORIGPATH.iterdir():`
			`if filename.name.split('.')[0] == origfile.name.split('.')[0]:`
			`return origfile`
			`raise FileNotFoundError`

			`def extract_sentences(line_reader):`
			`acc = []`
			`for line in [x.decode("utf-8").split('\t') for x in line_reader]:`
			`if line[0] == '\n':`
			`tmp = acc`
			`acc = []`
			`yield tmp`
			`else:`
			`acc.append(line)`

			`def match_sentence_id(string, rd):`
			`str1 = " ".join([token[1] for token in sentence_arr])`
			`for k, e in rd.items():`
			`str2 = " ".join(token[2] for token in dict_entry["tokens"])`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`if str1 == str2:`
gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00			`return k`
			`raise KeyError`

Makefile fix 2019-02-25 12:44:24 +00:00
			`if __name__ == "__main__":`
gen_json.py needs a bit more work 2019-02-25 23:22:15 +00:00
			`par = Parser()`

			`for infile in [x for x in INPATH.iterdir() if x.is_file()]:`
			`origfile = get_origfile(infile)`
			`rd = par.parse_tei(origfile)`

			`fp = infile.open("rb")`
			`for sentence_arr in extract_sentences(fp.readlines()):`
			`sid = match_sentence_id(sentence_arr, rd)`
			`print(sid)`
			`# OK, we got the sentence id, now generate the predicate map!`


			`outfile = (OUTPATH / infile.name).with_suffix(".json")`