|
|
|
@ -1,12 +1,21 @@
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from parser.parser import Parser
|
|
|
|
|
import json
|
|
|
|
|
import configparser
|
|
|
|
|
|
|
|
|
|
# defaults (used when tools.cfg is absent or incomplete is NOT handled —
# config keys below are required and will raise KeyError if missing)
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
DEBUG = False

# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
# BUG FIX: bool(config["tools"]["debug"]) was True for ANY non-empty string
# (including "False" and "0"); getboolean() parses ini-style booleans
# ("true"/"false", "yes"/"no", "1"/"0") correctly.
DEBUG = config.getboolean("tools", "debug")
|
|
|
|
|
|
|
|
|
|
def get_origfile(filename):
|
|
|
|
|
for origfile in ORIGPATH.iterdir():
|
|
|
|
|
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
|
|
|
|
@ -47,45 +56,43 @@ def get_dep_rel(token):
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Convert SRL TSV output back into per-file JSON keyed by sentence id.
    # (The corrupted source duplicated every statement group; this is the
    # deduplicated single pass.)
    par = Parser()
    OUTPATH.mkdir(exist_ok=True)

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        # The original TEI file supplies the sentence IDs the TSV dropped.
        origfile = get_origfile(infile)
        orig_dict = par.parse_tei(origfile)

        outdata = {}
        # "with" guarantees the input handle is closed (was leaked before).
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp.readlines()):
                # tsv dropped sentence ids, match the ID, using original data
                sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
                outdata[sid] = []

                # find all predicate indices in the sentence
                predicates = []
                for token in sentence_arr:
                    if token[12] == "Y":
                        predicates += [token[0]]  # idx

                    deprel = get_dep_rel(token)
                    if deprel is not None:
                        outdata[sid].append(deprel)

                # deprel["from"] points to n-th predicate
                # replace with predicate's token index
                for deprel in outdata[sid]:
                    deprel["from"] = predicates[deprel["from"]]

                if DEBUG:
                    print(to_sentence(sentence_arr))
                    print(outdata[sid])
                    print(sid)
                    print()
                    print()

        outfile = (OUTPATH / infile.name).with_suffix(".json")
        # print(outdata)
        # "with" guarantees the output handle is flushed and closed
        # (json.dump(outdata, outfile.open("w")) leaked the handle).
        with outfile.open("w") as out_fp:
            json.dump(outdata, out_fp)