cjvt-srl-tagging/tools/gen_json.py

92 lines
2.2 KiB
Python
Raw Normal View History

2019-02-25 23:22:15 +00:00
from pathlib import Path
from parser.parser import Parser
2019-02-27 15:58:04 +00:00
import json
2019-02-25 23:22:15 +00:00
# Directory holding the original corpus files; they are parsed (see the
# main script) to recover the sentence IDs.
ORIGPATH = Path("../data/kres_example") # we need the IDs
# Directory holding the tab-separated input files processed by this script.
INPATH = Path("../data/kres_example_srl")
# Directory the generated .json files are written to.
OUTPATH = Path("../data/kres_example_json")
2019-02-27 15:58:04 +00:00
# When True, tokens and per-sentence results are printed while the script runs.
DEBUG = False
2019-02-25 23:22:15 +00:00
def get_origfile(filename):
    """Return the entry of ORIGPATH whose base name matches *filename*.

    Two files match when the part of their names before the first '.' is
    identical, so any extension (single or multi-part) is ignored.

    Args:
        filename: path object exposing ``.name`` (e.g. ``pathlib.Path``).

    Returns:
        The first matching entry yielded by ``ORIGPATH.iterdir()``.

    Raises:
        FileNotFoundError: if no entry in ORIGPATH shares the base name.
    """
    # The wanted base name does not change while scanning; compute it once.
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    # Name the missing file so a failed run can be diagnosed from the message.
    raise FileNotFoundError(
        "no original file matching '{}' found in {}".format(wanted, ORIGPATH)
    )
def extract_sentences(line_reader):
    """Group tab-separated token lines into sentences.

    Each element of *line_reader* is decoded as UTF-8, stripped of its line
    terminator and split on tabs.  A line that yields a single field (i.e.
    contains no tab, such as a blank line) ends the current sentence.

    Args:
        line_reader: iterable of ``bytes`` lines (e.g. ``fp.readlines()`` of
            a file opened in binary mode).

    Yields:
        One list per sentence; each item is the list of string fields of one
        token line.  Empty sentences (consecutive separator lines) are
        skipped.
    """
    acc = []
    for raw_line in line_reader:
        # Strip only the line terminator: slicing off the last character
        # would eat real data when the final line has no trailing newline
        # and would leave a stray '\r' on CRLF input.
        fields = raw_line.decode("utf-8").rstrip("\r\n").split('\t')
        if len(fields) == 1:  # separator (empty) line
            if acc:
                yield acc
                acc = []
        else:
            acc.append(fields)
    # Flush the last sentence when the input does not end with a blank line.
    if acc:
        yield acc
2019-02-27 15:58:04 +00:00
def to_sentence(sentence_arr):
    """Build a sentence string from token rows.

    Takes the second field (index 1) of every row in *sentence_arr* and
    joins them with single spaces.
    """
    second_fields = (fields[1] for fields in sentence_arr)
    return " ".join(second_fields)
def match_sentence_id(sentence, orig_dict):
    """Return the key of the original sentence whose text equals *sentence*.

    The text of each entry is rebuilt by joining index 2 of every item in
    ``entry["tokens"]`` with single spaces and compared to *sentence*.

    Args:
        sentence: space-joined sentence text to look up.
        orig_dict: mapping of sentence id -> entry with a ``"tokens"`` list.

    Returns:
        The key of the first entry (in iteration order) whose text matches.

    Raises:
        KeyError: if no entry matches; the message contains the sentence.
    """
    for sid, entry in orig_dict.items():
        if sentence == " ".join(token[2] for token in entry["tokens"]):
            return sid
    # Include the offending text so an unmatched sentence can be located.
    raise KeyError("no original sentence matches: {!r}".format(sentence))
2019-02-27 15:58:04 +00:00
def get_dep_rel(token):
    """Return the first argument label found in *token*, or None.

    Fields from index 14 onwards are scanned left to right; the first one
    that is not "_" produces the result.

    Returns:
        A dict with keys "arg" (the field value), "from" (the offset of that
        field past index 14, i.e. the i-th predicate in the sentence) and
        "dep" (``token[0]``), or None when every such field is "_".
    """
    if DEBUG:
        print(token)
    first_hit = next(
        ((offset, label)
         for offset, label in enumerate(token[14:])
         if label != "_"),
        None,
    )
    if first_hit is None:
        return None
    offset, label = first_hit
    return {
        "arg": label,
        "from": offset,  # i-th predicate in sentence
        "dep": token[0],
    }
2019-02-25 12:44:24 +00:00
if __name__ == "__main__":
    # Convert every tab-separated file in INPATH into a JSON file in OUTPATH
    # that maps sentence id -> list of {"arg", "from", "dep"} relations.
    par = Parser()
    OUTPATH.mkdir(exist_ok=True)

    for infile in INPATH.iterdir():
        if not infile.is_file():
            continue

        origfile = get_origfile(infile)
        orig_dict = par.parse_tei(origfile)

        # Read everything up front so the input handle is closed promptly.
        with infile.open("rb") as fp:
            lines = fp.readlines()

        outdata = {}
        for sentence_arr in extract_sentences(lines):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []

            # find all predicate indices in the sentence
            # NOTE(review): index 12 == "Y" and the fields from index 14 on
            # look like the CoNLL-2009 FILLPRED / APRED columns -- confirm.
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])  # idx
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index; done after the scan
            # because the full predicate list is only known at that point
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

        outfile = (OUTPATH / infile.name).with_suffix(".json")
        # Context manager guarantees the JSON is flushed and the handle closed.
        with outfile.open("w") as out_fp:
            json.dump(outdata, out_fp)