gen_json.py needs a bit more work

This commit is contained in:
voje 2019-02-26 00:22:15 +01:00
parent 66c43b3d19
commit bcaf226b9e
3 changed files with 50 additions and 12 deletions

View File

@ -1,8 +1,8 @@
.PHONY: tsv_files srl_tagged_files
.PHONY: tsv_files srl_tagged_files json_files env
all: srl_tagged_files
all: json_files
json_files: srl_tagged_files
json_files: #TODO srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: tsv_files

View File

@ -1,5 +1,48 @@
import Path
from pathlib import Path
from parser.parser import Parser
# Directory searched by get_origfile() for the original file that belongs to
# each input file; the match is then handed to Parser.parse_tei().
ORIGPATH = Path("../data/kres_example") # we need the IDs
# Directory whose regular files are iterated as input by the __main__ block
# (presumably the SRL-tagged output, judging by the name -- TODO confirm).
INPATH = Path("../data/kres_example_srl")
# Directory under which the per-input ".json" output path is built.
OUTPATH = Path("../data/kres_example_json")
def get_origfile(filename):
    """Return the entry of ORIGPATH that shares its base name with *filename*.

    Base names are compared up to the first dot only, so every extension
    (including multi-part ones) is ignored on both sides.

    Args:
        filename: Path-like object exposing a ``name`` attribute.

    Returns:
        The first entry yielded by ``ORIGPATH.iterdir()`` whose base name
        equals that of *filename*.

    Raises:
        FileNotFoundError: If no entry in ORIGPATH matches.
    """
    # Loop-invariant: compute the wanted base name once.
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    # Say which file was missing so the failure can be diagnosed.
    raise FileNotFoundError(
        "no original file matching '{}' in {}".format(wanted, ORIGPATH)
    )
def extract_sentences(line_reader):
    """Yield sentences from an iterable of UTF-8 encoded, tab-separated lines.

    Each input line is decoded and split on tabs. A line whose first field
    is exactly ``"\\n"`` (i.e. a blank line) ends the current sentence.
    Fields are not stripped, so the last field of every row keeps its
    trailing newline.

    Args:
        line_reader: Iterable of ``bytes`` lines, e.g. the result of
            ``readlines()`` on a file opened in binary mode.

    Yields:
        One list per sentence; each element is the list of string fields
        of a single line. Consecutive blank lines yield empty lists.
    """
    acc = []
    # Decode line by line instead of materialising the whole input first.
    for raw in line_reader:
        fields = raw.decode("utf-8").split('\t')
        if fields[0] == '\n':
            # Blank line: hand out what was collected and start afresh.
            yield acc
            acc = []
        else:
            acc.append(fields)
    # Input that does not end with a blank line would otherwise lose its
    # final sentence.
    if acc:
        yield acc
def match_sentence_id(string, rd):
    """Return the key of the entry in *rd* whose tokens spell the same sentence.

    The sentence is compared as a single space-joined string: element 1 of
    every row in *string* against element 2 of every token under the
    ``"tokens"`` key of each entry in *rd*.

    Args:
        string: The sentence to look up. Despite the parameter name this is
            a sequence of rows (the caller passes one sentence as produced
            by extract_sentences); the text is taken from index 1 of each
            row.
        rd: Mapping of sentence id to an entry with a ``"tokens"`` sequence;
            the text is taken from index 2 of each token.
            NOTE(review): presumably the result of Parser.parse_tei --
            confirm the token layout there.

    Returns:
        The key of the first entry in *rd* whose joined text is equal.

    Raises:
        KeyError: If no entry in *rd* matches the sentence.
    """
    # Use the parameter, not a global that happens to exist in the caller.
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    raise KeyError(str1)
if __name__ == "__main__":
    # Script entry point: for every input file, find its original file,
    # parse it, and print the sentence id matched for each input sentence.
    print("TODO: take data/kres_example_srl/* and generate data/kres_example_json/*")
    print("TODO: check ssj and kres <links> for structure")

    par = Parser()

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        rd = par.parse_tei(origfile)

        # Context manager so the handle is closed even if matching fails.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp.readlines()):
                sid = match_sentence_id(sentence_arr, rd)
                print(sid)
                # OK, we got the sentence id, now generate the predicate map!

        # Target path for the generated JSON; nothing is written to it yet.
        outfile = (OUTPATH / infile.name).with_suffix(".json")

View File

@ -7,7 +7,7 @@ import sys
import cProfile
def main():
if __name__ == "__main__":
# make sure you sanitize every input into unicode
SSJ500K_2_1 = 27829 # number of sentences
@ -44,8 +44,3 @@ def main():
fp.close()
print("end parsing kres")
if __name__ == "__main__":
# cProfile.run("main()", sort="tottime")
main()