forked from kristjan/cjvt-srl-tagging
gen_json.py needs a bit more work
This commit is contained in:
parent
66c43b3d19
commit
bcaf226b9e
6
Makefile
6
Makefile
|
@ -1,8 +1,8 @@
|
|||
.PHONY: tsv_files srl_tagged_files
|
||||
.PHONY: tsv_files srl_tagged_files json_files env
|
||||
|
||||
all: srl_tagged_files
|
||||
all: json_files
|
||||
|
||||
json_files: srl_tagged_files
|
||||
json_files: #TODO srl_tagged_files
|
||||
cd tools; python3 gen_json.py
|
||||
|
||||
srl_tagged_files: tsv_files
|
||||
|
|
|
@ -1,5 +1,48 @@
|
|||
import Path
|
||||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
|
||||
ORIGPATH = Path("../data/kres_example") # we need the IDs
|
||||
INPATH = Path("../data/kres_example_srl")
|
||||
OUTPATH = Path("../data/kres_example_json")
|
||||
|
||||
def get_origfile(filename):
    """Return the file in ORIGPATH that shares *filename*'s base name.

    The base name is everything before the first '.' in the file name, so
    multi-part extensions on either side are ignored when comparing.

    Args:
        filename: a path-like object exposing ``.name`` (e.g. pathlib.Path).

    Returns:
        The first entry yielded by ``ORIGPATH.iterdir()`` whose base name
        matches.

    Raises:
        FileNotFoundError: if no entry in ORIGPATH has a matching base name.
    """
    # Loop-invariant: compute the wanted base name once.
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no file in {} matches base name '{}'".format(ORIGPATH, wanted)
    )
|
||||
|
||||
def extract_sentences(line_reader):
    """Yield sentences from an iterable of UTF-8 encoded, tab-separated lines.

    Each input line is decoded and split on tabs into a list of fields. A
    line consisting of only a newline ends the current sentence, which is
    then yielded as a list of field lists. The trailing newline of each line
    is kept in its last field, exactly as read.

    Args:
        line_reader: iterable of ``bytes`` lines (e.g. ``fp.readlines()`` on
            a file opened in binary mode).

    Yields:
        list[list[str]]: the token rows of one sentence.
    """
    acc = []
    # Decode lazily, one line at a time, instead of building a full list first.
    for raw in line_reader:
        line = raw.decode("utf-8").split('\t')
        if line[0] == '\n':
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
    # Input that does not end with a blank separator line would otherwise
    # lose its last sentence.
    if acc:
        yield acc
|
||||
|
||||
def match_sentence_id(string, rd):
    """Return the key in *rd* whose tokens spell the same sentence as *string*.

    The sentence is compared as a single space-joined string: field 1 of
    every row in *string* against field 2 of every token in each entry's
    ``"tokens"`` list.

    Args:
        string: sequence of token rows; index 1 of each row is used as the
            token text (presumably the word form column of the SRL tsv --
            confirm against the input format).
        rd: mapping of id -> entry, where ``entry["tokens"]`` is a sequence
            of tokens and index 2 of each token is its text.

    Returns:
        The first key of *rd* whose joined token text equals the joined text
        of *string*.

    Raises:
        KeyError: if no entry in *rd* matches.
    """
    # Use the function's own parameter rather than a global from the caller.
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    raise KeyError(str1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: for every SRL-tagged file in INPATH, find the
    # original file it came from, parse it, and print the id of each
    # sentence matched against the parsed original.
    print("TODO: take data/kres_example_srl/* and generate data/kres_example_json/*")
    print("TODO: check ssj and kres <links> for structure")

    par = Parser()

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        rd = par.parse_tei(origfile)

        # Context manager guarantees the handle is closed even if a sentence
        # fails to match and raises.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp):
                sid = match_sentence_id(sentence_arr, rd)
                print(sid)
                # OK, we got the sentence id, now generate the predicate map!

        # NOTE(review): outfile is computed but not written yet; placement
        # inside the per-file loop is reconstructed -- confirm.
        outfile = (OUTPATH / infile.name).with_suffix(".json")
|
|
@ -7,7 +7,7 @@ import sys
|
|||
import cProfile
|
||||
|
||||
|
||||
def main():
|
||||
if __name__ == "__main__":
|
||||
# make sure you sanitize every input into unicode
|
||||
|
||||
SSJ500K_2_1 = 27829 # number of sentences
|
||||
|
@ -44,8 +44,3 @@ def main():
|
|||
fp.close()
|
||||
|
||||
print("end parsing kres")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# cProfile.run("main()", sort="tottime")
|
||||
main()
|
||||
|
|
Loading…
Reference in New Issue
Block a user