diff --git a/Makefile b/Makefile index 8c5adc9..ad7e8a0 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ -.PHONY: tsv_files srl_tagged_files +.PHONY: tsv_files srl_tagged_files json_files env -all: srl_tagged_files +all: json_files -json_files: srl_tagged_files +json_files: #TODO srl_tagged_files cd tools; python3 gen_json.py srl_tagged_files: tsv_files diff --git a/tools/gen_json.py b/tools/gen_json.py index c074a8d..0c7e701 100644 --- a/tools/gen_json.py +++ b/tools/gen_json.py @@ -1,5 +1,48 @@ -import Path +from pathlib import Path +from parser.parser import Parser + +ORIGPATH = Path("../data/kres_example") # we need the IDs +INPATH = Path("../data/kres_example_srl") +OUTPATH = Path("../data/kres_example_json") + +def get_origfile(filename): + for origfile in ORIGPATH.iterdir(): + if filename.name.split('.')[0] == origfile.name.split('.')[0]: + return origfile + raise FileNotFoundError + +def extract_sentences(line_reader): + acc = [] + for line in [x.decode("utf-8").split('\t') for x in line_reader]: + if line[0] == '\n': + tmp = acc + acc = [] + yield tmp + else: + acc.append(line) + +def match_sentence_id(sentence_arr, rd): + str1 = " ".join([token[1] for token in sentence_arr]) + for k, e in rd.items(): + str2 = " ".join(token[2] for token in e["tokens"]) + if str1 == str2: + return k + raise KeyError + if __name__ == "__main__": - print("TODO: take data/kres_example_srl/* and generate data/kres_example_json/*") - print("TODO: check ssj and kres for structure") + + par = Parser() + + for infile in [x for x in INPATH.iterdir() if x.is_file()]: + origfile = get_origfile(infile) + rd = par.parse_tei(origfile) + + fp = infile.open("rb") + for sentence_arr in extract_sentences(fp.readlines()): + sid = match_sentence_id(sentence_arr, rd) + print(sid) + # OK, we got the sentence id, now generate the predicate map! 
+ + + outfile = (OUTPATH / infile.name).with_suffix(".json") \ No newline at end of file diff --git a/tools/parse_all.py b/tools/parse_all.py index 0bd0146..01a867f 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -7,7 +7,7 @@ import sys import cProfile -def main(): +if __name__ == "__main__": # make sure you sanitize every input into unicode SSJ500K_2_1 = 27829 # number of sentences @@ -44,8 +44,3 @@ def main(): fp.close() print("end parsing kres") - - -if __name__ == "__main__": - # cProfile.run("main()", sort="tottime") - main()