diff --git a/Makefile b/Makefile index 0092763..0251da3 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ json_files: srl_tagged_files srl_tagged_files: tsv_files # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd - - cd tools/srl-20131216; ./tag_all.sh ../../data/kres_example_tsv ../../data/kres_example_srl + cd tools/srl-20131216; ./tag_all.sh tsv_files: tools/fillpred_model/model.pickle cd tools; python3 parse_all.py diff --git a/tools/gen_json.py b/tools/gen_json.py index 72e7068..b8ba508 100644 --- a/tools/gen_json.py +++ b/tools/gen_json.py @@ -1,12 +1,21 @@ from pathlib import Path from parser.parser import Parser -import json +import configparser +# defaults ORIGPATH = Path("../data/kres_example") # we need the IDs INPATH = Path("../data/kres_example_srl") OUTPATH = Path("../data/kres_example_json") DEBUG = False +# parse config +config = configparser.ConfigParser() +config.read("tools.cfg") +ORIGPATH = Path(config["tools"]["kres_orig"]) +INPATH = Path(config["tools"]["kres_srl"]) +OUTPATH = Path(config["tools"]["kres_json"]) +DEBUG = bool(config["tools"]["debug"]) + def get_origfile(filename): for origfile in ORIGPATH.iterdir(): if filename.name.split('.')[0] == origfile.name.split('.')[0]: @@ -47,45 +56,43 @@ def get_dep_rel(token): return None -if __name__ == "__main__": - - par = Parser() - OUTPATH.mkdir(exist_ok=True) +par = Parser() +OUTPATH.mkdir(exist_ok=True) - for infile in [x for x in INPATH.iterdir() if x.is_file()]: - origfile = get_origfile(infile) - orig_dict = par.parse_tei(origfile) +for infile in [x for x in INPATH.iterdir() if x.is_file()]: + origfile = get_origfile(infile) + orig_dict = par.parse_tei(origfile) - fp = infile.open("rb") - outdata = {} - for sentence_arr in extract_sentences(fp.readlines()): - # tsv dropped sentence ids, match the ID, using original data - sid = match_sentence_id(to_sentence(sentence_arr), orig_dict) + fp = infile.open("rb") + outdata = {} + for sentence_arr in 
extract_sentences(fp.readlines()): + # tsv dropped sentence ids, match the ID, using original data + sid = match_sentence_id(to_sentence(sentence_arr), orig_dict) - outdata[sid] = [] + outdata[sid] = [] - # find all predicate indices in the sentence - predicates = [] - for token in sentence_arr: - if token[12] == "Y": - predicates += [token[0]] # idx + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx - deprel = get_dep_rel(token) - if deprel is not None: - outdata[sid].append(deprel) + deprel = get_dep_rel(token) + if deprel is not None: + outdata[sid].append(deprel) - # deprel["from"] points to n-th predicate - # replace with predicate's token index - for deprel in outdata[sid]: - deprel["from"] = predicates[deprel["from"]] + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in outdata[sid]: + deprel["from"] = predicates[deprel["from"]] - if DEBUG: - print(to_sentence(sentence_arr)) - print(outdata[sid]) - print(sid) - print() - print() + if DEBUG: + print(to_sentence(sentence_arr)) + print(outdata[sid]) + print(sid) + print() + print() - outfile = (OUTPATH / infile.name).with_suffix(".json") - # print(outdata) - json.dump(outdata, outfile.open("w")) + outfile = (OUTPATH / infile.name).with_suffix(".json") + # print(outdata) + json.dump(outdata, outfile.open("w")) diff --git a/tools/parse_all.py b/tools/parse_all.py index f6f67dc..1300afa 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -5,40 +5,45 @@ from pathlib import Path import re import sys import cProfile - - -if __name__ == "__main__": - # make sure you sanitize every input into unicode - - SSJ500K_2_1 = 27829 # number of sentences - par = Parser() - - """ - print("parsing ssj") - ssj_file = "../data/ssj500k-sl.sample.xml" - ssj_dict = par.parse_tei(ssj_file) - # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences." 
- print("end parsing ssj") - """ - - print("parsing kres") - # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" - kres_dir = Path("../data/kres_example/").resolve() - - kres_out_dir = kres_dir.parent / (kres_dir.name + "_tsv") - kres_out_dir.mkdir(exist_ok=True) - - for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]: - - print("Processing file: " + str(kres_file)) - res_dict = par.parse_tei(kres_file) - kres_out_str = "" - - for _, sentence in res_dict.items(): - kres_out_str += par.to_conll_2009_SRL(sentence) - - with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp: - fp.write(kres_out_str.encode("utf-8")) - fp.close() - - print("end parsing kres") +import configparser + +# some defaults +INDIR = Path("../data/kres_example") +OUTDIR = Path("../data/kres_example_tsv") + +SSJ500K_2_1 = 27829 # number of sentences +par = Parser() + +# path to data +config = configparser.ConfigParser() +config.read("tools.cfg") +INDIR = Path(config["tools"]["kres_orig"]) +OUTDIR = Path(config["tools"]["kres_tsv"]) + +""" +print("parsing ssj") +ssj_file = "../data/ssj500k-sl.sample.xml" +ssj_dict = par.parse_tei(ssj_file) +# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences." 
+print("end parsing ssj") +""" + +print("parsing kres") +# kres_file = "../data/kres_example/F0019343.xml.parsed.xml" +OUTDIR.mkdir(exist_ok=True) + +for kres_file in [x for x in INDIR.iterdir() if x.is_file()]: + + print("Processing file: " + str(kres_file)) + res_dict = par.parse_tei(kres_file) + longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()]) + print("Longest sentence: ", longest_sent) + kres_out_str = "" + + for _, sentence in res_dict.items(): + kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent) + + with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp: + fp.write(kres_out_str.encode("utf-8")) + fp.close() +print("end parsing kres") diff --git a/tools/parser/parser.py b/tools/parser/parser.py index 92c226c..d37b49c 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -4,6 +4,7 @@ from parser.msd.msdmap import Msdmap import pickle from pathlib import Path from fillpred_model.step1 import build_model_row +import sys class Parser: # reads a TEI xml file and returns a dictionary: @@ -18,8 +19,12 @@ class Parser: self.W_TAGS = ['w'] self.C_TAGS = ['c'] self.S_TAGS = ['S', 'pc'] - with Path("./fillpred_model/model.pickle").open("rb") as fp: + try: + fp = Path("./fillpred_model/model.pickle").open("rb") self.fillpred_model = pickle.load(fp) + except IOError: + print("Generate the model first: $ make tools/fillpred_model/model.pickle") + sys.exit(1) def parse_tei(self, filepath): diff --git a/tools/srl-20131216/tag_all.sh b/tools/srl-20131216/tag_all.sh index 6f39eab..ec6cc5c 100755 --- a/tools/srl-20131216/tag_all.sh +++ b/tools/srl-20131216/tag_all.sh @@ -1,7 +1,11 @@ #!/bin/bash -IN_FOLDER="$1" -OUT_FOLDER="$2" +# parsing tools.cfg values +IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)" +echo "input folder: $IN_FOLDER" +OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)" +echo "output folder: $OUT_FOLDER" + SUFFIX="srl.tsv" mkdir -p $OUT_FOLDER diff --git 
a/tools/tools.cfg b/tools/tools.cfg new file mode 100644 index 0000000..f2ed069 --- /dev/null +++ b/tools/tools.cfg @@ -0,0 +1,6 @@ +[tools] +kres_orig = ../data/kres_example +kres_tsv = ../data/kres_example_tsv +kres_srl = ../data/kres_example_srl +kres_json = ../data/kres_example_json +debug = False \ No newline at end of file