From eb83519f51c315b49757e9bc8a634fe988d86c92 Mon Sep 17 00:00:00 2001 From: voje Date: Thu, 14 Mar 2019 14:13:01 +0100 Subject: [PATCH] done creating kres and ssj json files --- Makefile | 2 +- src/pkg/corpusparser/corpusparser/main.py | 49 +++++++++++++++++++---- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 790cbf0..7734cb5 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml" KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example" KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl" OUTPUT = "file" -OUTDIR = "$(HOME)/workdir/outputfolder" +OUTDIR = "/home/voje/workdir/test_out" DBADDR = "" export diff --git a/src/pkg/corpusparser/corpusparser/main.py b/src/pkg/corpusparser/corpusparser/main.py index b6dd803..e0ba065 100644 --- a/src/pkg/corpusparser/corpusparser/main.py +++ b/src/pkg/corpusparser/corpusparser/main.py @@ -9,20 +9,52 @@ logger = logging.getLogger(__name__) ## Main handles command line arguments and writing to files / DB. -def ssj_to_json_file(sentence_generator, outfile): +def ssj_to_json_file(sentence_generator, outfolder): # this funciton is based on the fact that files are parsed sequentially + outfolder = Path(outfolder) + outfolder.mkdir(parents=True, exist_ok=True) + outfile = outfolder / "ssj500k.json" + data_buffer = [] for s in sentence_generator: sdata = s[1] data_buffer += [sdata] # outfile = Path(outfile) - with open(outfile, "w") as fp: + with outfile.open("w") as fp: logger.info("Writing to {}".format(outfile)) json.dump(data_buffer, fp) -def kres_to_json_files() - return "TODO" +def kres_to_json_files(sentence_generator, outfolder): + outfolder = Path(outfolder) / "kres_json" + outfolder.mkdir(parents=True, exist_ok=True) + + def write_buffer_to_file(outfile, outfile_buffer): + logger.info("Writing file: {}".format(outfile)) + with outfile.open("w") as fp: + json.dump(outfile_buffer, fp) + + outfile_buffer = None + current_outfile = None + for s in sentence_generator: + infile = s[0] + outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json") + + # parser sequentially parses files; when we're done with a file, write it out + if current_outfile is None: + current_outfile = outfile + outfile_buffer = [] + elif outfile != current_outfile: + write_buffer_to_file(current_outfile, outfile_buffer) + current_outfile = outfile + outfile_buffer = [] + + # update buffer + sdata = s[1] + outfile_buffer += [sdata] + write_buffer_to_file(current_outfile, outfile_buffer) + + def to_db(): return "TODO" @@ -47,7 +79,7 @@ if __name__ == "__main__": # logger.info("Parsed {} sentences (ssj500k)".format(len(res))) # ssj to json - ssj_to_json_file(ssj_parser.sentence_generator(), "/home/voje/workdir/ssj.json") + ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir) # parse kres logger.info("Parsing Kres: {}".format(args.ssj_file)) @@ -55,8 +87,11 @@ if __name__ == "__main__": corpus="kres", infiles=[args.kres_folder, args.kres_srl_folder], ) - res = [x[1]["sid"] for x in kres_parser.sentence_generator()] - logger.info("Parsed {} sentences (kres)".format(len(res))) + # res = [x[1]["sid"] for x in kres_parser.sentence_generator()] + # logger.info("Parsed {} sentences (kres)".format(len(res))) + + # kres to json + kres_to_json_files(kres_parser.sentence_generator(), args.outdir) ## Handling output is situational --- implement it outside of Parser.