from pathlib import Path
from corpusparser import Parser
import argparse
import logging
import json

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Main handles command line arguments and writing to files / DB.


def ssj_to_json_file(sentence_generator, outfolder):
    """Buffer every ssj500k sentence and dump them to a single JSON file.

    Relies on the parser yielding ``(orig_file, sentence_data)`` tuples
    sequentially; only the sentence data (index 1) is kept.

    :param sentence_generator: iterable of (orig_file, sentence_data) tuples
    :param outfolder: directory that will receive ``ssj500k.json``
    """
    outfolder = Path(outfolder)
    outfolder.mkdir(parents=True, exist_ok=True)
    outfile = outfolder / "ssj500k.json"

    data_buffer = [s[1] for s in sentence_generator]

    # Write exactly once, after the whole corpus is buffered.  The previous
    # version re-opened and rewrote the file on every sentence, which is
    # O(n^2) I/O for the same final file content.
    logger.info("Writing to %s", outfile)
    with outfile.open("w") as fp:
        json.dump(data_buffer, fp)


def kres_to_json_files(sentence_generator, outfolder):
    """Write one JSON file per Kres source file into ``<outfolder>/kres_json``.

    The parser yields sentences grouped by source file, so the buffer is
    flushed to disk whenever the source file changes, and once more at the
    end for the last file.

    :param sentence_generator: iterable of (orig_file, sentence_data) tuples
    :param outfolder: base output directory; ``kres_json`` is created inside
    """
    outfolder = Path(outfolder) / "kres_json"
    outfolder.mkdir(parents=True, exist_ok=True)

    def write_buffer_to_file(outfile, outfile_buffer):
        # Dump one source file's worth of sentences as a JSON array.
        logger.info("Writing file: %s", outfile)
        with outfile.open("w") as fp:
            json.dump(outfile_buffer, fp)

    outfile_buffer = None
    current_outfile = None
    for s in sentence_generator:
        infile = s[0]
        outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json")
        # Parser walks input files sequentially; when the source file
        # changes we are done with the previous one and can flush it.
        if current_outfile is None:
            current_outfile = outfile
            outfile_buffer = []
        elif outfile != current_outfile:
            write_buffer_to_file(current_outfile, outfile_buffer)
            current_outfile = outfile
            outfile_buffer = []
        outfile_buffer.append(s[1])

    # Flush the final buffer.  Guard against an empty generator: the
    # previous version crashed with AttributeError on ``None.open()`` here.
    if current_outfile is not None:
        write_buffer_to_file(current_outfile, outfile_buffer)


def to_db():
    # TODO: persist parsed sentences to a database (see --dbaddr argument).
    return "TODO"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
    parser.add_argument('--kres-folder', required=True)
    parser.add_argument('--kres-srl-folder', required=True)
    parser.add_argument('--ssj-file', required=True)
    parser.add_argument('--output', required=False, default=None)
    parser.add_argument('--outdir', required=False, default=None)
    parser.add_argument('--dbaddr', required=False, default=None)
    args = parser.parse_args()

    # parse ssj
    logger.info("Parsing ssj500k: %s", args.ssj_file)
    ssj_parser = Parser(
        corpus="ssj",
        infiles=[args.ssj_file],
    )
    # ssj to json
    ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir)

    # parse kres
    # (fixed copy-paste bug: this used to log args.ssj_file)
    logger.info("Parsing Kres: %s", args.kres_folder)
    kres_parser = Parser(
        corpus="kres",
        infiles=[args.kres_folder, args.kres_srl_folder],
    )
    # kres to json
    kres_to_json_files(kres_parser.sentence_generator(), args.outdir)

    ## Handling output is situational --- implement it outside of Parser.
    ## Parser returns tuples (orig_file, element)
    # 1. parse per-file and output to file (JSON)
    # 2. parse and save to DB
    # TODO