"""Command-line entry point for parsing the ssj500k and Kres corpora.

Handles command line arguments and writing the parsed sentences either to
JSON files or to the ``valdb`` MongoDB database.
"""

import argparse
import json
import logging
import sys
from pathlib import Path

from corpusparser import Parser
from pymongo import MongoClient

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def ssj_to_json_file(sentence_generator, outfolder):
    """Dump all sentences from the generator into one ``ssj500k.json`` file.

    Args:
        sentence_generator: Iterable of indexable items; element ``[1]`` of
            each item is the sentence data and must be JSON-serializable.
        outfolder: Directory that receives ``ssj500k.json``. It is created
            (including parents) when it does not exist.
    """
    outfolder = Path(outfolder)
    outfolder.mkdir(parents=True, exist_ok=True)
    outfile = outfolder / "ssj500k.json"

    data_buffer = [s[1] for s in sentence_generator]

    with outfile.open("w") as fp:
        logger.info("Writing to %s", outfile)
        json.dump(data_buffer, fp)


def kres_to_json_files(sentence_generator, outfolder):
    """Write sentences to one JSON file per input file under ``kres_json``.

    This function relies on the parser yielding the sentences of one input
    file consecutively: a change of the derived output file name is what
    triggers writing the previous file's buffer.

    Args:
        sentence_generator: Iterable of indexable items; element ``[0]`` is
            the input file (an object with a ``name`` attribute, presumably
            a ``pathlib.Path`` - verify against the parser) and element
            ``[1]`` is the JSON-serializable sentence data.
        outfolder: Parent directory; output goes to ``<outfolder>/kres_json``,
            which is created when missing.
    """
    outfolder = Path(outfolder) / "kres_json"
    outfolder.mkdir(parents=True, exist_ok=True)

    def write_buffer_to_file(outfile, outfile_buffer):
        """Serialize one input file's buffered sentences to ``outfile``."""
        logger.info("Writing file: %s", outfile)
        with outfile.open("w") as fp:
            json.dump(outfile_buffer, fp)

    outfile_buffer = []
    current_outfile = None
    for s in sentence_generator:
        infile = s[0]
        # Output name is the input file name up to its first dot, plus ".json".
        outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json")

        # The parser parses files sequentially; when the target file changes
        # we are done with the previous one, so write it out.
        if outfile != current_outfile:
            if current_outfile is not None:
                write_buffer_to_file(current_outfile, outfile_buffer)
            current_outfile = outfile
            outfile_buffer = []

        outfile_buffer.append(s[1])

    # Flush the last file; skipped entirely when the generator was empty,
    # because there is then no output file to write to.
    if current_outfile is not None:
        write_buffer_to_file(current_outfile, outfile_buffer)


def data_to_valdb(sentence_generator, dbaddr, username, password, collection_name):
    """Upsert every sentence into a collection of the ``valdb`` database.

    Args:
        sentence_generator: Iterable of indexable items; element ``[1]`` is
            the sentence document and must contain a ``"sid"`` key.
        dbaddr: MongoDB address, appended to ``mongodb://``.
        username: Database user name.
        password: Database password.
        collection_name: Name of the target collection inside ``valdb``.

    Raises:
        KeyError: If a sentence document has no ``"sid"`` key.
    """
    logger.info("Connecting to: %s", dbaddr)
    client = MongoClient(
        "mongodb://{}".format(dbaddr),
        username=username,
        password=password,
        authSource="valdb",
        authMechanism="SCRAM-SHA-256",
    )
    try:
        valdb = client.valdb
        logger.info("Writing data to %s.", collection_name)
        col = valdb[collection_name]
        for s in sentence_generator:
            sdata = s[1]
            # Upsert keyed on "sid": an existing document with the same sid
            # is replaced instead of inserting a duplicate. replace_one is
            # the supported equivalent of the removed Collection.update for
            # whole-document replacement.
            col.replace_one({"sid": sdata["sid"]}, sdata, upsert=True)
    finally:
        client.close()


def main():
    """Parse the command line, then parse and export ssj500k and Kres."""
    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
    parser.add_argument('--kres-folder', required=True)
    parser.add_argument('--kres-srl-folder', required=True)
    parser.add_argument('--ssj-file', required=True)
    parser.add_argument('--output', required=False, default=None)
    parser.add_argument('--outdir', required=False, default=None)
    parser.add_argument('--dbaddr', required=False, default=None)
    parser.add_argument('--dbuser', required=False, default=None)
    parser.add_argument('--dbpass', required=False, default=None)
    args = parser.parse_args()

    # Fail early with a usage error instead of crashing after parsing starts.
    if args.output == "file" and args.outdir is None:
        parser.error("--outdir is required when --output is 'file'")
    if args.output == "db" and args.dbaddr is None:
        parser.error("--dbaddr is required when --output is 'db'")

    # parse ssj
    logger.info("Parsing ssj500k: %s", args.ssj_file)
    ssj_parser = Parser(
        corpus="ssj",
        infiles=[args.ssj_file],
    )

    # ssj to json / db
    if args.output == "file":
        ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir)
    elif args.output == "db":
        data_to_valdb(
            ssj_parser.sentence_generator(),
            args.dbaddr,
            args.dbuser,
            args.dbpass,
            collection_name="ssj",
        )

    # parse kres
    logger.info("Parsing Kres: %s", args.kres_folder)
    kres_parser = Parser(
        corpus="kres",
        infiles=[args.kres_folder, args.kres_srl_folder],
    )

    # kres to json / db
    if args.output == "file":
        kres_to_json_files(kres_parser.sentence_generator(), args.outdir)
    elif args.output == "db":
        data_to_valdb(
            kres_parser.sentence_generator(),
            args.dbaddr,
            args.dbuser,
            args.dbpass,
            collection_name="kres",
        )


if __name__ == "__main__":
    main()