"""Parse the ssj500k and Kres corpora and export the parsed sentences.

The ssj500k file is parsed in the main process; the Kres folder is parsed
file-by-file in a ``multiprocessing.Pool``. Depending on ``--output`` the
parsed data is written as JSON below ``--outdir`` ("file"), upserted into
the MongoDB database ``valdb`` ("db"), or parsed without being stored
(any other value, including the default ``None``).
"""
import argparse
import json
import logging
import sys
from multiprocessing import Pool
from pathlib import Path

from corpusparser import Parser
from pymongo import MongoClient

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Per-process state read by _handle_kres_file_tpl. It is filled in by
# _init_kres_worker inside every pool worker, so the handler does not depend
# on globals inherited from the parent (which only exist with the "fork"
# start method).
n_kres_files = -1  # for logging
args = None
outdir = None
valdb = None
kres_parser = None


def _connect_valdb(cli_args):
    """Open a MongoDB connection and return the ``valdb`` database handle.

    Args:
        cli_args: Parsed command line namespace; ``dbaddr``, ``dbuser`` and
            ``dbpass`` are used for the connection.
    """
    dbclient = MongoClient(
        "mongodb://{}".format(cli_args.dbaddr),
        username=cli_args.dbuser,
        password=cli_args.dbpass,
        authSource="valdb",
        authMechanism="SCRAM-SHA-1",
    )
    return dbclient.valdb


def _upsert_sentences(collection, sentences):
    """Replace-or-insert every sentence in *collection*, keyed by its "sid".

    ``replace_one(..., upsert=True)`` is the supported equivalent of the
    legacy ``Collection.update(spec, document, upsert=True)`` call, which was
    deprecated in PyMongo 3 and removed in PyMongo 4.
    """
    for sentence in sentences:
        collection.replace_one({"sid": sentence["sid"]}, sentence, upsert=True)


def _init_kres_worker(cli_args, total_files):
    """Set up the module-level state of one pool worker process.

    Runs once per worker. Every worker builds its own Kres parser and, for
    ``--output db``, its own MongoDB connection: PyMongo clients must not be
    shared across a fork, and with the "spawn" start method nothing created
    in the parent's ``__main__`` block exists in the worker at all.

    Args:
        cli_args: Parsed command line namespace from the parent process.
        total_files: Number of Kres files, used only for progress logging.
    """
    global args, outdir, valdb, kres_parser, n_kres_files
    args = cli_args
    n_kres_files = total_files
    outdir = Path(cli_args.outdir) if cli_args.output == "file" else None
    valdb = _connect_valdb(cli_args) if cli_args.output == "db" else None
    kres_parser = Parser(
        corpus="kres",
        kres_srl_folder=cli_args.kres_srl_folder
    )


# handler for concurrency
def _handle_kres_file_tpl(kres_file_tpl):
    """Parse one Kres file and store the result according to ``args.output``.

    Args:
        kres_file_tpl: ``(index, path)`` tuple; the index is only used for
            the progress log line.
    """
    kres_file_idx, kres_file = kres_file_tpl
    logger.info("Handling %s (%s/%s)", kres_file, kres_file_idx, n_kres_files)
    kres_data = kres_parser.parse_xml_file(kres_file)
    if args.output == "file":
        kres_outdir = outdir / "kres_json"
        kres_outdir.mkdir(parents=True, exist_ok=True)
        # Output name is the input name up to its first dot, plus ".json".
        kres_outfile = kres_outdir / Path(kres_file.name.split(".")[0]).with_suffix(".json")
        with kres_outfile.open("w") as fp:
            json.dump(kres_data, fp)
    elif args.output == "db":
        _upsert_sentences(valdb["kres"], kres_data)


def _parse_args():
    """Parse and validate the command line arguments."""
    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
    parser.add_argument('--kres-folder', required=True)
    parser.add_argument('--kres-srl-folder', required=True)
    parser.add_argument('--ssj-file', required=True)
    parser.add_argument('--output', required=False, default=None)
    parser.add_argument('--outdir', required=False, default=None)
    parser.add_argument('--dbaddr', required=False, default=None)
    parser.add_argument('--dbuser', required=False, default=None)
    parser.add_argument('--dbpass', required=False, default=None)
    parser.add_argument('--cores', required=False, default=1, type=int)
    cli_args = parser.parse_args()

    # Fail early with a usage message instead of a TypeError from Path(None)
    # or a connection attempt to "mongodb://None".
    if cli_args.output == "file" and cli_args.outdir is None:
        parser.error("--outdir is required with --output file")
    if cli_args.output == "db" and cli_args.dbaddr is None:
        parser.error("--dbaddr is required with --output db")
    return cli_args


def main():
    """Parse ssj500k in this process, then the Kres folder in a worker pool."""
    cli_args = _parse_args()

    out_root = None
    db = None
    if cli_args.output == "file":
        out_root = Path(cli_args.outdir)
        out_root.mkdir(parents=True, exist_ok=True)
    elif cli_args.output == "db":
        db = _connect_valdb(cli_args)

    # SSJ
    logger.info("Parsing Ssj: %s", cli_args.ssj_file)
    ssj_parser = Parser(corpus="ssj")
    ssj_data = ssj_parser.parse_xml_file(Path(cli_args.ssj_file))
    if cli_args.output == "file":
        ssj_outfile = out_root / "ssj500k.json"
        with ssj_outfile.open("w") as fp:
            json.dump(ssj_data, fp)
    elif cli_args.output == "db":
        _upsert_sentences(db["ssj"], ssj_data)
        # Workers open their own connections; close the parent's client
        # before the pool is created so no live sockets are inherited.
        db.client.close()

    # Kres
    logger.info("Parsing Kres: %s", cli_args.kres_folder)
    # [(idx, filepath)], 1-based and sorted so the progress log is readable
    # and deterministic.
    kres_files = list(enumerate(sorted(Path(cli_args.kres_folder).iterdir()), start=1))
    with Pool(
        cli_args.cores,
        initializer=_init_kres_worker,
        initargs=(cli_args, len(kres_files)),
    ) as pool:
        pool.map(_handle_kres_file_tpl, kres_files)

    logger.info("Finished parsing.")


if __name__ == "__main__":
    main()