diff --git a/corpusparser/main.py b/corpusparser/main.py index 6d40f9a..b1277ce 100644 --- a/corpusparser/main.py +++ b/corpusparser/main.py @@ -9,11 +9,13 @@ import sys from multiprocessing import Pool import time +CORPORA = ["kres", "ssj"] + # logging.basicConfig(filename=Path("/var/tmp/corpusparser.log"), filemode='a', level=logging.INFO) logger = logging.getLogger(__name__) -# lfh = logging.FileHandler("/var/tmp/fill-database.log") -lfh = logging.StreamHandler(sys.stdout) +lfh = logging.FileHandler("/project/logs/fill-database.log") +# lfh = logging.StreamHandler(sys.stdout) formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") lfh.setFormatter(formatter) logger.addHandler(lfh) @@ -100,31 +102,12 @@ def _get_dbclient(args): return dbclient -# wrap it in a function for better garbage collection -def parse_ssj(args): - logger.info("Parsing Ssj: {}".format(args.ssj_file)) - ssj_parser = Parser(logger=logger, corpus="ssj") - ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) - if args.output == "file": - ssj_outfile = outdir / "ssj500k.json" - with ssj_outfile.open("w") as fp: - json.dump(ssj_data, fp) - elif args.output == "db": - dbclient = _get_dbclient(args) - valdb = dbclient.valdb - ssj_col = valdb["ssj"] - for sentence in ssj_data: - sentence = _db_preprocess(sentence) - ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) - del ssj_parser - del ssj_data - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.") - parser.add_argument('--kres-folder', required=True) - parser.add_argument('--kres-srl-folder', required=True) - parser.add_argument('--ssj-file', required=True) + parser.add_argument('--corpus', required=True) + parser.add_argument('--kres-folder', required=False) + parser.add_argument('--kres-srl-folder', required=False) + parser.add_argument('--ssj-file', required=False) parser.add_argument('--output', required=False, default=None) parser.add_argument('--outdir', required=False, default=None) parser.add_argument('--dbaddr', required=False, default=None) @@ -133,39 +116,55 @@ if __name__ == "__main__": parser.add_argument('--cores', required=False, default=1) args = parser.parse_args() + + corpus = args.corpus + assert (corpus in CORPORA), "Wrong corpus name." + + outdir = None if args.output == "file": outdir = Path(args.outdir) outdir.mkdir(parents=True, exist_ok=True) elif args.output == "db": - # Force unique sid dbclient = _get_dbclient(args) - for corpus in ["kres", "ssj"]: - dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)]) - dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) - dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) - - - # SSJ - p = Pool(1) - p.map(parse_ssj, [args]) - - time.sleep(30) - - # Kres - logger.info("Parsing Kres: {}".format(args.kres_folder)) - kres_parser = Parser( - logger=logger, - corpus="kres", - kres_srl_folder=args.kres_srl_folder - ) + dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)]) + dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) + dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) + + + if corpus == "ssj": + logger.info("Parsing Ssj: {}".format(args.ssj_file)) + ssj_parser = Parser(logger=logger, corpus="ssj") + ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) + if args.output == "file": + ssj_outfile = outdir / "ssj500k.json" + with ssj_outfile.open("w") as fp: + json.dump(ssj_data, fp) + elif args.output == "db": + dbclient = _get_dbclient(args) + valdb = dbclient.valdb + ssj_col = valdb["ssj"] + for sentence in ssj_data: + sentence = _db_preprocess(sentence) + ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) + + + if corpus == "kres": + # Kres + logger.info("Parsing Kres: {}".format(args.kres_folder)) + kres_parser = Parser( + logger=logger, + corpus="kres", + kres_srl_folder=args.kres_srl_folder + ) - # [(idx, filepath)] - kres_files = [x for x in Path(args.kres_folder).iterdir()] - kres_files = [x for x in enumerate(kres_files)] - n_kres_files = len(kres_files) + # [(idx, filepath)] + kres_files = [x for x in Path(args.kres_folder).iterdir()] + kres_files = [x for x in enumerate(kres_files)] + n_kres_files = len(kres_files) - p = Pool(int(args.cores)) - p.map(_handle_kres_file_tpl, kres_files) + p = Pool(int(args.cores)) + p.map(_handle_kres_file_tpl, kres_files) + logger.info("Finished parsing.")