From 00d9192993a2ccfa45237d7a3a6a851058d4ee09 Mon Sep 17 00:00:00 2001 From: voje Date: Sun, 21 Apr 2019 17:04:20 +0200 Subject: [PATCH] moved parse_ssj into a subprocess --- corpusparser/main.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/corpusparser/main.py b/corpusparser/main.py index 6e70ec8..6d40f9a 100644 --- a/corpusparser/main.py +++ b/corpusparser/main.py @@ -99,6 +99,27 @@ def _get_dbclient(args): ) return dbclient + +# wrap it in a function for better garbage collection +def parse_ssj(args): + logger.info("Parsing Ssj: {}".format(args.ssj_file)) + ssj_parser = Parser(logger=logger, corpus="ssj") + ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) + if args.output == "file": + ssj_outfile = outdir / "ssj500k.json" + with ssj_outfile.open("w") as fp: + json.dump(ssj_data, fp) + elif args.output == "db": + dbclient = _get_dbclient(args) + valdb = dbclient.valdb + ssj_col = valdb["ssj"] + for sentence in ssj_data: + sentence = _db_preprocess(sentence) + ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) + del ssj_parser + del ssj_data + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.") parser.add_argument('--kres-folder', required=True) @@ -124,22 +145,12 @@ if __name__ == "__main__": dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) + # SSJ - logger.info("Parsing Ssj: {}".format(args.ssj_file)) - ssj_parser = Parser(logger=logger, corpus="ssj") - ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) - if args.output == "file": - ssj_outfile = outdir / "ssj500k.json" - with ssj_outfile.open("w") as fp: - json.dump(ssj_data, fp) - elif args.output == "db": - dbclient = _get_dbclient(args) - valdb = dbclient.valdb - ssj_col = valdb["ssj"] - for sentence in ssj_data: - sentence = _db_preprocess(sentence) - ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) + p = Pool(1) + p.map(parse_ssj, [args]) + time.sleep(30) # Kres logger.info("Parsing Kres: {}".format(args.kres_folder))