diff --git a/corpusparser/main.py b/corpusparser/main.py index 1bcafd4..7fa68b6 100644 --- a/corpusparser/main.py +++ b/corpusparser/main.py @@ -21,7 +21,7 @@ lfh.setFormatter(formatter) logger.addHandler(lfh) logger.setLevel(logging.INFO) -n_chunks = -1 # for logging +n_chunks = -1 def enriched_lemma(token): @@ -69,7 +69,11 @@ def _handle_kres_file_chunk(kres_file_chunk): ) # dbclient.valdb["kres"] for kres_file in kres_chunk: - kres_data = kres_parser.parse_xml_file(kres_file) + try: + kres_data = kres_parser.parse_xml_file(kres_file) + except: + logger.error("Failed to parse file: {}".format(kres_file)) + continue if args.output == "file": kres_outdir = outdir / "kres_json" kres_outdir.mkdir(parents=True, exist_ok=True) @@ -78,17 +82,23 @@ def _handle_kres_file_chunk(kres_file_chunk): json.dump(kres_data, fp) elif args.output == "db": + """ if dbclient.valdb["kres"].find({"sid": kres_data[0]["sid"]}).count() > 0: logger.info("File {} already in DB, closing chunk ({}/{})".format( kres_file, kres_chunk_idx, n_chunks)) dbclient.close() return + """ kres_data_1 = [_db_preprocess(x) for x in kres_data] db_payload += kres_data_1 - dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index) - logger.info("Inserted kres files chunk ({}/{}) in {:.2f} s".format( + try: + dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index) + except: + logger.error("Failed inserting kres files chunk ({}/{}) in {:.2f} s".format( + kres_chunk_idx, n_chunks, time.time() - tstart)) + logger.info("Db insert: chunks ({}/{}) in {:.2f} s".format( kres_chunk_idx, n_chunks, time.time() - tstart)) dbclient.close()