a
This commit is contained in:
parent
2582314c4d
commit
01adf47b9b
|
@ -21,7 +21,7 @@ lfh.setFormatter(formatter)
|
|||
logger.addHandler(lfh)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
n_chunks = -1 # for logging
|
||||
n_chunks = -1
|
||||
|
||||
|
||||
def enriched_lemma(token):
|
||||
|
@ -69,7 +69,11 @@ def _handle_kres_file_chunk(kres_file_chunk):
|
|||
)
|
||||
# dbclient.valdb["kres"]
|
||||
for kres_file in kres_chunk:
|
||||
try:
|
||||
kres_data = kres_parser.parse_xml_file(kres_file)
|
||||
except:
|
||||
logger.error("Failed to parse file: {}".format(kres_file))
|
||||
continue
|
||||
if args.output == "file":
|
||||
kres_outdir = outdir / "kres_json"
|
||||
kres_outdir.mkdir(parents=True, exist_ok=True)
|
||||
|
@ -78,17 +82,23 @@ def _handle_kres_file_chunk(kres_file_chunk):
|
|||
json.dump(kres_data, fp)
|
||||
|
||||
elif args.output == "db":
|
||||
"""
|
||||
if dbclient.valdb["kres"].find({"sid": kres_data[0]["sid"]}).count() > 0:
|
||||
logger.info("File {} already in DB, closing chunk ({}/{})".format(
|
||||
kres_file, kres_chunk_idx, n_chunks))
|
||||
dbclient.close()
|
||||
return
|
||||
"""
|
||||
|
||||
kres_data_1 = [_db_preprocess(x) for x in kres_data]
|
||||
db_payload += kres_data_1
|
||||
|
||||
try:
|
||||
dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index)
|
||||
logger.info("Inserted kres files chunk ({}/{}) in {:.2f} s".format(
|
||||
except:
|
||||
logger.error("Failed inserting kres files chunk ({}/{}) in {:.2f} s".format(
|
||||
kres_chunk_idx, n_chunks, time.time() - tstart))
|
||||
logger.info("Db insert: chunks ({}/{}) in {:.2f} s".format(
|
||||
kres_chunk_idx, n_chunks, time.time() - tstart))
|
||||
dbclient.close()
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user