voje 2 years ago
parent
commit
01adf47b9b
  1. 18
      corpusparser/main.py

18
corpusparser/main.py

@ -21,7 +21,7 @@ lfh.setFormatter(formatter)
logger.addHandler(lfh)
logger.setLevel(logging.INFO)
n_chunks = -1 # for logging
n_chunks = -1
def enriched_lemma(token):
@ -69,7 +69,11 @@ def _handle_kres_file_chunk(kres_file_chunk):
)
# dbclient.valdb["kres"]
for kres_file in kres_chunk:
kres_data = kres_parser.parse_xml_file(kres_file)
try:
kres_data = kres_parser.parse_xml_file(kres_file)
except:
logger.error("Failed to parse file: {}".format(kres_file))
continue
if args.output == "file":
kres_outdir = outdir / "kres_json"
kres_outdir.mkdir(parents=True, exist_ok=True)
@ -78,17 +82,23 @@ def _handle_kres_file_chunk(kres_file_chunk):
json.dump(kres_data, fp)
elif args.output == "db":
"""
if dbclient.valdb["kres"].find({"sid": kres_data[0]["sid"]}).count() > 0:
logger.info("File {} already in DB, closing chunk ({}/{})".format(
kres_file, kres_chunk_idx, n_chunks))
dbclient.close()
return
"""
kres_data_1 = [_db_preprocess(x) for x in kres_data]
db_payload += kres_data_1
dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index)
logger.info("Inserted kres files chunk ({}/{}) in {:.2f} s".format(
try:
dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index)
except:
logger.error("Failed inserting kres files chunk ({}/{}) in {:.2f} s".format(
kres_chunk_idx, n_chunks, time.time() - tstart))
logger.info("Db insert: chunks ({}/{}) in {:.2f} s".format(
kres_chunk_idx, n_chunks, time.time() - tstart))
dbclient.close()

Loading…
Cancel
Save