diff --git a/corpusparser/Parser.py b/corpusparser/Parser.py index d135256..32acb66 100644 --- a/corpusparser/Parser.py +++ b/corpusparser/Parser.py @@ -5,7 +5,7 @@ from lxml import etree import logging import time -logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.INFO) # Read input file(.xml, .json; kres or ssj500k). # Create an iterator that outputs resulting sentences (python dict format). diff --git a/corpusparser/main.py b/corpusparser/main.py index 38e8156..6e70ec8 100644 --- a/corpusparser/main.py +++ b/corpusparser/main.py @@ -9,9 +9,16 @@ import sys from multiprocessing import Pool import time -logging.basicConfig(level=logging.INFO) +# logging.basicConfig(filename=Path("/var/tmp/corpusparser.log"), filemode='a', level=logging.INFO) logger = logging.getLogger(__name__) +# lfh = logging.FileHandler("/var/tmp/fill-database.log") +lfh = logging.StreamHandler(sys.stdout) +formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") +lfh.setFormatter(formatter) +logger.addHandler(lfh) +logger.setLevel(logging.INFO) + n_kres_files = -1 # for logging @@ -73,13 +80,13 @@ def _handle_kres_file_tpl(kres_file_tpl): # skip if one of the sentences is already in DB if kres_col.find({"sid": kres_data[0]["sid"]}).count() > 0: - logging.info("File {} already in DB ({}/{})".format( + logger.info("File {} already in DB ({}/{})".format( kres_file, kres_file_idx, n_kres_files)) return kres_data_1 = [_db_preprocess(x) for x in kres_data] kres_col.insert_many(kres_data_1) # much much better (just make sure sid has a unique index) - logging.info("Inserted data from {} ({}/{}) in {:.2f} s".format( + logger.info("Inserted data from {} ({}/{}) in {:.2f} s".format( kres_file, kres_file_idx, n_kres_files, time.time() - tstart)) def _get_dbclient(args): @@ -119,7 +126,7 @@ if __name__ == "__main__": # SSJ logger.info("Parsing Ssj: {}".format(args.ssj_file)) - ssj_parser = Parser(corpus="ssj") + ssj_parser = Parser(logger=logger, corpus="ssj") ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) if args.output == "file": ssj_outfile = outdir / "ssj500k.json" @@ -137,6 +144,7 @@ if __name__ == "__main__": # Kres logger.info("Parsing Kres: {}".format(args.kres_folder)) kres_parser = Parser( + logger=logger, corpus="kres", kres_srl_folder=args.kres_srl_folder )