Replace print() progress output with the logging module in the tools scripts.

Both scripts read a new "logfile" key from tools.cfg and send their progress
messages to that file instead of stdout.

Review fixes applied to the added lines of this patch:
  * gen_json.py: the log level follows the "debug" config flag, so the
    per-token logging.debug() output is still available when debug = True.
  * gen_json.py: "SRL relations written to" uses a %s placeholder; the
    original call passed the path as a stray formatting argument.
  * gen_json.py: ".josn" typo in the start message corrected to ".json".
  * tools.cfg: the logfile value is unquoted; configparser keeps quotes
    as part of the value.

NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Indentation of context lines and the position of blank context lines were
inferred from the hunk line counts; verify against the target tree before
applying. The index lines are those of the original patch.

diff --git a/tools/gen_json.py b/tools/gen_json.py
index 4c489a3..5475f67 100644
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@@ -3,12 +3,7 @@ from parser.parser import Parser
 import configparser
 import json
 import sys
-
-# defaults
-ORIGPATH = Path("../data/kres_example") # we need the IDs
-INPATH = Path("../data/kres_example_srl")
-OUTPATH = Path("../data/kres_example_json")
-DEBUG = False
+import logging
 
 # parse config
 config = configparser.ConfigParser()
@@ -16,8 +11,11 @@ config.read("tools.cfg")
 ORIGPATH = Path(config["tools"]["kres_orig"])
 INPATH = Path(config["tools"]["kres_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
+LOGFILE = Path(config["tools"]["logfile"])
 DEBUG = config["tools"]["debug"] == "True"
 
+logging.basicConfig(filename=LOGFILE, level=logging.DEBUG if DEBUG else logging.INFO)
+
 def get_origfile(filename):
     for origfile in ORIGPATH.iterdir():
         if filename.name.split('.')[0] == origfile.name.split('.')[0]:
@@ -46,8 +44,7 @@ def match_sentence_id(sentence, orig_dict):
     raise KeyError
 
 def get_dep_rel(token):
-    if DEBUG:
-        print(token)
+    logging.debug(token)
     for i, field in enumerate(token[14:]):
         if field != "_":
             return {
@@ -61,7 +58,7 @@ def get_dep_rel(token):
 par = Parser()
 OUTPATH.mkdir(exist_ok=True)
 
-print("Start generating .josn files.")
+logging.info("Start generating .json files.")
 for infile in [x for x in INPATH.iterdir() if x.is_file()]:
     origfile = get_origfile(infile)
     orig_dict = par.parse_tei(origfile)
@@ -99,5 +96,5 @@ for infile in [x for x in INPATH.iterdir() if x.is_file()]:
     outfile = (OUTPATH / infile.name).with_suffix(".json")
     with outfile.open("w") as fp:
         json.dump(outdata, fp)
-    print("SRL relations written to: ", outfile)
-print("Finished generating .json files.")
\ No newline at end of file
+    logging.info("SRL relations written to: %s", outfile)
+logging.info("Finished generating .json files.")
diff --git a/tools/parse_all.py b/tools/parse_all.py
index 6b76782..1218b67 100644
--- a/tools/parse_all.py
+++ b/tools/parse_all.py
@@ -6,10 +6,7 @@ import re
 import sys
 import cProfile
 import configparser
-
-# some defaults
-INDIR = Path("../data/kres_example")
-OUTDIR = Path("../data/kres_example_tsv")
+import logging
 
 SSJ500K_2_1 = 27829 # number of sentences
 par = Parser()
@@ -19,6 +16,9 @@ config = configparser.ConfigParser()
 config.read("tools.cfg")
 INDIR = Path(config["tools"]["kres_orig"])
 OUTDIR = Path(config["tools"]["kres_tsv"])
+LOGFILE = Path(config["tools"]["logfile"])
+
+logging.basicConfig(filename=LOGFILE, level=logging.INFO)
 
 """
 print("parsing ssj")
@@ -28,14 +28,14 @@ ssj_dict = par.parse_tei(ssj_file)
 print("end parsing ssj")
 """
 
-print("parsing kres")
+logging.info("parsing kres")
 # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
 
 OUTDIR.mkdir(exist_ok=True)
 
 infiles = [x for x in INDIR.iterdir() if x.is_file()]
 for i, kres_file in enumerate(infiles):
-    print("Processing file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
+    logging.info("Processing file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
     res_dict = par.parse_tei(kres_file)
     kres_out_str = ""
 
@@ -44,4 +44,4 @@ for i, kres_file in enumerate(infiles):
 
     with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
         fp.write(kres_out_str.encode("utf-8"))
-print("end parsing kres")
+logging.info("end parsing kres")
diff --git a/tools/tools.cfg b/tools/tools.cfg
index b32cb78..d684723 100644
--- a/tools/tools.cfg
+++ b/tools/tools.cfg
@@ -3,4 +3,5 @@ kres_orig = /kres_mount/kres_parsed/tei
 kres_tsv = ../data/kres_out/1_tsv
 kres_srl = ../data/kres_out/_srl
 kres_json = ../data/kres_out/final_json
+logfile = ../progress.log
 debug = False