# Source: cjvt-srl-tagging/tools/parse_all.py (repository web view export; 75 lines, 1.9 KiB, Python)

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
2019-02-27 08:15:40 +00:00
import configparser
2019-02-28 09:15:14 +00:00
import logging
2019-02-28 09:53:27 +00:00
from multiprocessing import Pool
2019-02-27 08:15:40 +00:00
SSJ500K_2_1 = 27829  # number of sentences

# Shared parser instance; handle_file() uses it through this module-level name.
par = Parser()

# Paths and settings are read from the [tools] section of tools.cfg, which is
# looked up relative to the current working directory.
config = configparser.ConfigParser()
if not config.read("tools.cfg"):
    # ConfigParser.read() silently skips unreadable files; fail with a clear
    # message here instead of an opaque KeyError on the first lookup below.
    raise FileNotFoundError("Could not read configuration file: tools.cfg")

INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])

# Make sure the log file (and its directory) exists before logging is set up.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.parent.mkdir(parents=True, exist_ok=True)
LOGFILE.touch(exist_ok=True)
# resolve() returns a new path; keep it rather than discarding the result.
LOGFILE = LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled: parsing of the ssj500k sample (kept for reference).
# print("parsing ssj")
# ssj_file = "../data/ssj500k-sl.sample.xml"
# ssj_dict = par.parse_tei(ssj_file)
# # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
# print("end parsing ssj")

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(parents=True, exist_ok=True)

# Work items are (index, path) pairs; the index is only used for progress
# messages in handle_file().
infiles = list(enumerate(x for x in INDIR.iterdir() if x.is_file()))
logging.info("Parsing kres: {} files.".format(len(infiles)))
2019-02-28 09:53:27 +00:00
def handle_file(infile):
    """Parse one input file and write the converted sentences to OUTDIR.

    Args:
        infile: An ``(index, path)`` pair as produced by ``enumerate`` over
            the input files; ``index`` is used only in the progress message.

    Returns:
        True if the output file already existed (the input is skipped) or
        was written successfully; False if parsing or conversion raised.

    Errors raised while writing the output file are not caught and
    propagate to the caller.
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    # An existing output file marks this input as already processed.
    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        # Only the sentences are needed, not their keys; join once instead
        # of growing a string with repeated +=.
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception:
        # Log at ERROR level with the traceback so the failing file and the
        # cause end up in the same record.
        logging.exception("Failed processing file: {}".format(str(kres_file)))
        return False

    # Write to a temporary name and rename afterwards: an interrupted write
    # must not leave a partial .tsv that a later run would skip as "done".
    tmp_outfile = outfile.with_name(outfile.name + ".tmp")
    with tmp_outfile.open("wb") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    tmp_outfile.replace(outfile)

    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True
2019-02-28 09:53:27 +00:00
2019-02-28 12:57:27 +00:00
# Only start the worker pool when run as a script. With the "spawn" start
# method each worker re-imports this module, and without the guard every
# worker would try to create its own pool.
if __name__ == "__main__":
    with Pool(CPU_CORES) as p:
        results = p.map(handle_file, infiles)

    # handle_file() returns False for inputs that could not be processed;
    # summarize them instead of discarding the results.
    failed = sum(1 for ok in results if not ok)
    if failed:
        logging.warning("Failed to process %d of %d files.", failed, len(results))

    logging.info("end parsing kres")
2019-02-28 09:53:27 +00:00