cjvt-srl-tagging/tools/parse_all.py

75 lines
1.9 KiB
Python
Raw Permalink Normal View History

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
2019-02-27 08:15:40 +00:00
import configparser
2019-02-28 09:15:14 +00:00
import logging
2019-02-28 09:53:27 +00:00
from multiprocessing import Pool
2019-02-27 08:15:40 +00:00
SSJ500K_2_1 = 27829  # number of sentences (presumably in ssj500k 2.1 -- confirm)

# Shared parser instance used by handle_file.
par = Parser()

# Paths and settings come from tools.cfg, looked up relative to the current
# working directory.
config = configparser.ConfigParser()
if not config.read("tools.cfg"):
    # ConfigParser.read() silently skips unreadable files; fail loudly here
    # instead of with an opaque KeyError on the first lookup below.
    raise FileNotFoundError("Could not read config file: tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])

# Make sure the log file (and its directory) exists before resolving the path
# and handing it to the logging module.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.parent.mkdir(parents=True, exist_ok=True)
LOGFILE.touch(exist_ok=True)
LOGFILE = LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled: parsing of the ssj500k sample file.
# print("parsing ssj")
# ssj_file = "../data/ssj500k-sl.sample.xml"
# ssj_dict = par.parse_tei(ssj_file)
# # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
# print("end parsing ssj")

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(parents=True, exist_ok=True)

# (index, path) pairs for every regular file in INDIR. Sorted so that the
# progress numbers in the log are stable across runs.
infiles = list(enumerate(sorted(x for x in INDIR.iterdir() if x.is_file())))
logging.info("Parsing kres: {} files.".format(len(infiles)))
2019-02-28 09:53:27 +00:00
def handle_file(infile):
    """Convert one kres input file into a ``.tsv`` file in OUTDIR.

    The file is parsed with ``par.parse_tei`` and every resulting sentence is
    rendered with ``par.to_conll_2009_SRL``; the concatenated result is
    written UTF-8 encoded. Files whose output already exists are skipped.

    Args:
        infile: ``(index, path)`` pair, as produced by ``enumerate`` over the
            input files. ``index`` is only used for progress logging.

    Returns:
        True if the output already existed or was written successfully,
        False if parsing or conversion raised an exception.
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception as exc:
        # Best effort: one bad file must not abort the whole batch.
        logging.info("Failed processing file: {}".format(str(kres_file)))
        logging.exception(exc)  # logged at ERROR level, with traceback
        return False

    # Write to a temporary sibling and rename, so an interrupted run never
    # leaves a truncated .tsv that the "skip existing" check would accept.
    tmpfile = outfile.with_name(outfile.name + ".tmp")
    with tmpfile.open("wb") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    tmpfile.replace(outfile)

    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True
2019-02-28 09:53:27 +00:00
2019-02-28 12:57:27 +00:00
# The guard is required for multiprocessing start methods that re-import this
# module in every worker ("spawn"); without it each worker would try to start
# its own pool.
if __name__ == "__main__":
    # Process all input files in parallel; map() blocks until every file is
    # handled and returns one True/False result per file.
    with Pool(CPU_CORES) as p:
        results = p.map(handle_file, infiles)

    failed = results.count(False)
    if failed:
        logging.warning("Failed files: {}/{}".format(failed, len(results)))
    logging.info("end parsing kres")
2019-02-28 09:53:27 +00:00