You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
cjvt-srl-tagging/tools/parse_all.py

75 lines
1.9 KiB

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("end parsing kres")