You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
cjvt-srl-tagging/tools/parse_all.py

62 lines
1.5 KiB

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
logging.info("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
logging.info("Processing file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
with Pool(3) as p:
p.map(handle_file, infiles)
logging.info("end parsing kres")