cjvt-srl-tagging/tools/gen_tei.py

48 lines
1.2 KiB
Python
Raw Permalink Normal View History

# parse config
import configparser
import json
import logging
import os
from pathlib import Path
from tools.parser.parser import Parser
# --- Configuration ---------------------------------------------------------
# All paths and runtime options are read from the [tools] section of the
# config file, which is looked up relative to the current working directory.
# Note: ConfigParser.read() silently skips a missing file; in that case the
# lookup of the "tools" section below fails with KeyError.
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
_tools = config["tools"]

ORIGPATH = Path(_tools["ssj500k_orig_folder"])
JSONPATH = Path(_tools["ssj500k_json"]) / "ssj500k-sl.body.json"
OUTPATH = Path(_tools["ssj500k_tei"])
INTERNAL_DATA = Path(_tools["internal_data"])
# Only the exact string "True" enables debug mode.
DEBUG = _tools["debug"] == "True"
CPU_CORES = int(_tools["cpu_cores"])

# --- Logging ---------------------------------------------------------------
# Make sure the log file exists before handing it to the logging module.
LOGFILE = Path(_tools["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# --- Shared state used by handle_file --------------------------------------
par = Parser()

# Create the output directory, including any missing parent directories.
OUTPATH.mkdir(parents=True, exist_ok=True)

# JSON is UTF-8 by specification; state the encoding explicitly so that
# decoding does not depend on the platform's default locale.
with open(JSONPATH, "r", encoding="utf-8") as jf:
    jsondata = json.load(jf)

logging.info("Generating TEI with annotated SRL.")
def handle_file(file, jsondata):
    """Parse one original TEI file and minimize it against the JSON data.

    Args:
        file: File name, or path relative to ``ORIGPATH``, of the TEI file
            to process.
        jsondata: Data loaded from the ssj500k JSON body; forwarded
            unchanged to ``par.minimize_tei``.

    Returns:
        None. NOTE(review): the value returned by ``par.minimize_tei`` is
        currently discarded and nothing is written under ``OUTPATH`` --
        this function looks unfinished; confirm the intended output step.
    """
    teifile = ORIGPATH / file

    # The parse result is not used here. The call is kept because it may
    # have side effects or validate the input -- TODO confirm and drop it
    # if parse_tei is pure.
    par.parse_tei(teifile)

    par.minimize_tei(teifile, jsondata)
origfiles = []  # NOTE(review): never populated or read in the visible code.

# Walk the whole original-corpus tree. handle_file joins its argument onto
# ORIGPATH, so pass the path relative to ORIGPATH rather than the bare file
# name; otherwise files located in subdirectories would resolve to paths
# directly under ORIGPATH that do not exist. For files at the top level the
# relative directory is "." and the resulting path equals the file name.
for subdir, _dirs, files in os.walk(ORIGPATH):
    rel_dir = Path(subdir).relative_to(ORIGPATH)
    for file in files:
        handle_file(rel_dir / file, jsondata)