75 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			75 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from parser.parser import Parser
 | |
| import os
 | |
| from os.path import join, dirname
 | |
| from pathlib import Path
 | |
| import re
 | |
| import sys
 | |
| import cProfile
 | |
| import configparser
 | |
| import logging
 | |
| from multiprocessing import Pool
 | |
| 
 | |
SSJ500K_2_1 = 27829  # number of sentences
par = Parser()

# Paths and the worker count are read from the [tools] section of
# "tools.cfg", looked up relative to the current working directory.
config = configparser.ConfigParser()
if not config.read("tools.cfg"):
    # ConfigParser.read() silently skips files it cannot open; fail here with
    # a clear message instead of a KeyError on the first lookup below.
    raise FileNotFoundError("Could not read config file: tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# resolve() returns a new path; keep the result (it was previously discarded).
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled snippet, kept for reference: parse the ssj500k sample file.
# print("parsing ssj")
# ssj_file = "../data/ssj500k-sl.sample.xml"
# ssj_dict = par.parse_tei(ssj_file)
# # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
# print("end parsing ssj")

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# parents=True so a missing intermediate directory does not abort the run.
OUTDIR.mkdir(parents=True, exist_ok=True)

# (index, path) pairs for every regular file directly inside INDIR; the index
# is only used for progress reporting in the log.
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
 | |
| 
 | |
def handle_file(infile):
    """Convert one input file to a ``.tsv`` file in OUTDIR.

    The file is parsed with ``par.parse_tei`` and every parsed sentence is
    rendered with ``par.to_conll_2009_SRL``; the concatenated result is
    written UTF-8 encoded. Files whose output already exists are skipped.

    Args:
        infile: ``(index, path)`` pair; ``index`` is the zero-based position
            of the file in ``infiles`` and is only used for the progress log.

    Returns:
        True if the output already existed or was written, False if parsing
        or conversion raised (the failure is logged with its traceback).

    Raises:
        OSError: if the output file cannot be written.
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        # join() instead of repeated "+=" keeps this linear in output size.
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception:
        # Best-effort batch job: record the failure (with traceback) and let
        # the remaining files be processed.
        logging.exception("Failed processing file: {}".format(str(kres_file)))
        return False

    # Write to a temporary sibling and rename it into place, so an interrupted
    # write never leaves a truncated .tsv that later runs would skip as
    # "existing".
    tmpfile = outfile.with_name(outfile.name + ".tmp")
    try:
        tmpfile.write_bytes(kres_out_str.encode("utf-8"))
        tmpfile.replace(outfile)
    except OSError:
        try:
            tmpfile.unlink()
        except FileNotFoundError:
            pass
        raise

    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True
 | |
| 
 | |
# Entry-point guard: with the "spawn" start method each worker re-imports this
# module, and an unguarded Pool at import time would try to start new pools
# from inside the workers.
if __name__ == "__main__":
    # Fan the (index, path) pairs out over CPU_CORES worker processes.
    with Pool(CPU_CORES) as p:
        results = p.map(handle_file, infiles)

    # handle_file returns False for files it could not process; summarise
    # them so failures are visible without scanning the whole log.
    failed = results.count(False)
    if failed:
        logging.warning(
            "Failed to process {} of {} files.".format(failed, len(results))
        )

    logging.info("end parsing kres")
 | |
| 
 | |
| 
 |