from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile


def main():
    """Convert Kres TEI XML files to CoNLL-2009 TSV files.

    Reads every file in ``../data/kres_example/``, parses it with the
    project ``Parser``, and writes two outputs per input file into
    sibling directories:

    * ``<dir>_out/<name>.tsv``       -- SRL-oriented CoNLL-2009 rows
    * ``<dir>_full_out/<name>.tsv``  -- full-tokenization CoNLL-2009 rows

    Write failures are reported to stderr but do not abort the batch.
    """
    # NOTE: make sure every input is sanitized into unicode before parsing.
    SSJ500K_2_1 = 27829  # expected number of sentences in ssj500k 2.1

    par = Parser()

    # SSJ parsing is currently disabled; only the Kres corpus is processed.
    print("parsing kres")
    kres_dir = Path("../data/kres_example/").resolve()

    # Output dirs live next to the input dir: <name>_out and <name>_full_out.
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)
    kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
    kres_full_out_dir.mkdir(exist_ok=True)

    for kres_file in (x for x in kres_dir.iterdir() if x.is_file()):
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)

        # Longest sentence drives column padding in the SRL output.
        # default=0 keeps an empty parse from raising ValueError.
        longest_sent = max(
            (len(e["tokens"]) for e in res_dict.values()), default=0
        )
        print("Longest sentence: ", longest_sent)

        # Collect rows in lists and join once -- avoids O(n^2) string +=.
        srl_rows = []
        full_rows = []
        for sentence in res_dict.values():
            srl_rows.append(par.to_conll_2009_SRL(sentence, longest_sent))
            full_rows.append(par.to_conll_2009_full(sentence))

        # for SRL tagging
        _write_utf8(
            (kres_out_dir / kres_file.name).with_suffix(".tsv"),
            "".join(srl_rows),
        )
        # for full tokenization
        _write_utf8(
            (kres_full_out_dir / kres_file.name).with_suffix(".tsv"),
            "".join(full_rows),
        )

    print("end parsing kres")


def _write_utf8(path, text):
    """Write *text* to *path* as UTF-8; report I/O errors instead of crashing.

    Keeps the original best-effort behaviour (one bad file does not stop
    the batch) but no longer hides the failure behind a bare ``except``.
    """
    try:
        path.write_bytes(text.encode("utf-8"))
    except OSError as err:
        print("Failed to write " + str(path) + ": " + str(err), file=sys.stderr)


if __name__ == "__main__":
    # cProfile.run("main()", sort="tottime")
    main()