"""Convert parsed TEI XML files into CoNLL-2009 SRL formatted .tsv files."""

from parser.parser import Parser
import os
from os.path import join
import re
import sys
import cProfile

# Number of sentences (presumably in the full ssj500k 2.1 corpus -- TODO
# confirm). Only a sample file is parsed below, so the count is not asserted.
SSJ500K_2_1 = 27829

# Only parsed kres inputs are processed; the generated ".tsv" outputs are
# written into the same folder and must be skipped when listing it.
KRES_INPUT_RE = re.compile(r"^F.+\.xml\.parsed\.xml$")


def main():
    """Parse the ssj500k sample, then convert every kres file to a .tsv file.

    For each file in the kres example directory whose name matches
    ``KRES_INPUT_RE``, the file is parsed with ``Parser.parse_tei``, every
    parsed sentence is rendered with ``Parser.to_conll_2009_SRL`` and the
    concatenated result is written, UTF-8 encoded, to ``<input name>.tsv``
    in the same directory.

    NOTE: make sure you sanitize every input into unicode.
    """
    par = Parser()

    print("parsing ssj")
    ssj_file = "../data/ssj500k-sl.sample.xml"
    # The result is not used further here; the call is kept so the ssj
    # parsing step still runs (and still shows up in the profile).
    par.parse_tei(ssj_file)
    print("end parsing ssj")

    print("parsing kres")
    kres_dir = "../data/kres_example/"
    for kres_file in os.listdir(kres_dir):
        if KRES_INPUT_RE.match(kres_file) is None:
            continue
        print("Processing file: " + kres_file)
        res_dict = par.parse_tei(join(kres_dir, kres_file))
        # Join once instead of growing a string with += per sentence.
        out_text = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
        # Binary mode keeps the encoded bytes untouched (no newline
        # translation); the context manager closes the file.
        with open(join(kres_dir, kres_file + ".tsv"), "wb") as fp:
            fp.write(out_text.encode("utf-8"))
    print("end parsing kres")


if __name__ == "__main__":
    # Run under the profiler, reporting functions sorted by total time.
    cProfile.run("main()", sort="tottime")