You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

69 lines
2.0 KiB

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
def _write_tsv(out_dir, source_name, text):
    """Write *text* as UTF-8 bytes to ``out_dir/<source_name>`` with a ``.tsv`` suffix.

    Only the last suffix of *source_name* is replaced (``a.xml.parsed.xml``
    becomes ``a.xml.parsed.tsv``).  Writing is best-effort: a failure is
    reported on stderr and ``False`` is returned instead of raising, so one
    bad file does not abort the batch.
    """
    out_path = (out_dir / source_name).with_suffix(".tsv")
    try:
        # Bytes, not text mode: keeps newlines exactly as produced.
        out_path.write_bytes(text.encode("utf-8"))
    except (OSError, UnicodeError) as err:
        print("Failed to write {}: {}".format(out_path, err), file=sys.stderr)
        return False
    return True


def main():
    """Parse every file in ``../data/kres_example/`` and write two TSV outputs.

    Each regular file in the directory is parsed with ``Parser.parse_tei``.
    For every parsed sentence the output of ``Parser.to_conll_2009_SRL`` is
    collected into ``<dir>_out/<name>.tsv`` (for SRL tagging) and the output
    of ``Parser.to_conll_2009_full`` into ``<dir>_full_out/<name>.tsv``
    (for full tokenization).  Both output directories are created next to
    the input directory if they do not exist yet.
    """
    # make sure you sanitize every input into unicode
    par = Parser()

    # ssj500k parsing is currently disabled; kept here for reference:
    #   SSJ500K_2_1 = 27829  # number of sentences
    #   print("parsing ssj")
    #   ssj_file = "../data/ssj500k-sl.sample.xml"
    #   ssj_dict = par.parse_tei(ssj_file)
    #   # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    #   print("end parsing ssj")

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = Path("../data/kres_example/").resolve()
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)
    kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
    kres_full_out_dir.mkdir(exist_ok=True)

    for kres_file in kres_dir.iterdir():
        if not kres_file.is_file():
            continue
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)
        sentences = list(res_dict.values())

        # default=0 keeps a file without sentences from crashing the batch.
        longest_sent = max((len(s["tokens"]) for s in sentences), default=0)
        print("Longest sentence: ", longest_sent)

        # Collect pieces and join once instead of quadratic "+=" growth.
        srl_parts = []
        full_parts = []
        for sentence in sentences:
            srl_parts.append(par.to_conll_2009_SRL(sentence, longest_sent))
            full_parts.append(par.to_conll_2009_full(sentence))

        # for SRL tagging
        _write_tsv(kres_out_dir, kres_file.name, "".join(srl_parts))
        # for full tokenization
        _write_tsv(kres_full_out_dir, kres_file.name, "".join(full_parts))

    print("end parsing kres")
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported.
# To profile instead, replace the call below with:
#     cProfile.run("main()", sort="tottime")
if __name__ == "__main__":
    main()