# Source: cjvt-srl-tagging/tools/main.py (Python, 69 lines, 2.0 KiB)

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
def _write_tsv(out_dir, source_file, text):
    """Best-effort write of *text* as UTF-8 to ``out_dir/<source stem>.tsv``.

    The output name is the input file's name with its last suffix replaced
    by ``.tsv``. Failures are reported on stderr instead of being raised, so
    one unwritable file does not abort the whole batch.
    """
    out_path = (out_dir / source_file.name).with_suffix(".tsv")
    try:
        # Binary write keeps the bytes exactly as encoded (no newline
        # translation); the file is created or truncated.
        out_path.write_bytes(text.encode("utf-8"))
    except (OSError, UnicodeError) as err:
        print(
            "Failed to write {}: {}".format(out_path, err),
            file=sys.stderr,
        )


def main():
    """Convert every file in the Kres sample directory to CoNLL-2009 TSV.

    Each regular file in ``../data/kres_example/`` (resolved against the
    current working directory) is parsed with ``Parser.parse_tei``. Two
    sibling directories are created if missing and receive one ``.tsv`` per
    input file:

    * ``kres_example_out``      -- ``to_conll_2009_SRL`` output (SRL tagging)
    * ``kres_example_full_out`` -- ``to_conll_2009_full`` output
    """
    # make sure you sanitize every input into unicode
    par = Parser()

    # NOTE: parsing of the ssj500k sample is currently disabled. To re-enable:
    #   print("parsing ssj")
    #   ssj_file = "../data/ssj500k-sl.sample.xml"
    #   ssj_dict = par.parse_tei(ssj_file)
    #   # ssj500k 2.1 is expected to contain 27829 sentences:
    #   # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    #   print("end parsing ssj")

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = Path("../data/kres_example/").resolve()

    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)

    kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
    kres_full_out_dir.mkdir(exist_ok=True)

    for kres_file in kres_dir.iterdir():
        if not kres_file.is_file():
            continue

        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)

        # default=0 keeps a file with no parsed sentences from raising
        # ValueError; such a file simply produces empty outputs.
        longest_sent = max(
            (len(sentence["tokens"]) for sentence in res_dict.values()),
            default=0,
        )
        print("Longest sentence: ", longest_sent)

        # Collect per-sentence chunks and join once, rather than growing a
        # string with += for every sentence. Both converters are still
        # called per sentence, in the same order as before.
        srl_parts = []
        full_parts = []
        for sentence in res_dict.values():
            srl_parts.append(par.to_conll_2009_SRL(sentence, longest_sent))
            full_parts.append(par.to_conll_2009_full(sentence))

        # for SRL tagging
        _write_tsv(kres_out_dir, kres_file, "".join(srl_parts))
        # for full tokenization
        _write_tsv(kres_full_out_dir, kres_file, "".join(full_parts))

    print("end parsing kres")
if __name__ == "__main__":
    # Script entry point. To profile a run instead, swap the call below for:
    #   cProfile.run("main()", sort="tottime")
    main()