cjvt-srl-tagging/tools/parse_all.py

45 lines
1.2 KiB
Python
Raw Normal View History

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
2019-02-25 23:22:15 +00:00
# Sentence count of ssj500k 2.1; only referenced by the disabled ssj500k
# sanity check inside main().
SSJ500K_2_1 = 27829  # number of sentences


def tsv_output_path(out_dir, kres_file):
    """Return the path of the .tsv file written for *kres_file* in *out_dir*.

    Only the final suffix of the input file name is replaced, so
    ``F0019343.xml.parsed.xml`` maps to ``F0019343.xml.parsed.tsv``.

    Args:
        out_dir: ``pathlib.Path`` of the output directory.
        kres_file: ``pathlib.Path`` of the input file being converted.

    Returns:
        ``pathlib.Path`` inside *out_dir* with the ``.tsv`` suffix.
    """
    return (out_dir / kres_file.name).with_suffix(".tsv")


def main():
    """Parse every file in the kres example directory and write it as .tsv.

    Each regular file directly inside ``../data/kres_example/`` (resolved
    against the current working directory) is passed to ``Parser.parse_tei``.
    Every value of the returned mapping is rendered with
    ``Parser.to_conll_2009_SRL`` and the concatenated result is written,
    UTF-8 encoded, to a sibling ``kres_example_tsv`` directory.
    """
    # make sure you sanitize every input into unicode
    par = Parser()

    # Disabled: parsing of the ssj500k sample, kept for reference.
    # print("parsing ssj")
    # ssj_file = "../data/ssj500k-sl.sample.xml"
    # ssj_dict = par.parse_tei(ssj_file)
    # # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    # print("end parsing ssj")

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = Path("../data/kres_example/").resolve()

    # Output goes next to the input directory, e.g. kres_example_tsv/.
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_tsv")
    kres_out_dir.mkdir(exist_ok=True)

    for kres_file in kres_dir.iterdir():
        # Skip sub-directories; only plain files are parsed.
        if not kres_file.is_file():
            continue
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)

        # Only the values are needed; join once instead of repeated "+=".
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )

        # Write bytes (not text) so no platform newline translation is
        # applied; write_bytes opens, writes and closes the file itself.
        out_path = tsv_output_path(kres_out_dir, kres_file)
        out_path.write_bytes(kres_out_str.encode("utf-8"))

    print("end parsing kres")


if __name__ == "__main__":
    main()