from parser.parser import Parser
|
|
import os
|
|
from os.path import join, dirname
|
|
from pathlib import Path
|
|
import re
|
|
import sys
|
|
import cProfile
|
|
|
|
|
|
def main():
    """Parse every KRES corpus file and emit two CoNLL-2009 TSV outputs per file.

    For each regular file in ``../data/kres_example/`` this writes:

    * ``<dir>_out/<name>.tsv``      — SRL-tagging format (``to_conll_2009_SRL``)
    * ``<dir>_full_out/<name>.tsv`` — full tokenization (``to_conll_2009_full``)

    Output directories are created next to the input directory if missing.
    Write failures are reported to stderr but do not abort the run.
    """
    # NOTE(review): inputs should be sanitized into unicode before parsing —
    # original author's reminder, kept; verify Parser.parse_tei handles this.

    # Expected sentence count of the ssj500k 2.1 corpus; kept as reference
    # for a (currently disabled) sanity assert on the ssj parse.
    SSJ500K_2_1 = 27829

    par = Parser()

    print("parsing kres")

    kres_dir = Path("../data/kres_example/").resolve()

    # Output directories live beside the input directory.
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)

    kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
    kres_full_out_dir.mkdir(exist_ok=True)

    for kres_file in (x for x in kres_dir.iterdir() if x.is_file()):
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)

        # Guard: max() over an empty parse result would raise ValueError.
        if not res_dict:
            print("No sentences parsed, skipping.")
            continue

        longest_sent = max(len(e["tokens"]) for e in res_dict.values())
        print("Longest sentence: ", longest_sent)

        # Build both outputs with join() rather than quadratic "+=".
        srl_parts = []
        full_parts = []
        for sentence in res_dict.values():
            srl_parts.append(par.to_conll_2009_SRL(sentence, longest_sent))
            full_parts.append(par.to_conll_2009_full(sentence))
        kres_out_str = "".join(srl_parts)
        kres_full_out_str = "".join(full_parts)

        # for SRL tagging — best-effort: report failures, keep processing.
        try:
            (kres_out_dir / kres_file.name).with_suffix(".tsv").write_text(
                kres_out_str, encoding="utf-8")
        except OSError as err:
            print("Failed to write SRL output for {}: {}".format(kres_file, err),
                  file=sys.stderr)

        # for full tokenization — same best-effort policy.
        try:
            (kres_full_out_dir / kres_file.name).with_suffix(".tsv").write_text(
                kres_full_out_str, encoding="utf-8")
        except OSError as err:
            print("Failed to write full output for {}: {}".format(kres_file, err),
                  file=sys.stderr)

    print("end parsing kres")
|
|
|
|
|
|
if __name__ == "__main__":
    # To profile, swap the call for: cProfile.run("main()", sort="tottime")
    main()
|