You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

28 lines
938 B

from parser import parser
import os
from os.path import join
SSJ500K_2_1 = 27829  # number of sentences (only referenced by the disabled check below)

if __name__ == "__main__":
    # Script entry point.
    #   1. Parse the ssj500k sample TEI file (result is not used further here).
    #   2. Parse every XML file in the kres example directory and write a
    #      CoNLL-2009 dump of its sentences next to it as "<name>.tsv".

    # make sure you sanitize every input into unicode
    print("parsing ssj")
    ssj_file = "../data/ssj500k-sl.sample.xml"
    ssj_dict = parser.parse_tei(ssj_file)
    # NOTE(review): sentence-count check is disabled; presumably because the
    # sample file is smaller than the full corpus -- confirm before re-enabling.
    # assert len(ssj_dict) == SSJ500K_2_1, "Parsed wrong number of sentences."
    print("end parsing ssj")

    print("parsing kres")
    # Example input file name: "F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"
    # sorted(): os.listdir() returns entries in arbitrary order; sorting makes
    # the processing order reproducible between runs.
    for kres_file in sorted(os.listdir(kres_dir)):
        # Outputs are written into the same directory as the inputs, so on a
        # re-run the listing also contains the generated "*.tsv" files. Only
        # XML inputs are handed to the parser; everything else is skipped.
        if not kres_file.lower().endswith(".xml"):
            continue

        res_dict = parser.parse_tei(join(kres_dir, kres_file))
        # Build the whole output in one pass instead of repeated "+=".
        conll_text = "".join(
            parser.to_conll_2009_full(sentence) for sentence in res_dict.values()
        )

        # Binary mode + explicit UTF-8 encoding keeps the bytes on disk
        # independent of the platform's default encoding and newline handling.
        # The context manager closes the file; no explicit close() is needed.
        with open(join(kres_dir, kres_file + ".tsv"), "wb") as fp:
            fp.write(conll_text.encode("utf-8"))
    print("end parsing kres")