cjvt-srl-tagging/tools/main.py

44 lines
1.2 KiB
Python
Raw Normal View History

2019-02-15 08:09:11 +00:00
from parser.parser import Parser
2019-02-03 21:54:26 +00:00
import os
from os.path import join
2019-02-15 08:09:11 +00:00
import re
import sys
import cProfile
2019-02-03 21:54:26 +00:00
2019-02-15 08:09:11 +00:00
def main():
    """Parse the ssj500k sample, then convert each parsed Kres file to TSV.

    The TEI file ``../data/ssj500k-sl.sample.xml`` is parsed first. After
    that, every file in ``../data/kres_example/`` whose name matches
    ``F<anything>.xml.parsed.xml`` is parsed, each of its sentences is
    rendered with ``Parser.to_conll_2009_SRL`` and the concatenated text is
    written, UTF-8 encoded, to ``<input file name>.tsv`` in the same
    directory.

    All paths are relative to the current working directory.
    """
    # make sure you sanitize every input into unicode

    par = Parser()

    print("parsing ssj")
    ssj_file = "../data/ssj500k-sl.sample.xml"
    # NOTE(review): the full ssj500k 2.1 corpus is expected to hold 27829
    # sentences; no count check is made here, presumably because the input
    # is only a sample file -- confirm. The parsed result is not used yet.
    par.parse_tei(ssj_file)
    print("end parsing ssj")

    print("parsing kres")
    kres_dir = "../data/kres_example/"
    # Generated .tsv files land in the same folder as the inputs, so only
    # names of the form F....xml.parsed.xml are picked up.
    kres_name = re.compile(r"^F.+\.xml\.parsed\.xml$")
    for kres_file in os.listdir(kres_dir):
        if kres_name.match(kres_file) is None:
            continue
        print("Processing file: " + kres_file)
        res_dict = par.parse_tei(join(kres_dir, kres_file))
        out_text = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
        # Binary mode writes the encoded bytes verbatim (no newline
        # translation); the context manager closes the file.
        with open(join(kres_dir, kres_file + ".tsv"), "wb") as fp:
            fp.write(out_text.encode("utf-8"))
    print("end parsing kres")
2019-02-15 08:09:11 +00:00
if __name__ == "__main__":
    # Run main() under the profiler and print statistics ordered by total
    # time spent in each function. For a plain run, call main() directly.
    cProfile.run(statement="main()", sort="tottime")