from parser import parser
import os
from os.path import join
import sys

# Number of sentences; this is the value the disabled sanity check in main()
# compares the parsed ssj result against.
SSJ500K_2_1 = 27829


def main():
    """Parse the ssj sample file, then every regular file in the kres directory.

    Progress markers are printed before and after each stage. Both input
    locations are hard-coded below; the parsing itself is delegated to
    ``parser.parse_tei``.
    """
    # make sure you sanitize every input into unicode
    print("parsing ssj")
    # Alternative (non-sample) inputs that were used previously:
    #   /home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml
    #   /dipldata/ssj500k-sl.TEI/ssj500k-sl.body.xml
    ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.sample.xml"  # smaller file
    ssj_dict = parser.parse_tei(ssj_file)
    # Sanity check, disabled while the smaller sample file is in use:
    # assert (len(ssj_dict) == SSJ500K_2_1), "Parsed wrong number of sentences."

    print("parsing kres")
    # Single-file alternative: "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"
    # os.listdir() yields entries in arbitrary order and also lists
    # subdirectories: sort for a reproducible run and skip anything that is
    # not a regular file so only files are handed to the parser.
    for kres_file in sorted(os.listdir(kres_dir)):
        kres_path = join(kres_dir, kres_file)
        if not os.path.isfile(kres_path):
            continue
        parser.parse_tei(kres_path)
    print("end parsing kres")


if __name__ == "__main__":
    main()