24 lines
790 B
Python
24 lines
790 B
Python
from parser import parser
|
|
import os
|
|
from os.path import join
|
|
import sys
|
|
|
|
# Expected sentence count; judging by the name this is presumably for the
# ssj500k 2.1 corpus -- TODO confirm.
SSJ500K_2_1 = 27829 # number of sentences
|
|
|
|
if __name__ == "__main__":
    # Driver script: parse one ssj500k TEI file with parser.parse_tei, then
    # run the same parser over every entry of the kres example directory.
    # NOTE: make sure you sanitize every input into unicode.

    print("parsing ssj")
    # Alternative ssj inputs that were used previously:
    #   /home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml
    #   /dipldata/ssj500k-sl.TEI/ssj500k-sl.body.xml
    ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.sample.xml"  # smaller file
    ssj_dict = parser.parse_tei(ssj_file)
    # Disabled sanity check (presumably only holds for the full corpus, not
    # the smaller sample file -- confirm before re-enabling):
    # assert len(ssj_dict) == SSJ500K_2_1, "Parsed wrong number of sentences."

    print("parsing kres")
    # Single-file alternative: "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"
    # os.listdir() returns entries in arbitrary order; sort them so that every
    # run processes the files in the same, reproducible order.
    for kres_file in sorted(os.listdir(kres_dir)):
        parser.parse_tei(join(kres_dir, kres_file))
    print("end parsing kres")
|