forked from kristjan/cjvt-srl-tagging
parent
c398de66f7
commit
b617fb5e16
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,7 @@
|
||||
# Base image: the official Python image. An explicit major-version tag replaces
# the implicit `latest`; pin a minor version or digest for reproducible builds.
FROM python:3

# lxml is the only Python package installed by this image.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir lxml

# for my convenience
# `update` and `install` share a single layer so a cached, stale package index
# is never reused by a later install; the apt lists are removed in the same
# layer so they do not add to the image size.
RUN apt-get update \
    && apt-get install -y --no-install-recommends vim \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -1,23 +1,23 @@
|
||||
from parser import parser
|
||||
import os
|
||||
from os.path import join
|
||||
import sys
|
||||
|
||||
SSJ500K_2_1 = 27829  # number of sentences


if __name__ == "__main__":
    # make sure you sanitize every input into unicode

    # --- ssj500k: parse a single TEI file --------------------------------
    print("parsing ssj")
    # Earlier locations of the corpus, kept for reference:
    # ssj_file = "/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    # ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    # ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.sample.xml"  # smaller file
    ssj_file = "../data/ssj500k-sl.sample.xml"
    ssj_dict = parser.parse_tei(ssj_file)
    # Sentence-count check, left disabled.
    # NOTE(review): presumably only valid for the full corpus, not the sample
    # file used above -- confirm before re-enabling.
    # assert (len(ssj_dict) == SSJ500K_2_1), "Parsed wrong number of sentences."
    print("end parsing ssj")

    # --- kres: parse every file in the example directory -----------------
    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"
    for kres_file in os.listdir(kres_dir):
        # parse_tei's result is iterated with .items(), so it is used as a
        # mapping here; each value is handed to to_conll09.
        res_dict = parser.parse_tei(join(kres_dir, kres_file))
        for sentence in res_dict.values():
            parser.to_conll09(sentence)
    print("end parsing kres")
|
||||
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue