adding per-file parsing, for parallel use

voje 2019-04-14 17:16:45 +02:00
parent 19945a9dd9
commit cce83045e8


@@ -94,6 +94,7 @@ class Parser():
     def sentence_generator(self):
         # Using generators so we don't copy a whole corpus around in memory.
+        # Might be too complicated. Try per-file generator instead.
         if self.corpus == "kres":
            # some logging output
@@ -109,7 +110,18 @@ class Parser():
         else:
             yield from self.parse_xml_file(self.ssj_file)

+    def kres_to_json_file(self, in_xml_file, out_json_file):
+        out_buffer = []
+        for _, sentence_entry in self.parse_xml_file(in_xml_file):
+            out_buffer += [sentence_entry]
+        with out_json_file.open("w") as fp:
+            json.dump(out_buffer, fp)

     def parse_xml_file(self, xml_file):
+        # for separate srl links, it will guess the srl file based on
+        # self.kres_srl_folder
         srl_from_json = {}
         if self.corpus == "kres":
             # in case of kres, read the SRL links from a separate json file
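
Since the point of the new kres_to_json_file method is to let each kres XML file be converted independently, a driver along the following lines could fan the work out over a process pool. This is only a sketch and not part of the commit: the Parser import path and constructor arguments, the input/output directories (kres_xml/, kres_json/), and the use of pathlib.Path objects (suggested by the method's .open("w") call) are all assumptions.

# Sketch only: parallel driver for the per-file conversion added in this
# commit. Parser's import path and constructor arguments are assumptions.
from multiprocessing import Pool
from pathlib import Path

from parser import Parser  # import path assumed

IN_DIR = Path("kres_xml")    # assumed location of the kres xml files
OUT_DIR = Path("kres_json")  # assumed output location


def convert_one(in_xml_file):
    # each worker builds its own Parser, so no state is shared between processes
    parser = Parser(corpus="kres")  # constructor signature assumed
    out_json_file = OUT_DIR / (in_xml_file.stem + ".json")
    parser.kres_to_json_file(in_xml_file, out_json_file)
    return out_json_file


if __name__ == "__main__":
    OUT_DIR.mkdir(exist_ok=True)
    xml_files = sorted(IN_DIR.glob("*.xml"))
    with Pool() as pool:
        done = pool.map(convert_one, xml_files)
    print("converted {} files".format(len(done)))

A process pool rather than threads keeps the CPU-bound XML parsing truly parallel, and because each worker writes its own JSON file there is no shared output to lock.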