adding per-file parsing, for parallel use
This commit is contained in:
parent
19945a9dd9
commit
cce83045e8
|
@ -94,6 +94,7 @@ class Parser():
|
||||||
|
|
||||||
def sentence_generator(self):
|
def sentence_generator(self):
|
||||||
# Using generators so we don't copy a whole corpus around in memory.
|
# Using generators so we don't copy a whole corpus around in memory.
|
||||||
|
# Might be too complicated. Try per-file generator instead.
|
||||||
if self.corpus == "kres":
|
if self.corpus == "kres":
|
||||||
|
|
||||||
# some logging output
|
# some logging output
|
||||||
|
@ -109,7 +110,18 @@ class Parser():
|
||||||
else:
|
else:
|
||||||
yield from self.parse_xml_file(self.ssj_file)
|
yield from self.parse_xml_file(self.ssj_file)
|
||||||
|
|
||||||
|
def kres_to_json_file(self, in_xml_file, out_json_file):
    """Parse one XML file and dump its sentence entries to a JSON file.

    Intended for per-file (parallelizable) processing: each call handles
    exactly one input file and writes exactly one output file.

    Args:
        in_xml_file: XML file to parse; passed unchanged to
            ``self.parse_xml_file``.
        out_json_file: Destination of the JSON output. Anything accepted
            by the builtin ``open`` works (``str`` or ``pathlib.Path``).

    The output is a single JSON list holding the second element of every
    pair yielded by ``self.parse_xml_file``, in iteration order.
    """
    # parse_xml_file yields 2-tuples; only the sentence entry is kept.
    # The original called the undefined `parser.parser_xml_file` here.
    out_buffer = [
        sentence_entry
        for _, sentence_entry in self.parse_xml_file(in_xml_file)
    ]

    # The original opened the undefined name `outfile` instead of the
    # `out_json_file` parameter.
    with open(out_json_file, "w", encoding="utf-8") as fp:
        json.dump(out_buffer, fp)
|
||||||
|
|
||||||
def parse_xml_file(self, xml_file):
|
def parse_xml_file(self, xml_file):
|
||||||
|
# for separate srl links, it will guess the srl file based on
|
||||||
|
# self.kres_srl_folder
|
||||||
|
|
||||||
srl_from_json = {}
|
srl_from_json = {}
|
||||||
if self.corpus == "kres":
|
if self.corpus == "kres":
|
||||||
# in case of kres, read the SRL links from a separate json file
|
# in case of kres, read the SRL links from a separate json file
|
||||||
|
|
Loading…
Reference in New Issue
Block a user