From cce83045e83ba28a563b87625965710dd395e39f Mon Sep 17 00:00:00 2001
From: voje
Date: Sun, 14 Apr 2019 17:16:45 +0200
Subject: [PATCH] adding per-file parsing, for parallel use

---
 corpusparser/Parser.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/corpusparser/Parser.py b/corpusparser/Parser.py
index 75e8b7b..531e23c 100644
--- a/corpusparser/Parser.py
+++ b/corpusparser/Parser.py
@@ -94,6 +94,7 @@ class Parser():
 
     def sentence_generator(self):
         # Using generators so we don't copy a whole corpu around in memory.
+        # Might be too complicated. Try per-file generator instead.
         if self.corpus == "kres":
 
             # some logging output
@@ -109,7 +110,22 @@ class Parser():
         else:
             yield from self.parse_xml_file(self.ssj_file)
 
+    def kres_to_json_file(self, in_xml_file, out_json_file):
+        # Parse a single xml file and dump its sentence entries to
+        # out_json_file as one json list (per-file, for parallel use).
+        # NOTE(review): assumes parse_xml_file yields 2-tuples whose second
+        # item is the sentence entry - confirm.
+        out_buffer = []
+        for _, sentence_entry in self.parse_xml_file(in_xml_file):
+            out_buffer.append(sentence_entry)
+
+        with open(out_json_file, "w") as fp:
+            json.dump(out_buffer, fp)
+
     def parse_xml_file(self, xml_file):
+        # for separate srl links, it will guess the srl file based on
+        # self.kres_srl_folder
+
         srl_from_json = {}
         if self.corpus == "kres":
             # in case of kres, read the SRL links form a separate json file