From cce83045e83ba28a563b87625965710dd395e39f Mon Sep 17 00:00:00 2001
From: voje <kristjan.voje@gmail.com>
Date: Sun, 14 Apr 2019 17:16:45 +0200
Subject: [PATCH] adding per-file parsing, for parallel use

---
 corpusparser/Parser.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/corpusparser/Parser.py b/corpusparser/Parser.py
index 75e8b7b..531e23c 100644
--- a/corpusparser/Parser.py
+++ b/corpusparser/Parser.py
@@ -94,6 +94,7 @@ class Parser():
 
     def sentence_generator(self):
         # Using generators so we don't copy a whole corpu around in memory.
+        # Might be too complicated. Try per-file generator instead. 
         if self.corpus == "kres":
 
             # some logging output
@@ -109,7 +110,18 @@ class Parser():
         else:
             yield from self.parse_xml_file(self.ssj_file)
 
+    def kres_to_json_file(self, in_xml_file, out_json_file):
+        """Parse in_xml_file and dump its sentence entries to out_json_file as JSON."""
+        # parse_xml_file yields pairs; only the second item (the entry) is kept.
+        out_buffer = [entry for _, entry in self.parse_xml_file(in_xml_file)]
+
+        with out_json_file.open("w") as fp:  # needs .open(), e.g. pathlib.Path
+            json.dump(out_buffer, fp)
+
     def parse_xml_file(self, xml_file):
+        # For corpora with separate SRL links, the SRL file is guessed from
+        # self.kres_srl_folder.
+
         srl_from_json = {}
         if self.corpus == "kres":  
             # in case of kres, read the SRL links form a separate json file