From bcaf226b9e3860c4bf44fc4ff37f6d3d608b284d Mon Sep 17 00:00:00 2001
From: voje <kristjan.voje@gmail.com>
Date: Tue, 26 Feb 2019 00:22:15 +0100
Subject: [PATCH] gen_json.py needs a bit more work

---
 Makefile           |  6 +++---
 tools/gen_json.py  | 49 +++++++++++++++++++++++++++++++++++++++++++---
 tools/parse_all.py |  7 +------
 3 files changed, 50 insertions(+), 12 deletions(-)
diff --git a/Makefile b/Makefile
index 8c5adc9..ad7e8a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
-.PHONY: tsv_files srl_tagged_files
+.PHONY: tsv_files srl_tagged_files json_files env
 
-all: srl_tagged_files
+all: json_files
 
-json_files: srl_tagged_files
+json_files: # TODO: restore the srl_tagged_files prerequisite (presumably dropped only temporarily)
 	cd tools; python3 gen_json.py
 
 srl_tagged_files: tsv_files
diff --git a/tools/gen_json.py b/tools/gen_json.py
index c074a8d..0c7e701 100644
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@@ -1,5 +1,84 @@
-import Path
-
+"""Generate data/kres_example_json/* from data/kres_example_srl/* (WIP)."""
+from pathlib import Path
+from parser.parser import Parser
+
+ORIGPATH = Path("../data/kres_example")  # we need the IDs
+INPATH = Path("../data/kres_example_srl")
+OUTPATH = Path("../data/kres_example_json")
+
+
+def get_origfile(filename):
+    """Return the file in ORIGPATH that corresponds to *filename*.
+
+    Two files correspond when their names agree up to the first dot.
+
+    Raises:
+        FileNotFoundError: if ORIGPATH contains no such file.
+    """
+    basename = filename.name.split('.')[0]
+    for origfile in ORIGPATH.iterdir():
+        if origfile.name.split('.')[0] == basename:
+            return origfile
+    raise FileNotFoundError(
+        "no file matching {!r} in {}".format(basename, ORIGPATH))
+
+
+def extract_sentences(line_reader):
+    """Yield sentences from an iterable of UTF-8 encoded, tab-separated lines.
+
+    Each sentence is a list of token rows; a row is the decoded line split
+    on tabs (the last field keeps its line terminator). A blank line ends
+    the current sentence. Empty sentences are skipped, and a final sentence
+    that is not followed by a blank line is still yielded.
+    """
+    acc = []
+    for raw_line in line_reader:
+        text = raw_line.decode("utf-8")
+        if text in ("\n", "\r\n"):
+            if acc:
+                yield acc
+            acc = []
+        else:
+            acc.append(text.split('\t'))
+    if acc:
+        # The input did not end with a blank line; do not drop the tail.
+        yield acc
+
+
+def match_sentence_id(string, rd):
+    """Return the key of the *rd* entry whose tokens spell the same sentence.
+
+    Args:
+        string: the sentence as a list of token rows (despite the name);
+            column 1 of each row is compared.
+        rd: mapping of sentence id to an entry whose "tokens" items are
+            compared on column 2 (presumably the output of
+            Parser.parse_tei; confirm against the parser).
+
+    Raises:
+        KeyError: if no entry matches.
+    """
+    str1 = " ".join(token[1] for token in string)
+    for k, e in rd.items():
+        str2 = " ".join(token[2] for token in e["tokens"])
+        if str1 == str2:
+            return k
+    raise KeyError(str1)
+
+
 if __name__ == "__main__":
-    print("TODO: take data/kres_example_srl/* and generate data/kres_example_json/*")
-    print("TODO: check ssj and kres <links> for structure")
+    par = Parser()
+
+    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
+        origfile = get_origfile(infile)
+        rd = par.parse_tei(origfile)
+
+        # Close the input file deterministically once it has been read.
+        with infile.open("rb") as fp:
+            for sentence_arr in extract_sentences(fp):
+                sid = match_sentence_id(sentence_arr, rd)
+                print(sid)
+                # OK, we got the sentence id, now generate the predicate map!
+
+        # TODO: write the generated JSON for this input file to outfile.
+        outfile = (OUTPATH / infile.name).with_suffix(".json")
diff --git a/tools/parse_all.py b/tools/parse_all.py
index 0bd0146..01a867f 100644
--- a/tools/parse_all.py
+++ b/tools/parse_all.py
@@ -7,7 +7,7 @@ import sys
 import cProfile
 
 
-def main():
+if __name__ == "__main__":
     # make sure you sanitize every input into unicode
 
     SSJ500K_2_1 = 27829  # number of sentences
@@ -44,8 +44,3 @@ def main():
             fp.close()
 
     print("end parsing kres")
-
-
-if __name__ == "__main__":
-    # cProfile.run("main()", sort="tottime")
-    main()