fixed paths

2019-02-27 17:32:19 +01:00
parent 5c9cf59723
commit b4c7ac5427
13 changed files with 40 additions and 41 deletions
@@ -51,4 +51,4 @@ if __name__ == "__main__":
 			print(i, df.shape)

 	print(ndf.head())
-	ndf.to_pickle(OUTFILE)
+	ndf.to_pickle(Path(OUTFILE))
@@ -27,4 +27,6 @@ if __name__ == "__main__":
 	clf_full = DecisionTreeClassifier()
 	clf_full.fit(X, y)

-	pickle.dump(clf_full, open(OUTFILE, "wb"))
+	with open(OUTFILE, "wb") as fp:
+		pickle.dump(clf_full, fp)
+
@@ -1,6 +1,8 @@
 from pathlib import Path
 from parser.parser import Parser
 import configparser
+import json
+import sys

 # defaults
 ORIGPATH = Path("../data/kres_example")  # we need the IDs
@@ -14,7 +16,7 @@ config.read("tools.cfg")
 ORIGPATH = Path(config["tools"]["kres_orig"])
 INPATH = Path(config["tools"]["kres_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
-DEBUG = bool(config["tools"]["debug"])
+DEBUG = config["tools"]["debug"] == "True"

 def get_origfile(filename):
 	for origfile in ORIGPATH.iterdir():
@@ -63,36 +65,36 @@ for infile in [x for x in INPATH.iterdir() if x.is_file()]:
 	origfile = get_origfile(infile)
 	orig_dict = par.parse_tei(origfile)

-	fp = infile.open("rb")
-	outdata = {}
-	for sentence_arr in extract_sentences(fp.readlines()):
-		# tsv dropped sentence ids, match the ID, using original data
-		sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
+	with infile.open("rb") as fp:
+		outdata = {}
+		for sentence_arr in extract_sentences(fp.readlines()):
+			# tsv dropped sentence ids, match the ID, using original data
+			sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

-		outdata[sid] = []
+			outdata[sid] = []

-		# find all predicate indices in the sentence
-		predicates = []
-		for token in sentence_arr:
-			if token[12] == "Y":
-				predicates += [token[0]]  # idx
+			# find all predicate indices in the sentence
+			predicates = []
+			for token in sentence_arr:
+				if token[12] == "Y":
+					predicates += [token[0]]  # idx

-			deprel = get_dep_rel(token)
-			if deprel is not None:
-				outdata[sid].append(deprel)
+				deprel = get_dep_rel(token)
+				if deprel is not None:
+					outdata[sid].append(deprel)

-		# deprel["from"] points to n-th predicate
-		# replace with predicate's token index
-		for deprel in outdata[sid]:
-			deprel["from"] = predicates[deprel["from"]]
+			# deprel["from"] points to n-th predicate
+			# replace with predicate's token index
+			for deprel in outdata[sid]:
+				deprel["from"] = predicates[deprel["from"]]

-		if DEBUG:
-			print(to_sentence(sentence_arr))
-			print(outdata[sid])
-			print(sid)
-			print()
-			print()
+			if DEBUG:
+				print(to_sentence(sentence_arr))
+				print(outdata[sid])
+				print(sid)
+				print()
+				print()

 	outfile = (OUTPATH / infile.name).with_suffix(".json")
-	# print(outdata)
-	json.dump(outdata, outfile.open("w"))
+	with outfile.open("w") as fp:
+		json.dump(outdata, fp)
@@ -36,14 +36,11 @@ for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:

    print("Processing file: " + str(kres_file))
    res_dict = par.parse_tei(kres_file)
-    longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
-    print("Longest sentence: ", longest_sent)
    kres_out_str = ""

    for _, sentence in res_dict.items():
-        kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
+        kres_out_str += par.to_conll_2009_SRL(sentence)

    with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
-        fp.close()
 print("end parsing kres")
@@ -2,5 +2,5 @@
 kres_orig = ../data/kres_example
 kres_tsv = ../data/kres_example_tsv
 kres_srl = ../data/kres_example_srl
-kres_json = ../data/kres/example_json
+kres_json = ../data/kres_example_json
 debug = False