fixed paths

This commit is contained in:
voje 2019-02-27 17:32:19 +01:00
parent 5c9cf59723
commit b4c7ac5427
13 changed files with 40 additions and 41 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@
data/*/*.xml
data/*/*.tsv
data/*/*.json

View File

@ -1,4 +1,4 @@
.PHONY: tsv_files srl_tagged_files json_files env
.PHONY: tsv_files srl_tagged_files json_files env clean
all: json_files
@ -17,3 +17,6 @@ tools/fillpred_model/model.pickle:
env:
cd dockerfiles; cd python-java; $(MAKE)
clean:
rm tools/fillpred_model/model.pickle

View File

@ -1 +0,0 @@
{"F0006347.50.1": [{"dep": "3", "from": "6", "arg": "CAUSE"}, {"dep": "5", "from": "6", "arg": "PAT"}, {"dep": "11", "from": "12", "arg": "ACT"}, {"dep": "16", "from": "12", "arg": "LOC"}], "F0006347.50.0": [], "F0006347.50.2": [{"dep": "5", "from": "14", "arg": "TIME"}, {"dep": "12", "from": "14", "arg": "ACT"}, {"dep": "15", "from": "14", "arg": "MWPRED"}, {"dep": "18", "from": "20", "arg": "ACT"}, {"dep": "19", "from": "20", "arg": "TIME"}, {"dep": "23", "from": "20", "arg": "PAT"}, {"dep": "26", "from": "13", "arg": "REC"}, {"dep": "29", "from": "30", "arg": "MANN"}, {"dep": "31", "from": "30", "arg": "PAT"}], "F0006347.50.3": [{"dep": "14", "from": "19", "arg": "PAT"}, {"dep": "16", "from": "19", "arg": "LOC"}, {"dep": "30", "from": "29", "arg": "PAT"}, {"dep": "32", "from": "29", "arg": "PAT"}, {"dep": "42", "from": "43", "arg": "DUR"}, {"dep": "45", "from": "43", "arg": "PAT"}, {"dep": "48", "from": "43", "arg": "LOC"}, {"dep": "56", "from": "57", "arg": "TIME"}, {"dep": "57", "from": "55", "arg": "MODAL"}, {"dep": "62", "from": "57", "arg": "PAT"}, {"dep": "67", "from": "66", "arg": "TIME"}, {"dep": "73", "from": "66", "arg": "ACT"}]}

View File

@ -1 +0,0 @@
{"F0012782.9.0": [{"dep": "2", "from": "3", "arg": "ACT"}, {"dep": "4", "from": "3", "arg": "MWPRED"}, {"dep": "6", "from": "4", "arg": "TIME"}], "F0012782.5.0": [], "F0012782.6.0": [{"dep": "56", "from": "54", "arg": "MEANS"}], "F0012782.10.0": [], "F0012782.11.0": [], "F0012782.8.0": [], "F0012782.7.0": [{"dep": "3", "from": "7", "arg": "COND"}, {"dep": "6", "from": "7", "arg": "ACT"}, {"dep": "8", "from": "7", "arg": "MANN"}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{"F0032377.38.0": [], "F0032377.32.4": [], "F0032377.29.0": [], "F0032377.33.0": [{"dep": "10", "from": "11", "arg": "GOAL"}, {"dep": "12", "from": "11", "arg": "ACT"}], "F0032377.25.0": [{"dep": "5", "from": "4", "arg": "PAT"}, {"dep": "11", "from": "13", "arg": "REC"}, {"dep": "14", "from": "13", "arg": "PAT"}, {"dep": "16", "from": "13", "arg": "LOC"}, {"dep": "19", "from": "23", "arg": "LOC"}, {"dep": "21", "from": "23", "arg": "TIME"}, {"dep": "22", "from": "23", "arg": "PAT"}, {"dep": "24", "from": "23", "arg": "ACT"}], "F0032377.32.0": [], "F0032377.35.1": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "7", "from": "8", "arg": "CAUSE"}, {"dep": "9", "from": "8", "arg": "PAT"}, {"dep": "12", "from": "13", "arg": "TIME"}], "F0032377.33.1": [{"dep": "2", "from": "3", "arg": "LOC"}, {"dep": "9", "from": "10", "arg": "ACT"}, {"dep": "13", "from": "10", "arg": "REG"}, {"dep": "17", "from": "19", "arg": "ACT"}, {"dep": "21", "from": "19", "arg": "GOAL"}, {"dep": "27", "from": "26", "arg": "PAT"}, {"dep": "33", "from": "34", "arg": "MANN"}, {"dep": "36", "from": "34", "arg": "PAT"}], "F0032377.32.5": [{"dep": "6", "from": "5", "arg": "MANN"}, {"dep": "13", "from": "16", "arg": "PAT"}, {"dep": "15", "from": "16", "arg": "TIME"}, {"dep": "18", "from": "16", "arg": "RESLT"}], "F0032377.33.2": [{"dep": "3", "from": "1", "arg": "PAT"}, {"dep": "8", "from": "5", "arg": "PAT"}], "F0032377.26.3": [{"dep": "1", "from": "3", "arg": "TIME"}, {"dep": "9", "from": "8", "arg": "ACT"}, {"dep": "13", "from": "14", "arg": "TIME"}, {"dep": "17", "from": "18", "arg": "LOC"}, {"dep": "18", "from": "14", "arg": "PAT"}], "F0032377.37.0": [], "F0032377.25.2": [{"dep": "1", "from": "3", "arg": "REC"}, {"dep": "6", "from": "9", "arg": "PAT"}, {"dep": "8", "from": "9", "arg": "ACT"}, {"dep": "11", "from": "9", "arg": "LOC"}, {"dep": "18", "from": "15", "arg": "RESLT"}], "F0032377.36.0": [], "F0032377.32.1": [{"dep": "1", "from": "10", "arg": "ACT"}, {"dep": "5", "from": "10", "arg": "MANN"}, {"dep": "12", "from": "10", "arg": "GOAL"}], "F0032377.26.1": [{"dep": "1", "from": "4", "arg": "PAT"}, {"dep": "3", "from": "4", "arg": "TIME"}, {"dep": "6", "from": "4", "arg": "GOAL"}, {"dep": "9", "from": "13", "arg": "TIME"}, {"dep": "12", "from": "13", "arg": "DUR"}, {"dep": "15", "from": "13", "arg": "PAT"}, {"dep": "20", "from": "18", "arg": "PAT"}], "F0032377.24.0": [{"dep": "1", "from": "2", "arg": "MANN"}, {"dep": "4", "from": "2", "arg": "ACT"}], "F0032377.31.0": [], "F0032377.30.0": [], "F0032377.23.0": [], "F0032377.32.3": [{"dep": "1", "from": "3", "arg": "ACT"}, {"dep": "4", "from": "3", "arg": "PAT"}, {"dep": "8", "from": "9", "arg": "PAT"}, {"dep": "11", "from": "9", "arg": "PAT"}], "F0032377.26.0": [{"dep": "1", "from": "4", "arg": "ACT"}, {"dep": "10", "from": "11", "arg": "QUANT"}, {"dep": "13", "from": "11", "arg": "PAT"}, {"dep": "18", "from": "19", "arg": "TIME"}], "F0032377.27.0": [], "F0032377.34.0": [], "F0032377.28.0": [{"dep": "4", "from": "3", "arg": "PAT"}], "F0032377.32.2": [{"dep": "1", "from": "5", "arg": "TIME"}, {"dep": "2", "from": "5", "arg": "REC"}, {"dep": "3", "from": "5", "arg": "PAT"}, {"dep": "7", "from": "5", "arg": "ACT"}, {"dep": "11", "from": "12", "arg": "REC"}, {"dep": "13", "from": "12", "arg": "ACT"}], "F0032377.26.2": [{"dep": "2", "from": "4", "arg": "ACT"}, {"dep": "5", "from": "4", "arg": "ACT"}], "F0032377.35.2": [{"dep": "8", "from": "7", "arg": "ACT"}, {"dep": "12", "from": "14", "arg": "REC"}, {"dep": "13", "from": "14", "arg": "MANN"}, {"dep": "15", "from": "14", "arg": "PAT"}], "F0032377.35.0": [{"dep": "9", "from": "12", "arg": "LOC"}, {"dep": "15", "from": "12", "arg": "PAT"}, {"dep": "20", "from": "21", "arg": "TIME"}, {"dep": "23", "from": "21", "arg": "GOAL"}], "F0032377.25.1": []}

View File

@ -1 +0,0 @@
{"F0039402.11.0": [{"dep": "1", "from": "5", "arg": "ACT"}, {"dep": "7", "from": "5", "arg": "TIME"}], "F0039402.10.0": [{"dep": "2", "from": "8", "arg": "ACT"}, {"dep": "11", "from": "8", "arg": "PAT"}, {"dep": "16", "from": "14", "arg": "RESLT"}], "F0039402.12.1": [{"dep": "1", "from": "9", "arg": "ACT"}, {"dep": "5", "from": "9", "arg": "TIME"}, {"dep": "8", "from": "9", "arg": "PAT"}, {"dep": "11", "from": "9", "arg": "PAT"}], "F0039402.15.0": [{"dep": "2", "from": "10", "arg": "MANN"}, {"dep": "7", "from": "10", "arg": "TIME"}, {"dep": "11", "from": "10", "arg": "ACT"}, {"dep": "13", "from": "10", "arg": "RESLT"}, {"dep": "20", "from": "22", "arg": "PAT"}, {"dep": "24", "from": "22", "arg": "TIME"}], "F0039402.14.2": [{"dep": "9", "from": "17", "arg": "ACT"}, {"dep": "13", "from": "17", "arg": "PAT"}, {"dep": "16", "from": "17", "arg": "LOC"}, {"dep": "20", "from": "17", "arg": "LOC"}, {"dep": "32", "from": "31", "arg": "ACT"}], "F0039402.7.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "6", "from": "5", "arg": "ACT"}, {"dep": "22", "from": "24", "arg": "TIME"}, {"dep": "25", "from": "24", "arg": "PAT"}], "F0039402.5.2": [{"dep": "1", "from": "3", "arg": "ACT"}, {"dep": "6", "from": "3", "arg": "ACT"}, {"dep": "16", "from": "25", "arg": "ACT"}, {"dep": "20", "from": "25", "arg": "COND"}, {"dep": "27", "from": "25", "arg": "PAT"}], "F0039402.15.2": [], "F0039402.8.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "11", "from": "9", "arg": "MEANS"}, {"dep": "31", "from": "33", "arg": "PAT"}, {"dep": "34", "from": "33", "arg": "ACT"}], "F0039402.15.1": [{"dep": "7", "from": "8", "arg": "MANN"}, {"dep": "29", "from": "30", "arg": "ACT"}, {"dep": "33", "from": "30", "arg": "LOC"}], "F0039402.14.1": [], "F0039402.12.2": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "7", "from": "8", "arg": "GOAL"}, {"dep": "10", "from": "8", "arg": "PAT"}, {"dep": "15", "from": "18", "arg": "ACT"}, {"dep": "19", "from": "18", "arg": "MODAL"}, {"dep": "20", "from": "19", "arg": "PAT"}, {"dep": "33", "from": "37", "arg": "CAUSE"}, {"dep": "38", "from": "37", "arg": "ACT"}], "F0039402.6.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "10", "from": "7", "arg": "PAT"}], "F0039402.5.1": [], "F0039402.14.0": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "4", "from": "8", "arg": "PAT"}, {"dep": "12", "from": "14", "arg": "ACT"}, {"dep": "15", "from": "14", "arg": "PAT"}], "F0039402.12.0": [], "F0039402.5.0": [], "F0039402.13.1": [{"dep": "13", "from": "16", "arg": "ACT"}, {"dep": "19", "from": "26", "arg": "PAT"}, {"dep": "25", "from": "26", "arg": "TIME"}, {"dep": "31", "from": "32", "arg": "ACT"}, {"dep": "35", "from": "32", "arg": "PAT"}], "F0039402.9.0": [{"dep": "15", "from": "14", "arg": "ACT"}, {"dep": "18", "from": "14", "arg": "TIME"}], "F0039402.13.0": [{"dep": "2", "from": "8", "arg": "EVENT"}, {"dep": "4", "from": "8", "arg": "ACT"}, {"dep": "6", "from": "8", "arg": "REC"}, {"dep": "7", "from": "8", "arg": "MANN"}, {"dep": "10", "from": "8", "arg": "PAT"}, {"dep": "17", "from": "18", "arg": "MANN"}, {"dep": "19", "from": "18", "arg": "PAT"}, {"dep": "26", "from": "24", "arg": "PAT"}, {"dep": "28", "from": "30", "arg": "MANN"}, {"dep": "31", "from": "30", "arg": "PAT"}]}

View File

@ -51,4 +51,4 @@ if __name__ == "__main__":
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(OUTFILE)
ndf.to_pickle(Path(OUTFILE))

View File

@ -27,4 +27,6 @@ if __name__ == "__main__":
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
pickle.dump(clf_full, open(OUTFILE, "wb"))
with open(OUTFILE, "wb") as fp:
pickle.dump(clf_full, fp)

View File

@ -1,6 +1,8 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
# defaults
ORIGPATH = Path("../data/kres_example") # we need the IDs
@ -14,7 +16,7 @@ config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = bool(config["tools"]["debug"])
DEBUG = config["tools"]["debug"] == "True"
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
@ -63,36 +65,36 @@ for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
fp = infile.open("rb")
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
outfile = (OUTPATH / infile.name).with_suffix(".json")
# print(outdata)
json.dump(outdata, outfile.open("w"))
with outfile.open("w") as fp:
json.dump(outdata, fp)

View File

@ -36,14 +36,11 @@ for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
kres_out_str += par.to_conll_2009_SRL(sentence)
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")

View File

@ -2,5 +2,5 @@
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json
kres_json = ../data/kres_example_json
debug = False