forked from kristjan/cjvt-srl-tagging
tmp
This commit is contained in:
parent
bcaf226b9e
commit
577c8418d2
6
Makefile
6
Makefile
|
@ -2,17 +2,17 @@
|
|||
|
||||
# These targets are commands, not files: declare them phony so a stray file
# with the same name can never make them look "up to date".
.PHONY: all json_files srl_tagged_files tsv_files

# Default goal: run the whole pipeline up to the JSON export.
all: json_files
|
||||
|
||||
# Convert the SRL-tagged files into JSON; requires the tagging step first.
# `&&` (not `;`) so the script is never started from the wrong directory
# when `cd` fails.
json_files: srl_tagged_files
	cd tools && python3 gen_json.py
|
||||
|
||||
# Run the SRL tagger over the generated TSV files.
# Previous invocation, kept for reference:
#   cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
srl_tagged_files: tsv_files
	cd tools/srl-20131216 && ./tag_all.sh ../../data/kres_example_tsv ../../data/kres_example_srl
|
||||
|
||||
# Produce the TSV input for the tagger; needs the trained fillpred model.
# The prerequisite path is relative to this Makefile, hence the tools/ prefix.
tsv_files: tools/fillpred_model/model.pickle
	cd tools && python3 parse_all.py
|
||||
|
||||
# Build the fillpred model by delegating to the sub-directory's own Makefile.
# `$(MAKE) -C` fails cleanly if the directory is missing and keeps the
# jobserver / -n flags propagated to the sub-make.
tools/fillpred_model/model.pickle:
	$(MAKE) -C tools/fillpred_model
|
||||
|
||||
env:
|
||||
|
|
1
data/kres_example_json/F0006347.srl.json
Normal file
1
data/kres_example_json/F0006347.srl.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"F0006347.50.1": [{"dep": "3", "from": "6", "arg": "CAUSE"}, {"dep": "5", "from": "6", "arg": "PAT"}, {"dep": "11", "from": "12", "arg": "ACT"}, {"dep": "16", "from": "12", "arg": "LOC"}], "F0006347.50.0": [], "F0006347.50.2": [{"dep": "5", "from": "14", "arg": "TIME"}, {"dep": "12", "from": "14", "arg": "ACT"}, {"dep": "15", "from": "14", "arg": "MWPRED"}, {"dep": "18", "from": "20", "arg": "ACT"}, {"dep": "19", "from": "20", "arg": "TIME"}, {"dep": "23", "from": "20", "arg": "PAT"}, {"dep": "26", "from": "13", "arg": "REC"}, {"dep": "29", "from": "30", "arg": "MANN"}, {"dep": "31", "from": "30", "arg": "PAT"}], "F0006347.50.3": [{"dep": "14", "from": "19", "arg": "PAT"}, {"dep": "16", "from": "19", "arg": "LOC"}, {"dep": "30", "from": "29", "arg": "PAT"}, {"dep": "32", "from": "29", "arg": "PAT"}, {"dep": "42", "from": "43", "arg": "DUR"}, {"dep": "45", "from": "43", "arg": "PAT"}, {"dep": "48", "from": "43", "arg": "LOC"}, {"dep": "56", "from": "57", "arg": "TIME"}, {"dep": "57", "from": "55", "arg": "MODAL"}, {"dep": "62", "from": "57", "arg": "PAT"}, {"dep": "67", "from": "66", "arg": "TIME"}, {"dep": "73", "from": "66", "arg": "ACT"}]}
|
1
data/kres_example_json/F0012782.srl.json
Normal file
1
data/kres_example_json/F0012782.srl.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"F0012782.9.0": [{"dep": "2", "from": "3", "arg": "ACT"}, {"dep": "4", "from": "3", "arg": "MWPRED"}, {"dep": "6", "from": "4", "arg": "TIME"}], "F0012782.5.0": [], "F0012782.6.0": [{"dep": "56", "from": "54", "arg": "MEANS"}], "F0012782.10.0": [], "F0012782.11.0": [], "F0012782.8.0": [], "F0012782.7.0": [{"dep": "3", "from": "7", "arg": "COND"}, {"dep": "6", "from": "7", "arg": "ACT"}, {"dep": "8", "from": "7", "arg": "MANN"}]}
|
1
data/kres_example_json/F0019343.srl.json
Normal file
1
data/kres_example_json/F0019343.srl.json
Normal file
File diff suppressed because one or more lines are too long
1
data/kres_example_json/F0025741.srl.json
Normal file
1
data/kres_example_json/F0025741.srl.json
Normal file
File diff suppressed because one or more lines are too long
1
data/kres_example_json/F0032377.srl.json
Normal file
1
data/kres_example_json/F0032377.srl.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"F0032377.38.0": [], "F0032377.32.4": [], "F0032377.29.0": [], "F0032377.33.0": [{"dep": "10", "from": "11", "arg": "GOAL"}, {"dep": "12", "from": "11", "arg": "ACT"}], "F0032377.25.0": [{"dep": "5", "from": "4", "arg": "PAT"}, {"dep": "11", "from": "13", "arg": "REC"}, {"dep": "14", "from": "13", "arg": "PAT"}, {"dep": "16", "from": "13", "arg": "LOC"}, {"dep": "19", "from": "23", "arg": "LOC"}, {"dep": "21", "from": "23", "arg": "TIME"}, {"dep": "22", "from": "23", "arg": "PAT"}, {"dep": "24", "from": "23", "arg": "ACT"}], "F0032377.32.0": [], "F0032377.35.1": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "7", "from": "8", "arg": "CAUSE"}, {"dep": "9", "from": "8", "arg": "PAT"}, {"dep": "12", "from": "13", "arg": "TIME"}], "F0032377.33.1": [{"dep": "2", "from": "3", "arg": "LOC"}, {"dep": "9", "from": "10", "arg": "ACT"}, {"dep": "13", "from": "10", "arg": "REG"}, {"dep": "17", "from": "19", "arg": "ACT"}, {"dep": "21", "from": "19", "arg": "GOAL"}, {"dep": "27", "from": "26", "arg": "PAT"}, {"dep": "33", "from": "34", "arg": "MANN"}, {"dep": "36", "from": "34", "arg": "PAT"}], "F0032377.32.5": [{"dep": "6", "from": "5", "arg": "MANN"}, {"dep": "13", "from": "16", "arg": "PAT"}, {"dep": "15", "from": "16", "arg": "TIME"}, {"dep": "18", "from": "16", "arg": "RESLT"}], "F0032377.33.2": [{"dep": "3", "from": "1", "arg": "PAT"}, {"dep": "8", "from": "5", "arg": "PAT"}], "F0032377.26.3": [{"dep": "1", "from": "3", "arg": "TIME"}, {"dep": "9", "from": "8", "arg": "ACT"}, {"dep": "13", "from": "14", "arg": "TIME"}, {"dep": "17", "from": "18", "arg": "LOC"}, {"dep": "18", "from": "14", "arg": "PAT"}], "F0032377.37.0": [], "F0032377.25.2": [{"dep": "1", "from": "3", "arg": "REC"}, {"dep": "6", "from": "9", "arg": "PAT"}, {"dep": "8", "from": "9", "arg": "ACT"}, {"dep": "11", "from": "9", "arg": "LOC"}, {"dep": "18", "from": "15", "arg": "RESLT"}], "F0032377.36.0": [], "F0032377.32.1": [{"dep": "1", "from": "10", "arg": "ACT"}, {"dep": "5", "from": "10", "arg": 
"MANN"}, {"dep": "12", "from": "10", "arg": "GOAL"}], "F0032377.26.1": [{"dep": "1", "from": "4", "arg": "PAT"}, {"dep": "3", "from": "4", "arg": "TIME"}, {"dep": "6", "from": "4", "arg": "GOAL"}, {"dep": "9", "from": "13", "arg": "TIME"}, {"dep": "12", "from": "13", "arg": "DUR"}, {"dep": "15", "from": "13", "arg": "PAT"}, {"dep": "20", "from": "18", "arg": "PAT"}], "F0032377.24.0": [{"dep": "1", "from": "2", "arg": "MANN"}, {"dep": "4", "from": "2", "arg": "ACT"}], "F0032377.31.0": [], "F0032377.30.0": [], "F0032377.23.0": [], "F0032377.32.3": [{"dep": "1", "from": "3", "arg": "ACT"}, {"dep": "4", "from": "3", "arg": "PAT"}, {"dep": "8", "from": "9", "arg": "PAT"}, {"dep": "11", "from": "9", "arg": "PAT"}], "F0032377.26.0": [{"dep": "1", "from": "4", "arg": "ACT"}, {"dep": "10", "from": "11", "arg": "QUANT"}, {"dep": "13", "from": "11", "arg": "PAT"}, {"dep": "18", "from": "19", "arg": "TIME"}], "F0032377.27.0": [], "F0032377.34.0": [], "F0032377.28.0": [{"dep": "4", "from": "3", "arg": "PAT"}], "F0032377.32.2": [{"dep": "1", "from": "5", "arg": "TIME"}, {"dep": "2", "from": "5", "arg": "REC"}, {"dep": "3", "from": "5", "arg": "PAT"}, {"dep": "7", "from": "5", "arg": "ACT"}, {"dep": "11", "from": "12", "arg": "REC"}, {"dep": "13", "from": "12", "arg": "ACT"}], "F0032377.26.2": [{"dep": "2", "from": "4", "arg": "ACT"}, {"dep": "5", "from": "4", "arg": "ACT"}], "F0032377.35.2": [{"dep": "8", "from": "7", "arg": "ACT"}, {"dep": "12", "from": "14", "arg": "REC"}, {"dep": "13", "from": "14", "arg": "MANN"}, {"dep": "15", "from": "14", "arg": "PAT"}], "F0032377.35.0": [{"dep": "9", "from": "12", "arg": "LOC"}, {"dep": "15", "from": "12", "arg": "PAT"}, {"dep": "20", "from": "21", "arg": "TIME"}, {"dep": "23", "from": "21", "arg": "GOAL"}], "F0032377.25.1": []}
|
1
data/kres_example_json/F0039402.srl.json
Normal file
1
data/kres_example_json/F0039402.srl.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"F0039402.11.0": [{"dep": "1", "from": "5", "arg": "ACT"}, {"dep": "7", "from": "5", "arg": "TIME"}], "F0039402.10.0": [{"dep": "2", "from": "8", "arg": "ACT"}, {"dep": "11", "from": "8", "arg": "PAT"}, {"dep": "16", "from": "14", "arg": "RESLT"}], "F0039402.12.1": [{"dep": "1", "from": "9", "arg": "ACT"}, {"dep": "5", "from": "9", "arg": "TIME"}, {"dep": "8", "from": "9", "arg": "PAT"}, {"dep": "11", "from": "9", "arg": "PAT"}], "F0039402.15.0": [{"dep": "2", "from": "10", "arg": "MANN"}, {"dep": "7", "from": "10", "arg": "TIME"}, {"dep": "11", "from": "10", "arg": "ACT"}, {"dep": "13", "from": "10", "arg": "RESLT"}, {"dep": "20", "from": "22", "arg": "PAT"}, {"dep": "24", "from": "22", "arg": "TIME"}], "F0039402.14.2": [{"dep": "9", "from": "17", "arg": "ACT"}, {"dep": "13", "from": "17", "arg": "PAT"}, {"dep": "16", "from": "17", "arg": "LOC"}, {"dep": "20", "from": "17", "arg": "LOC"}, {"dep": "32", "from": "31", "arg": "ACT"}], "F0039402.7.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "6", "from": "5", "arg": "ACT"}, {"dep": "22", "from": "24", "arg": "TIME"}, {"dep": "25", "from": "24", "arg": "PAT"}], "F0039402.5.2": [{"dep": "1", "from": "3", "arg": "ACT"}, {"dep": "6", "from": "3", "arg": "ACT"}, {"dep": "16", "from": "25", "arg": "ACT"}, {"dep": "20", "from": "25", "arg": "COND"}, {"dep": "27", "from": "25", "arg": "PAT"}], "F0039402.15.2": [], "F0039402.8.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "11", "from": "9", "arg": "MEANS"}, {"dep": "31", "from": "33", "arg": "PAT"}, {"dep": "34", "from": "33", "arg": "ACT"}], "F0039402.15.1": [{"dep": "7", "from": "8", "arg": "MANN"}, {"dep": "29", "from": "30", "arg": "ACT"}, {"dep": "33", "from": "30", "arg": "LOC"}], "F0039402.14.1": [], "F0039402.12.2": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "7", "from": "8", "arg": "GOAL"}, {"dep": "10", "from": "8", "arg": "PAT"}, {"dep": "15", "from": "18", "arg": "ACT"}, {"dep": "19", "from": "18", "arg": "MODAL"}, {"dep": "20", "from": 
"19", "arg": "PAT"}, {"dep": "33", "from": "37", "arg": "CAUSE"}, {"dep": "38", "from": "37", "arg": "ACT"}], "F0039402.6.0": [{"dep": "3", "from": "5", "arg": "ACT"}, {"dep": "10", "from": "7", "arg": "PAT"}], "F0039402.5.1": [], "F0039402.14.0": [{"dep": "1", "from": "8", "arg": "ACT"}, {"dep": "4", "from": "8", "arg": "PAT"}, {"dep": "12", "from": "14", "arg": "ACT"}, {"dep": "15", "from": "14", "arg": "PAT"}], "F0039402.12.0": [], "F0039402.5.0": [], "F0039402.13.1": [{"dep": "13", "from": "16", "arg": "ACT"}, {"dep": "19", "from": "26", "arg": "PAT"}, {"dep": "25", "from": "26", "arg": "TIME"}, {"dep": "31", "from": "32", "arg": "ACT"}, {"dep": "35", "from": "32", "arg": "PAT"}], "F0039402.9.0": [{"dep": "15", "from": "14", "arg": "ACT"}, {"dep": "18", "from": "14", "arg": "TIME"}], "F0039402.13.0": [{"dep": "2", "from": "8", "arg": "EVENT"}, {"dep": "4", "from": "8", "arg": "ACT"}, {"dep": "6", "from": "8", "arg": "REC"}, {"dep": "7", "from": "8", "arg": "MANN"}, {"dep": "10", "from": "8", "arg": "PAT"}, {"dep": "17", "from": "18", "arg": "MANN"}, {"dep": "19", "from": "18", "arg": "PAT"}, {"dep": "26", "from": "24", "arg": "PAT"}, {"dep": "28", "from": "30", "arg": "MANN"}, {"dep": "31", "from": "30", "arg": "PAT"}]}
|
|
@ -1,9 +1,11 @@
|
|||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
import json
|
||||
|
||||
ORIGPATH = Path("../data/kres_example") # we need the IDs
|
||||
INPATH = Path("../data/kres_example_srl")
|
||||
OUTPATH = Path("../data/kres_example_json")
|
||||
DEBUG = False
|
||||
|
||||
def get_origfile(filename):
|
||||
for origfile in ORIGPATH.iterdir():
|
||||
|
@ -13,36 +15,77 @@ def get_origfile(filename):
|
|||
|
||||
def extract_sentences(line_reader):
    """Group tab-separated token rows into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded ``bytes`` lines.

    Yields:
        One list per sentence; each element is the list of string fields of
        one row. A line that contains no tab (normally a blank line) closes
        the current sentence.
    """
    acc = []
    for raw in line_reader:
        # Strip only the line terminator: slicing off the last character
        # unconditionally would eat real data on a final line without "\n"
        # and would leave a stray "\r" on CRLF input.
        line = raw.decode("utf-8").rstrip("\r\n").split('\t')
        if len(line) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(line)
    # Input that does not end with a blank line still carries a sentence.
    if acc:
        yield acc
|
||||
|
||||
def to_sentence(sentence_arr):
    """Join field 1 of every token row into one space-separated string."""
    return " ".join(token[1] for token in sentence_arr)


def match_sentence_id(sentence, orig_dict):
    """Find the ID of the original sentence whose text equals ``sentence``.

    Args:
        sentence: space-joined sentence text, as produced by ``to_sentence``.
        orig_dict: mapping of sentence ID to an entry whose ``"tokens"`` list
            holds tokens with the text at index 2.

    Returns:
        The key of the first entry whose joined token text equals ``sentence``.

    Raises:
        KeyError: if no entry matches; the unmatched sentence is attached so
            the failure can be diagnosed.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)
|
||||
|
||||
def get_dep_rel(token):
    """Return the first argument label found on a token row, if any.

    Fields from index 14 onward are scanned in order; the first one that is
    not the placeholder "_" is reported. The result is a dict with

    * ``"arg"``  - the label found,
    * ``"from"`` - the position of that field counted from index 14
      (i.e. the i-th predicate in the sentence),
    * ``"dep"``  - field 0 of the token row.

    ``None`` is returned when every scanned field is "_". Only the first
    label is reported, even if later fields are filled as well.
    """
    if DEBUG:
        print(token)

    labelled = (
        (pred_no, label)
        for pred_no, label in enumerate(token[14:])
        if label != "_"
    )
    first = next(labelled, None)
    if first is None:
        return None

    pred_no, label = first
    return {
        "arg": label,
        "from": pred_no,  # i-th predicate in sentence
        "dep": token[0],
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # For every SRL-tagged file in INPATH: recover the sentence IDs from the
    # original TEI file, collect the argument relations of each sentence and
    # write them to OUTPATH as one JSON file per input file.

    par = Parser()
    OUTPATH.mkdir(exist_ok=True)

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        orig_dict = par.parse_tei(origfile)

        # Read everything up front so the handle is closed right away
        # instead of staying open until garbage collection.
        with infile.open("rb") as fp:
            raw_lines = fp.readlines()

        outdata = {}
        for sentence_arr in extract_sentences(raw_lines):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

        outfile = (OUTPATH / infile.name).with_suffix(".json")
        # print(outdata)
        # Context manager guarantees the JSON is flushed and the file closed.
        with outfile.open("w") as out_fp:
            json.dump(outdata, out_fp)
|
||||
|
|
|
@ -32,12 +32,10 @@ if __name__ == "__main__":
|
|||
|
||||
print("Processing file: " + str(kres_file))
|
||||
res_dict = par.parse_tei(kres_file)
|
||||
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
|
||||
print("Longest sentence: ", longest_sent)
|
||||
kres_out_str = ""
|
||||
|
||||
for _, sentence in res_dict.items():
|
||||
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
|
||||
kres_out_str += par.to_conll_2009_SRL(sentence)
|
||||
|
||||
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
|
||||
fp.write(kres_out_str.encode("utf-8"))
|
||||
|
|
|
@ -114,7 +114,7 @@ class Parser:
|
|||
return res_dict
|
||||
|
||||
|
||||
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
|
||||
def to_conll_2009_SRL(self, sentence_entry):
|
||||
|
||||
def fillpred(tsv_row):
|
||||
mrow = build_model_row(tsv_row)
|
||||
|
@ -122,8 +122,6 @@ class Parser:
|
|||
y = self.fillpred_model.predict([x])
|
||||
return y[0] # bool
|
||||
|
||||
apreds_string = '\t'.join(["_" for x in range(napreds)])
|
||||
|
||||
# works with kres, with parsed links
|
||||
out_str = ""
|
||||
for token in sentence_entry["tokens"]:
|
||||
|
@ -136,7 +134,7 @@ class Parser:
|
|||
[t_id] +
|
||||
[form for x in range(7)] +
|
||||
["0", "0", "modra", "modra", "_", "_"] +
|
||||
[apreds_string, "\n"]
|
||||
["\n"]
|
||||
)
|
||||
continue
|
||||
|
||||
|
@ -165,7 +163,6 @@ class Parser:
|
|||
sentence_entry["links"][t_id][0], # pdeprel
|
||||
"_", # fillpred
|
||||
"_", # pred
|
||||
apreds_string,
|
||||
"\n",
|
||||
]
|
||||
fprd = fillpred(row_list)
|
||||
|
|
Loading…
Reference in New Issue
Block a user