This commit is contained in:
2019-02-27 16:58:04 +01:00
parent bcaf226b9e
commit 577c8418d2
10 changed files with 67 additions and 23 deletions

View File

@@ -1,9 +1,11 @@
from pathlib import Path
from parser.parser import Parser
import json
ORIGPATH = Path("../data/kres_example") # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
DEBUG = False
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
@@ -13,36 +15,77 @@ def get_origfile(filename):
def extract_sentences(line_reader):
    """Group the rows of a tab-separated file into sentences.

    Args:
        line_reader: iterable of ``bytes`` lines (UTF-8 encoded), e.g. the
            result of ``readlines()`` on a file opened in binary mode.

    Yields:
        One list per sentence; each element is the list of tab-separated
        fields of one row. A row that contains no tab at all (such as an
        empty line) ends the current sentence.
    """
    acc = []
    for raw in line_reader:
        # Strip only the line terminator. Blindly dropping the last character
        # would eat real data when the final line has no trailing newline.
        fields = raw.decode("utf-8").rstrip("\r\n").split("\t")
        if len(fields) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(fields)
    # Input that does not end with a separator line still carries a sentence.
    if acc:
        yield acc
def match_sentence_id(string, rd):
str1 = " ".join([token[1] for token in sentence_arr])
for k, e in rd.items():
str2 = " ".join(token[2] for token in dict_entry["tokens"])
if str1 == str2
def to_sentence(sentence_arr):
    """Return the sentence text for a list of token rows.

    Args:
        sentence_arr: list of rows; the field at index 1 of each row is used
            as the token's text.

    Returns:
        The index-1 fields of all rows joined by single spaces.
    """
    return " ".join(token[1] for token in sentence_arr)
def match_sentence_id(sentence, orig_dict):
    """Find the key of the original entry whose text equals ``sentence``.

    Args:
        sentence: space-joined token text to look for.
        orig_dict: mapping of sentence id -> entry; each entry has a
            ``"tokens"`` sequence whose items carry the token text at index 2.

    Returns:
        The key of the first entry whose space-joined token text is equal to
        ``sentence``.

    Raises:
        KeyError: if no entry matches; the message contains the sentence so
            the failing input can be identified.
    """
    for sid, entry in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in entry["tokens"])
        if sentence == orig_sentence:
            return sid
    raise KeyError(
        "no sentence in the original data matches: {!r}".format(sentence)
    )
def get_dep_rel(token):
    """Return the first argument relation recorded on a token row.

    The fields from index 14 onward are scanned in order; the first one that
    is not the placeholder ``"_"`` is reported.
    NOTE(review): these are presumably the per-predicate argument columns of
    the CoNLL-2009 layout -- confirm against the file that writes the rows.

    Args:
        token: list of string fields of one row.

    Returns:
        A dict with ``"arg"`` (the field value), ``"from"`` (zero-based
        position of the field among the scanned columns, i.e. the i-th
        predicate in the sentence) and ``"dep"`` (the row's index-0 field),
        or ``None`` when every scanned field is ``"_"`` or the row has no
        such columns.
    """
    first_arg_column = 14

    if DEBUG:
        print(token)

    for i, field in enumerate(token[first_arg_column:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
if __name__ == "__main__":
    # For every file in INPATH, write a JSON file to OUTPATH that maps each
    # sentence id to the list of relations found by get_dep_rel().
    par = Parser()
    OUTPATH.mkdir(exist_ok=True)

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        orig_dict = par.parse_tei(origfile)

        outdata = {}
        # Context manager so the input handle is closed after each file.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp.readlines()):
                # tsv dropped sentence ids, match the ID, using original data
                sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
                outdata[sid] = []

                # find all predicate indices in the sentence
                predicates = []
                for token in sentence_arr:
                    if token[12] == "Y":
                        predicates.append(token[0])  # idx
                    deprel = get_dep_rel(token)
                    if deprel is not None:
                        outdata[sid].append(deprel)

                # deprel["from"] points to n-th predicate
                # replace with predicate's token index
                for deprel in outdata[sid]:
                    deprel["from"] = predicates[deprel["from"]]

                if DEBUG:
                    print(to_sentence(sentence_arr))
                    print(outdata[sid])
                    print(sid)
                    print()
                    print()

        outfile = (OUTPATH / infile.name).with_suffix(".json")
        # Close (and thereby flush) the output file deterministically.
        with outfile.open("w") as out_fp:
            json.dump(outdata, out_fp)

View File

@@ -32,12 +32,10 @@ if __name__ == "__main__":
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
kres_out_str += par.to_conll_2009_SRL(sentence)
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))

View File

@@ -114,7 +114,7 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def to_conll_2009_SRL(self, sentence_entry):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
@@ -122,8 +122,6 @@ class Parser:
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
@@ -136,7 +134,7 @@ class Parser:
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
[apreds_string, "\n"]
["\n"]
)
continue
@@ -165,7 +163,6 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
fprd = fillpred(row_list)