SRL tagging pipeline (output in .tsv)

This commit is contained in:
2019-02-24 22:23:32 +01:00
parent 9939bf0f55
commit b79721f6a7
25 changed files with 10104 additions and 4255 deletions

View File

@@ -1,6 +1,9 @@
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
class Parser:
# reads a TEI xml file and returns a dictionary:
@@ -15,6 +18,8 @@ class Parser:
self.W_TAGS = ['w']
self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc']
with Path("./fillpred_model/model.pickle").open("rb") as fp:
self.fillpred_model = pickle.load(fp)
def parse_tei(self, filepath):
@@ -111,11 +116,11 @@ class Parser:
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def fillpred(pos, feat):
    """Heuristically decide whether a token should be marked as a predicate.

    Args:
        pos: Part-of-speech tag of the token; only the value "V" matches.
        feat: Morphological features joined into one string with "|".

    Returns:
        True when ``pos`` is "V" and "main" is one of the "|"-separated
        entries of ``feat``; False otherwise.
    """
    # TODO (decision tree or bayes on mate training data)
    # Split first so that only an exact "main" feature matches, not a
    # substring of some longer feature name.
    return pos == "V" and "main" in feat.split("|")
def fillpred(tsv_row):
    """Predict the fillpred flag for one output row with the loaded model.

    ``tsv_row`` is converted by ``build_model_row``; every element of the
    result except the last is used as the feature vector, which is passed
    to ``self.fillpred_model.predict`` as a single-sample batch.

    Returns the first (and only) prediction.
    """
    # Drop the trailing element of the model row; only the leading
    # elements are fed to the classifier.
    features = build_model_row(tsv_row)[:-1]
    predictions = self.fillpred_model.predict([features])
    return predictions[0]  # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
@@ -137,7 +142,6 @@ class Parser:
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
fprd = fillpred(pos, feat)
"""
print(t_id)
@@ -146,10 +150,7 @@ class Parser:
print(token)
print(sentence_entry["links"])
"""
# format: 14 + apreds
out_str += '\t'.join(map(str,
[
row_list = [
t_id,
form,
token[3], # lemma
@@ -162,11 +163,18 @@ class Parser:
sentence_entry["links"][t_id][2], # phead
sentence_entry["links"][t_id][0], # deprel
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
]
fprd = fillpred(row_list)
row_list[12] = "Y" if fprd else "_"
row_list[13] = token[3] if fprd else "_"
# format: 14 + apreds
out_str += '\t'.join(map(str,
row_list
))
out_str += "\n" # newline at the end of sentence
return out_str