mate-tools tags the corpus. Predicates still need to be specified more precisely.

This commit is contained in:
voje
2019-02-20 07:38:26 +01:00
parent 142ad22ba3
commit 91eda1962e
18 changed files with 10766 additions and 9761 deletions

View File

@@ -109,20 +109,32 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry, napreds=100):
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def fillpred(pos, feat):
# TODO (decision tree or bayes on mate training data)
if pos == "V" and "main" in feat.split("|"):
return True
return False
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
if token[0] != "w":
continue
t_id = token[1]
form = token[2]
# handle stop signs
if token[0] != "w":
out_str += '\t'.join(
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
[apreds_string, "\n"]
)
continue
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
fprd = fillpred(pos, feat)
@@ -135,28 +147,28 @@ class Parser:
print(sentence_entry["links"])
"""
apreds = "".join(["\t_" for x in range(napreds)])
# format: 14 + apreds
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\n".format(
t_id, # id
token[2], # form
token[3], # lemma
token[3], # plemma
pos, # pos
pos, # ppos
feat, # feat
feat, # pfeat
sentence_entry["links"][t_id][2], # head
sentence_entry["links"][t_id][2], # phead
sentence_entry["links"][t_id][0], # deprel
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
apreds,
)
out_str += "\n"
# print(out_str)
out_str += '\t'.join(map(str,
[
t_id,
form,
token[3], # lemma
token[3], # plemma
pos, # pos
pos, # ppos
feat, # feat
feat, # pfeat
sentence_entry["links"][t_id][2], # head
sentence_entry["links"][t_id][2], # phead
sentence_entry["links"][t_id][0], # deprel
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
apreds_string,
"\n",
]
))
out_str += "\n" # newline at the end of sentence
return out_str