added formatter for mate-tools parse_full.sh

This commit is contained in:
voje 2019-02-13 16:49:45 +01:00
parent b3a39d74ef
commit f9f7fae76a
5 changed files with 20 additions and 2 deletions

View File

@ -17,7 +17,11 @@ if __name__ == "__main__":
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = "../data/kres_example/"
for kres_file in os.listdir(kres_dir):
out_file = ""
res_dict = parser.parse_tei(join(kres_dir, kres_file))
for _, sentence in res_dict.items():
parser.to_conll09(sentence)
out_file += parser.to_conll_2009_full(sentence)
with open(join(kres_dir, kres_file + ".tsv"), "wb+") as fp:
fp.write(out_file.encode("utf-8"))
fp.close()
print("end parsing kres")

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -90,6 +90,7 @@ def parse_tei(filepath):
parse_links(s) if guess_corpus == "KRES" else None
)
}
fp.close()
return res_dict
@ -108,7 +109,7 @@ def parse_links(s_el):
return res_links
def to_conll09(sentence_entry):
def to_conll_2009_SRL(sentence_entry):
def fillpred(pos, feat):
if False:
@ -154,3 +155,16 @@ def to_conll09(sentence_entry):
out_str += "\n"
print(out_str)
return out_str
def to_conll_2009_full(sentence_entry):
    """Serialize one sentence as tab-separated ``id<TAB>form`` lines.

    Args:
        sentence_entry: Mapping with a ``"tokens"`` key holding an iterable
            of indexable token records, where ``token[1]`` is the token id
            and ``token[2]`` is the surface form.

    Returns:
        A string with one ``"<id>\\t<form>\\n"`` line per token, followed by
        a single empty line that terminates the sentence. An entry with no
        tokens yields just ``"\\n"``.

    NOTE(review): only the id and form columns are written; the remaining
    CoNLL-2009 columns are not emitted. Confirm this is what the consumer
    of this output expects.
    """
    # Collect the pieces and join once instead of growing a string with
    # ``+=`` on every token.
    lines = [
        "{}\t{}\n".format(
            token[1],  # id
            token[2],  # form
        )
        for token in sentence_entry["tokens"]
    ]
    # Blank line separates this sentence from the next one in the output.
    lines.append("\n")
    return "".join(lines)