# cjvt-srl-tagging/tools/parser/parser.py
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys


class Parser:
    # reads a TEI xml file and returns a dictionary:
    # { <sentence_id>: {
    #     sid: <sentence_id>,    # serves as index in MongoDB
    #     text: <plain sentence text>,
    #     tokens: <list of token tuples>,
    #     links: <dependency links (KRES only)>,
    # }}
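    #
    # Example of one returned entry (illustrative sketch):
    # "<file_id>.<p_id>.<s_id>": {
    #     "sid": "<file_id>.<p_id>.<s_id>",
    #     "text": "Sentence text ...",
    #     "tokens": [("w", 1, "<form>", "<lemma>", "<msd>"), ("c", "2", ","), ...],
    #     "links": {1: ("<afun>", 1, <head_id>), ...},
    # }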

    def __init__(self):
        self.msdmap = Msdmap()
        self.W_TAGS = ['w']
        self.C_TAGS = ['c']
        self.S_TAGS = ['S', 'pc']
        try:
            with Path("./fillpred_model/model.pickle").open("rb") as fp:
                self.fillpred_model = pickle.load(fp)
        except IOError:
            print("Generate the model first: $ make tools/fillpred_model/model.pickle")
            sys.exit(1)

    def parse_tei(self, filepath):

        def parse_links(s_el):
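            # Expected dependency markup inside <s> (illustrative sketch):
            #   <links>
            #     <link afun="<deprel>" from="<sid>.<head>" dep="<sid>.<dep>"/>
            #     ...
            #   </links>
            # Returns {dep: (afun, dep, head)}, keyed by the dependent's index.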
            lgrps = s_el.findall(".//links")
            if len(lgrps) < 1:
                raise IOError("Can't find links.")
            res_links = {}
            for link in lgrps[0]:
                dep = int(link.get("dep").split(".")[-1])
                res_links[dep] = (
                    link.get("afun"),
                    dep,
                    int(link.get("from").split(".")[-1]),
                )
            return res_links

        guess_corpus = None  # SSJ | KRES | GIGA
        res_dict = {}
        with filepath.open("rb") as fp:
            # strip the default namespace and the xml: prefix so that plain
            # tag and attribute names can be used in the queries below
            bstr = fp.read()
            utf8str = bstr.decode("utf-8")
            utf8str = re.sub(r'\sxmlns="[^"]+"', '', utf8str, count=1)
            utf8str = re.sub(' xml:', ' ', utf8str)
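            # e.g. the two substitutions above turn (illustrative)
            #   <TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="F0000001">
            # into
            #   <TEI id="F0000001">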
            root = etree.XML(utf8str.encode("utf-8"))

            divs = []  # in SSJ there are divs; in KRES and GIGA, separate files
            if "id" in root.keys():
                # KRES and GIGA files start with <TEI id=...>;
                # GIGA ids begin with "GF"
                if root.get("id")[0:2] == 'GF':
                    guess_corpus = "GIGA"
                else:
                    guess_corpus = "KRES"
                divs = [root]
            else:
                guess_corpus = "SSJ"
                divs = root.findall(".//div")

            # parse divs
            for div in divs:
                f_id = div.get("id")[:-6]  # strip the 6-character id suffix
                if guess_corpus == "GIGA":
                    div = div.findall(".//body")[0]

                # parse paragraphs
                for p in div.findall(".//p"):
                    p_id = p.get("id").split(".")[-1]

                    # parse sentences
                    for s in p.findall(".//s"):
                        s_id = s.get("id").split(".")[-1]
                        sentence_text = ""
                        sentence_list = []
                        sentence_tokens = []

                        # parse tokens
                        for el in s.iter():
                            if el.tag in self.W_TAGS:
                                if guess_corpus != "GIGA":
                                    el_id = el.get("id").split(".")[-1]
                                    if el_id[0] == 't':
                                        el_id = el_id[1:]  # SSJ W_TAG ids start with t
                                    sentence_text += el.text
                                    sentence_tokens += [(
                                        "w",
                                        int(el_id),
                                        el.text,
                                        el.get("lemma"),
                                        (el.get("msd") if guess_corpus in ("KRES", "GIGA")
                                         else el.get("ana").split(":")[-1]),
                                    )]
                                else:
                                    sentence_list.append(el.text)
                            elif el.tag in self.C_TAGS:
                                # only KRES' C_TAGS have ids
                                if guess_corpus != "GIGA":
                                    el_id = el.get("id") or "none"
                                    el_id = el_id.split(".")[-1]
                                    sentence_text += el.text
                                    sentence_tokens += [("c", el_id, el.text,)]
                            elif el.tag in self.S_TAGS:
                                # KRES' <S /> doesn't contain .text
                                if guess_corpus == "GIGA":
                                    sentence_list.append(el.text)
                                else:
                                    sentence_text += " "
                            else:
                                # skip links and linkGroups; they are read
                                # separately by parse_links()
                                pass

                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                        if sentence_id in res_dict:
                            raise KeyError("duplicated id: {}".format(sentence_id))
                        if guess_corpus == "GIGA":
                            res_dict[sentence_id] = {
                                "sid": sentence_id,
                                "text": ' '.join(sentence_list),
                                "tokens": None,
                                "links": None,
                            }
                        else:
                            res_dict[sentence_id] = {
                                "sid": sentence_id,
                                "text": sentence_text,
                                "tokens": sentence_tokens,
                                "links": (
                                    parse_links(s) if guess_corpus == "KRES" else None
                                ),
                            }
        return res_dict

    def to_conll_2009_SRL(self, sentence_entry):

        def fillpred(tsv_row):
            mrow = build_model_row(tsv_row)
            x = mrow[:-1]  # drop the last element of the model row before predicting
            y = self.fillpred_model.predict([x])
            return y[0]  # bool

        # works with KRES input, which has parsed links
        out_str = ""
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            form = token[2]

            # handle punctuation tokens: copy the form into columns 2-8
            if token[0] != "w":
                out_str += '\t'.join(
                    [t_id] +
                    [form for x in range(7)] +
                    ["0", "0", "modra", "modra", "_", "_"] +
                    ["\n"]
                )
                continue

            pos = self.msdmap.slo_msd_to_eng_pos(token[4])
            feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))

            # debug output:
            # print(t_id)
            # print("msd:")
            # print(msd)
            # print(token)
            # print(sentence_entry["links"])
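            # CoNLL-2009 column layout (14 fixed columns, then APREDs):
            #   ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD
            #   DEPREL PDEPREL FILLPRED PRED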
            row_list = [
                t_id,
                form,
                token[3],  # lemma
                token[3],  # plemma
                pos,  # pos
                pos,  # ppos
                feat,  # feat
                feat,  # pfeat
                sentence_entry["links"][t_id][2],  # head
                sentence_entry["links"][t_id][2],  # phead
                sentence_entry["links"][t_id][0],  # deprel
                sentence_entry["links"][t_id][0],  # pdeprel
                "_",  # fillpred
                "_",  # pred
                "\n",
            ]
            fprd = fillpred(row_list)
            row_list[12] = "Y" if fprd else "_"
            row_list[13] = token[3] if fprd else "_"
            # format: 14 columns + apreds
            out_str += '\t'.join(map(str, row_list))
        out_str += "\n"  # newline at the end of sentence
        return out_str

    def to_conll_2009_full(self, sentence_entry):
        out_str = ""
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            # write only the ID and FORM columns
            out_str += "{}\t{}\n".format(
                t_id,  # id
                token[2],  # form
            )
        out_str += "\n"
        return out_str
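

# Example usage (a minimal sketch; the input path is hypothetical):
#
#     from pathlib import Path
#     from parser.parser import Parser
#
#     parser = Parser()
#     sentences = parser.parse_tei(Path("data/kres/F0000001.xml"))
#     for sid, entry in sentences.items():
#         # SRL output needs parsed links, i.e. KRES input
#         print(parser.to_conll_2009_SRL(entry))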