174 lines
6.1 KiB
Python
174 lines
6.1 KiB
Python
from lxml import etree
|
|
import re
|
|
from parser.msd.msdmap import Msdmap
|
|
|
|
class Parser:
    """Read TEI XML corpus files (SSJ / KRES) and export CoNLL-2009 rows.

    ``parse_tei`` reads a TEI xml file and returns a dictionary::

        {<sentence_id>: {
            "sid": <sentence_id>,   # serves as index in MongoDB
            "text": <sentence text>,
            "tokens": [("w", id, form, lemma, msd) | ("c", id, form), ...],
            "links": {dep_id: (afun, dep_id, head_id)} for KRES, else None,
        }}
    """

    def __init__(self):
        """Create the MSD mapper and the tag lists used to classify elements."""
        self.msdmap = Msdmap()
        # Elements stored as ("w", id, form, lemma, msd) tokens.
        self.W_TAGS = ['w']
        # Elements stored as ("c", id, form) tokens.
        self.C_TAGS = ['c']
        # Elements that contribute a single space to the sentence text.
        self.S_TAGS = ['S', 'pc']

    @staticmethod
    def _parse_links(s_el):
        """Return ``{dep_id: (afun, dep_id, head_id)}`` for one sentence.

        Only the first ``links`` group found below ``s_el`` is read. The ids
        are the integer suffix after the last ``.`` of the ``dep`` / ``from``
        attributes.

        Raises:
            IOError: if the sentence contains no ``links`` element.
        """
        link_groups = s_el.findall(".//links")
        if not link_groups:
            raise IOError("Can't find links.")
        res_links = {}
        for link in link_groups[0]:
            dep = int(link.get("dep").split(".")[-1])
            head = int(link.get("from").split(".")[-1])
            res_links[dep] = (link.get("afun"), dep, head)
        return res_links

    def _parse_sentence(self, s_el, guess_corpus):
        """Collect the plain text and the token tuples of one ``<s>`` element.

        Args:
            s_el: the sentence element; all of its descendants are visited.
            guess_corpus: "KRES" or "SSJ"; selects where the msd is read from.

        Returns:
            ``(sentence_text, sentence_tokens)``.
        """
        text_parts = []
        tokens = []
        for el in s_el.iter():
            if el.tag in self.W_TAGS:
                el_id = el.get("id").split(".")[-1]
                if el_id.startswith("t"):
                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                if guess_corpus == "KRES":
                    msd = el.get("msd")
                else:
                    msd = el.get("ana").split(":")[-1]
                # An element without text must not break the concatenation.
                text_parts.append(el.text or "")
                tokens.append(
                    ("w", int(el_id), el.text, el.get("lemma"), msd)
                )
            elif el.tag in self.C_TAGS:
                # only Kres' C_TAGS have ids
                el_id = (el.get("id") or "none").split(".")[-1]
                text_parts.append(el.text or "")
                tokens.append(("c", el_id, el.text))
            elif el.tag in self.S_TAGS:
                # Kres' <S /> doesn't contain .text
                text_parts.append(" ")
            # everything else (links, linkGroups, ...) is skipped
        return "".join(text_parts), tokens

    def parse_tei(self, filepath):
        """Parse a TEI XML file into a dictionary keyed by sentence id.

        Args:
            filepath: object exposing ``open("rb")`` (e.g. ``pathlib.Path``).

        Returns:
            dict mapping ``"<file_or_div_id>.<p_id>.<s_id>"`` to a dict with
            the keys ``sid``, ``text``, ``tokens`` and ``links`` (``links`` is
            ``None`` unless the file was recognised as KRES).

        Raises:
            KeyError: if two sentences produce the same sentence id.
            IOError: if a KRES sentence has no ``links`` element.
        """
        # The context manager closes the file; only the read needs it open.
        with filepath.open("rb") as fp:
            raw_bytes = fp.read()

        # remove namespaces
        xml_text = raw_bytes.decode("utf-8")
        xml_text = re.sub(r'\sxmlns="[^"]+"', '', xml_text, count=1)
        xml_text = re.sub(' xml:', ' ', xml_text)
        root = etree.XML(xml_text.encode("utf-8"))

        # in ssj, there are divs, in Kres, there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")

        res_dict = {}
        for div in divs:
            f_id = div.get("id")
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text, sentence_tokens = self._parse_sentence(
                        s, guess_corpus
                    )
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError(
                            "duplicated id: {}".format(sentence_id)
                        )
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            self._parse_links(s)
                            if guess_corpus == "KRES" else None
                        ),
                    }
        return res_dict

    def to_conll_2009_SRL(self, sentence_entry, napreds=100):
        """Render one sentence entry as tab-separated CoNLL-2009 style rows.

        Works with kres, with parsed links: every "w" token needs an entry in
        ``sentence_entry["links"]``. Non-"w" tokens are skipped.

        Each row has 14 columns (id, form, lemma, plemma, pos, ppos, feat,
        pfeat, head, phead, deprel, pdeprel, fillpred, pred) followed by
        ``napreds`` placeholder columns containing ``_``.

        Args:
            sentence_entry: one value of the dictionary built by ``parse_tei``.
            napreds: number of trailing ``_`` argument columns per row.

        Returns:
            The rows, one per line, terminated by an empty line.
        """
        # Identical for every row, so build it once.
        apreds = "\t_" * napreds
        rows = []
        for token in sentence_entry["tokens"]:
            if token[0] != "w":
                continue

            t_id, form, lemma, msd = token[1], token[2], token[3], token[4]
            pos = self.msdmap.slo_msd_to_eng_pos(msd)
            feat = "|".join(self.msdmap.slo_msd_to_eng_long(msd).split(" "))
            # A predicate is filled only for verbs carrying the "main" feature.
            fprd = pos == "V" and "main" in feat.split("|")
            link = sentence_entry["links"][t_id]
            afun, head = link[0], link[2]

            # format: 14 + apreds
            columns = (
                t_id,                      # id
                form,                      # form
                lemma,                     # lemma
                lemma,                     # plemma
                pos,                       # pos
                pos,                       # ppos
                feat,                      # feat
                feat,                      # pfeat
                head,                      # head
                head,                      # phead
                afun,                      # deprel
                afun,                      # pdeprel
                "Y" if fprd else "_",      # fillpred
                lemma if fprd else "_",    # pred
            )
            rows.append(
                "\t".join(str(col) for col in columns) + apreds + "\n"
            )
        rows.append("\n")
        return "".join(rows)

    def to_conll_2009_full(self, sentence_entry):
        """Render every token of a sentence entry as ``id<TAB>form`` rows.

        Args:
            sentence_entry: dict with a "tokens" list of token tuples whose
                items 1 and 2 are the id and the form.

        Returns:
            One line per token, terminated by an empty line.
        """
        rows = [
            "{}\t{}\n".format(token[1], token[2])  # id, form
            for token in sentence_entry["tokens"]
        ]
        rows.append("\n")
        return "".join(rows)
|