# cjvt-srl-tagging/tools/parser/parser.py

import re

from lxml import etree

W_TAGS = ['w']
C_TAGS = ['c']
S_TAGS = ['S', 'pc']

# Reads a TEI XML file and returns a dictionary:
# { <sentence_id>: {
#     sid: <sentence_id>,  # serves as index in MongoDB
#     text: <plain sentence text>,
#     tokens: <list of token tuples>,
#     links: <dependency links, KRES only (see parse_links)>,
# }}
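#
# For illustration only, a hypothetical KRES-style entry (all ids, words
# and labels below are made up):
#
#   res_dict["F0000001.1.1"] == {
#       "sid": "F0000001.1.1",
#       "text": "Danes sije sonce.",
#       "tokens": [("w", 1, "Danes", "danes", "Rgp"), ...],
#       "links": {1: ("dol", 1, 2), ...},
#   }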


def parse_tei(filepath):
    guess_corpus = None  # SSJ | KRES
    res_dict = {}
    with open(filepath, "r") as fp:
        # remove namespaces
        xmlstr = fp.read()
        xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
        xmlstr = re.sub(' xml:', ' ', xmlstr)
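        # For example (hypothetical input), the first substitution turns
        #   <TEI xmlns="http://www.tei-c.org/ns/1.0">  into  <TEI>
        # and the second rewrites attributes such as xml:id="..." to
        # id="...", so elements and attributes can be looked up below
        # without namespace prefixes.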
        root = etree.XML(xmlstr.encode("utf-8"))

        divs = []  # in SSJ there are divs; in Kres there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")
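        # Illustration (hypothetical file): a Kres file such as
        # F0000001.xml has <TEI id="F0000001"> as its root element,
        # while SSJ bundles many <div id=...> elements in one file.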

        # parse divs
        for div in divs:
            f_id = div.get("id")

            # parse paragraphs
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]

                # parse sentences
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text = ""
                    sentence_tokens = []

                    # parse tokens
                    for el in s.iter():
                        if el.tag in W_TAGS:
                            el_id = el.get("id").split(".")[-1]
                            if el_id[0] == 't':
                                el_id = el_id[1:]  # ssj W_TAG ids start with t
                            sentence_text += el.text
                            # Kres carries the MSD tag in msd=...; SSJ stores
                            # it after the colon in ana=...
                            sentence_tokens += [(
                                "w",
                                int(el_id),
                                el.text,
                                el.get("lemma"),
                                (el.get("msd") if guess_corpus == "KRES"
                                    else el.get("ana").split(":")[-1]),
                            )]
                        elif el.tag in C_TAGS:
                            # only Kres' C_TAGS have ids
                            el_id = el.get("id") or "none"
                            el_id = el_id.split(".")[-1]
                            sentence_text += el.text
                            sentence_tokens += [("c", el_id, el.text,)]
                        elif el.tag in S_TAGS:
                            # Kres' <S /> doesn't contain .text
                            sentence_text += " "
                        else:
                            # skip <links> and <linkGroups>; dependency links
                            # are handled separately by parse_links()
                            pass

                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            parse_links(s) if guess_corpus == "KRES" else None
                        ),
                    }
    return res_dict


def parse_links(s_el):
    lgrps = s_el.findall(".//links")
    if len(lgrps) < 1:
        raise IOError("Can't find links.")
    res_links = {}
    for link in lgrps[0]:
        # map each dependent's token id to (afun, dep, head)
        dep = int(link.get("dep").split(".")[-1])
        res_links[dep] = (
            link.get("afun"),
            dep,
            int(link.get("from").split(".")[-1]),
        )
    return res_links
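
# For illustration, a hypothetical <link> element and the entry it produces
# (token ids and the afun label are made up):
#   <link afun="dol" from="F0000001.1.1.2" dep="F0000001.1.1.1"/>
#   -> res_links[1] == ("dol", 1, 2)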


def to_conll09(sentence_entry):
    # works with Kres entries, whose links have been parsed
    out_str = ""
    for token in sentence_entry["tokens"]:
        # skip non-word tokens; only "w" tokens carry dependency links
        if token[0] != "w":
            continue
        t_id = token[1]
        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,                              # id
            token[2],                          # form
            token[3],                          # lemma
            token[3],                          # plemma
            "todo",                            # pos (TODO)
            "todo",                            # ppos (TODO)
            "todo",                            # feat (TODO)
            "todo",                            # pfeat (TODO)
            sentence_entry["links"][t_id][2],  # head
            sentence_entry["links"][t_id][2],  # phead
            sentence_entry["links"][t_id][0],  # deprel (afun)
            sentence_entry["links"][t_id][0],  # pdeprel (afun)
        )
    # CoNLL sentences are separated by an empty line
    out_str += "\n"
    return out_str
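

# A minimal usage sketch (file paths are hypothetical); it assumes a single
# Kres TEI file, since to_conll09 needs the parsed links:
if __name__ == "__main__":
    res = parse_tei("/path/to/kres/F0000001.xml")
    with open("/tmp/out.conll09", "w") as out_fp:
        for entry in res.values():
            out_fp.write(to_conll09(entry))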