# cjvt-valency/dip_src/valency/ssj_struct.py
# 2019-03-20 17:49:34 +01:00
#
# 219 lines
# 6.3 KiB
# Python

import xml.etree.ElementTree as ET
from copy import deepcopy as DC
from time import time
import re
import logging
import sys
import pickle
# Module-level logger; this module attaches a handler only when it is run as
# a script (see the __main__ section at the bottom of the file).
log = logging.getLogger(__name__)

# URI of the reserved XML namespace, registered with ElementTree under the
# "xml" prefix.
_XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
ET.register_namespace("xml", _XML_NAMESPACE)

# Name under which ElementTree exposes the xml:id attribute ("{uri}id").
XML_ID = "{" + _XML_NAMESPACE + "}id"
# The "|$" alternative makes the pattern always match: when no "t<digits>"
# part exists, the empty string at the end of the input is matched instead.
re_int = re.compile(r"t\d+|$")


def re_lmbd(el):
    """Sort key for the items of an ssj "s" section: the token number.

    The first "t<digits>" occurrence in *el* is returned as an integer,
    e.g. "S123.t34" --> 34.  When *el* contains no such part, 0 is returned.
    """
    # search() cannot return None here because of the "|$" fallback.
    token_part = re_int.search(el).group()
    # Drop the leading "t" before converting; an empty match means "no number".
    return int(token_part[1:]) if token_part else 0
class SsjEntry:
    """A single ssj sentence together with its deep links.

    Attributes:
        id: The sentence identifier, stored as given.
        s: A deep copy of the ``s`` argument (see ssj xml structure).
        deep_links: A deep copy of the ``deep_links`` argument.
    """

    def __init__(self, ssj_id, s, deep_links):
        # Both containers are deep-copied so that later changes made by the
        # caller to its own objects do not leak into this entry.
        sentence_copy = DC(s)
        links_copy = DC(deep_links)

        self.id = ssj_id
        self.s = sentence_copy
        self.deep_links = links_copy
class SsjDict:
    """Collection of SRL-annotated sentences read from a TEI-encoded ssj file.

    Attributes:
        entries: Maps a sentence ``xml:id`` to the ``SsjEntry`` built for it
            by :meth:`read_xml_v2`.
    """

    # Prefix -> namespace URI map used in ElementTree path queries.
    _NS = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    # Fully qualified ("{uri}name") tag and attribute names, as ElementTree
    # reports them on parsed elements.
    _TAG_W = "{" + _NS["tei"] + "}w"
    _TAG_PC = "{" + _NS["tei"] + "}pc"
    _ATTR_ID = "{" + _NS["xml"] + "}id"

    def __init__(self):
        self.entries = {}

    @classmethod
    def _srl_links(cls, sentence):
        """Return the children of the first ``linkGrp`` of type "SRL".

        Only direct ``tei:linkGrp`` children of *sentence* are considered.
        An empty list is returned when there is no such group; a ``linkGrp``
        without a ``type`` attribute is treated as "not SRL".
        """
        for group in sentence.findall("tei:linkGrp", cls._NS):
            if group.attrib.get("type") == "SRL":
                return list(group)
        return []

    @classmethod
    def _tokens(cls, sentence):
        """Map token ``xml:id`` to token data for all ``w``/``pc`` elements.

        Every descendant of *sentence* is inspected. Words (``w``) yield
        "msd", "lemma" and "word"; punctuation (``pc``) yields only "word".
        All other elements are ignored.
        """
        tokens = {}
        for el in sentence.iter():
            if el.tag == cls._TAG_W:
                tokens[el.attrib[cls._ATTR_ID]] = {
                    # Keep the part of "ana" after the first colon.
                    "msd": el.attrib["ana"].split(":")[1],
                    "lemma": el.attrib["lemma"],
                    "word": el.text,
                }
            elif el.tag == cls._TAG_PC:
                tokens[el.attrib[cls._ATTR_ID]] = {"word": el.text}
        return tokens

    @staticmethod
    def _deep_links(link_list):
        """Convert SRL ``link`` elements to from/to/functor dictionaries."""
        deep_links = []
        for link in link_list:
            # "target" holds two whitespace-separated references; the first
            # character of each (presumably "#") is dropped.
            refs = link.attrib["target"].split()
            deep_links.append({
                "from": refs[0][1:],
                "to": refs[1][1:],
                # Keep the part of "ana" after the first colon.
                "functor": link.attrib["ana"].split(":")[1],
            })
        return deep_links

    def read_xml_v2(self, filepath):
        """Add every SRL-annotated sentence of a TEI file to ``entries``.

        Sentences (``tei:s``) without any SRL link are skipped, as are
        sentences whose ``xml:id`` is already present in ``entries`` (a
        warning is logged for the latter). Elapsed time and the counters
        are logged at INFO level once the whole file has been processed.

        Args:
            filepath: Source passed unchanged to ``ElementTree.parse``.

        Raises:
            KeyError: A sentence or token lacks ``xml:id``, a word lacks
                ``ana`` or ``lemma``, or a link lacks ``target`` or ``ana``.
            IndexError: An ``ana`` value has no colon, or a ``target`` value
                has fewer than two references.
        """
        log.info("SsjDict.read_xml_v2(%s)", filepath)
        t_start = time()
        stats = {
            "total_count": 0,
            "deep_roles_count": 0,
            "duplicated_sid": 0,
        }

        root = ET.parse(filepath).getroot()
        for s in root.findall(".//tei:s", self._NS):
            stats["total_count"] += 1
            s_id = s.attrib[self._ATTR_ID]

            # Deep semantic roles; sentences without them are not stored.
            functor_links = self._srl_links(s)
            if not functor_links:
                continue
            stats["deep_roles_count"] += 1

            # Checked before the tokens are parsed so that no work is spent
            # on a sentence that is going to be discarded anyway.
            if s_id in self.entries:
                log.warning("duplicated sentence: " + s_id)
                stats["duplicated_sid"] += 1
                continue

            self.entries[s_id] = SsjEntry(
                s_id,
                self._tokens(s),
                self._deep_links(functor_links),
            )

        log.info("Time: %ss.", time() - t_start)
        log.info(str(stats))
if __name__ == "__main__":
    # Manual test run: show this module's log output on stdout, parse the
    # corpus file and pickle the resulting SsjDict.
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stdout))

    # Load
    corpus_path = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    ssj = SsjDict()
    ssj.read_xml_v2(corpus_path)

    # Store
    with open("ssj_test.pickle", "wb") as pickle_out:
        pickle.dump(ssj, pickle_out)