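"""Parse the ssj500k TEI XML corpus into an in-memory dictionary of sentences.

SsjDict.read_xml_v2() keeps only sentences that carry SRL (semantic role)
links and stores, per sentence, its tokens plus the deep (functor) links
between them. Run as a script, the module parses the corpus and pickles the
resulting SsjDict.
"""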
import xml.etree.ElementTree as ET
from copy import deepcopy as DC
from time import time
import re
import logging
import sys
import pickle

log = logging.getLogger(__name__)

ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace")
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


# |$ for a default empty match
re_int = re.compile(r"t\d+|$")


# For sorting a "s" section in ssj; returns key as integer.
# example: "S123.t34" --> 34
def re_lmbd(el):
    s = re_int.findall(el)[0]
    if len(s) == 0:
        return 0
    else:
        return int(s[1:])
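
# Illustrative use of re_lmbd as a sort key (hypothetical token IDs):
#   sorted(["S12.t10", "S12.t2"], key=re_lmbd)  -->  ["S12.t2", "S12.t10"]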


class SsjEntry:
    def __init__(self, ssj_id, s, deep_links):
        # See ssj xml structure.
        self.id = ssj_id
        self.s = DC(s)
        self.deep_links = DC(deep_links)


class SsjDict:
    def __init__(self):
        self.entries = {}
"""
|
||
|
def read_xml(self, filepath):
|
||
|
# No data loss.
|
||
|
log.info("SsjDict.read_xml({})".format(filepath))
|
||
|
t_start = time()
|
||
|
tree = ET.parse(filepath)
|
||
|
root = tree.getroot()
|
||
|
stats = {
|
||
|
"skipped": [],
|
||
|
"duplicates": []
|
||
|
}
|
||
|
|
||
|
for s in root.iter("s"):
|
||
|
s_id = s.attrib[XML_ID]
|
||
|
tokens = {}
|
||
|
for token in s:
|
||
|
if token.tag == "linkGrp":
|
||
|
continue
|
||
|
|
||
|
if token.tag == "w":
|
||
|
tokens[token.attrib[XML_ID]] = {
|
||
|
"msd": token.attrib["msd"],
|
||
|
"lemma": token.attrib["lemma"],
|
||
|
"word": token.text
|
||
|
}
|
||
|
elif token.tag == "c":
|
||
|
tokens[token.attrib[XML_ID]] = {
|
||
|
"word": token.text
|
||
|
}
|
||
|
else:
|
||
|
# <S />
|
||
|
pass
|
||
|
|
||
|
linkGrps = s.findall("linkGrp")
|
||
|
if len(linkGrps) < 2:
|
||
|
# Take only entries with both deep and shallow
|
||
|
# syntactic annotation
|
||
|
stats["skipped"].append(s_id)
|
||
|
continue
|
||
|
|
||
|
linkG = {}
|
||
|
for el in linkGrps:
|
||
|
if el.attrib["type"] == "dep":
|
||
|
linkG["dep"] = el
|
||
|
elif el.attrib["type"] == "SRL":
|
||
|
linkG["SRL"] = el
|
||
|
else:
|
||
|
raise KeyError("Unknown linkGrp.")
|
||
|
|
||
|
if s_id in self.entries:
|
||
|
stats["duplicates"].append(s_id)
|
||
|
self.entries[s_id] = SsjEntry(
|
||
|
s_id,
|
||
|
s.attrib["n"],
|
||
|
tokens,
|
||
|
create_edge_dict(linkG["dep"]),
|
||
|
create_edge_dict(linkG["SRL"])
|
||
|
)
|
||
|
|
||
|
t_end = time()
|
||
|
log.info("Time: {}s.".format(t_end - t_start))
|
||
|
log.info(
|
||
|
"{} duplicates, skipped {} elements (missing linkGrp).".format(
|
||
|
len(stats["duplicates"]), len(stats["skipped"]))
|
||
|
)
|
||
|
"""
|
||
|
|
||
|

    def read_xml_v2(self, filepath):
        # Parse a TEI-encoded ssj500k file; keep only sentences with SRL links.
        NS_DICT = {
            "tei": "http://www.tei-c.org/ns/1.0",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }

        def ns_prefix(ns):
            return "{" + NS_DICT[ns] + "}"

        def helper_get_sentence(tree_s):
            # all w and pc elements
            ret = []
            for el in tree_s.iter():
                if (
                    el.tag == ns_prefix("tei") + "w" or
                    el.tag == ns_prefix("tei") + "pc"
                ):
                    ret.append(el)
            return ret

        def helper_get_functor_links(tree_s):
            # links from the SRL linkGrp
            lg = None
            for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT):
                if linkGrp.attrib["type"] == "SRL":
                    lg = linkGrp
                    break
            else:
                # no SRL linkGrp in this sentence
                return []
            ret = []
            for link in lg:
                ret.append(link)
            return ret

        def helper_gen_deep_links(link_list):
            deep_links = []
            for link in link_list:
                deep_links.append({
                    "from": link.attrib["target"].split(" ")[0][1:],
                    "to": link.attrib["target"].split(" ")[1][1:],
                    "functor": link.attrib["ana"].split(":")[1]
                })
            return deep_links

        log.info("SsjDict.read_xml_v2({})".format(filepath))
        t_start = time()
        stats = {
            "total_count": 0,
            "deep_roles_count": 0,
            "duplicated_sid": 0,
        }
        tree = ET.parse(filepath)
        root = tree.getroot()

        for s in root.findall(".//tei:s", NS_DICT):
            stats["total_count"] += 1
            s_id = s.attrib[ns_prefix("xml") + "id"]

            # get_functors (deep semantic roles)
            functor_links = helper_get_functor_links(s)
            if len(functor_links) == 0:
                continue
            stats["deep_roles_count"] += 1

            # get_sentence
            tokens = {}
            for token in helper_get_sentence(s):
                tid = token.attrib[ns_prefix("xml") + "id"]
                if token.tag == ns_prefix("tei") + "w":
                    tokens[tid] = {
                        "msd": token.attrib["ana"].split(":")[1],
                        "lemma": token.attrib["lemma"],
                        "word": token.text
                    }
                elif token.tag == ns_prefix("tei") + "pc":
                    tokens[tid] = {
                        "word": token.text
                    }
                else:
                    log.warning("Unrecognized sentence element: " + token.tag)
                    sys.exit(1)

            if s_id in self.entries:
                log.warning("duplicated sentence: " + s_id)
                stats["duplicated_sid"] += 1
                continue

            self.entries[s_id] = SsjEntry(
                s_id,
                tokens,
                helper_gen_deep_links(functor_links)
            )

        t_end = time()
        log.info("Time: {}s.".format(t_end - t_start))
        log.info(str(stats))


if __name__ == "__main__":
    # testing
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler(sys.stdout)
    log.addHandler(ch)

    # Load
    fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    ssj = SsjDict()
    ssj.read_xml_v2(fpath)
    with open("ssj_test.pickle", "wb") as file:
        pickle.dump(ssj, file)