import xml.etree.ElementTree as ET from copy import deepcopy as DC from time import time import re import logging import sys import pickle log = logging.getLogger(__name__) ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace") XML_ID = "{http://www.w3.org/XML/1998/namespace}id" # |$ for a default empty match re_int = re.compile(r"t\d+|$") # For sorting a "s" section in ssj; returns key as integer. # example: "S123.t34" --> 34 def re_lmbd(el): s = re_int.findall(el)[0] if len(s) == 0: return 0 else: return int(s[1:]) class SsjEntry: def __init__(self, ssj_id, s, deep_links): # See ssj xml structure. self.id = ssj_id self.s = DC(s) self.deep_links = DC(deep_links) class SsjDict: def __init__(self): self.entries = {} """ def read_xml(self, filepath): # No data loss. log.info("SsjDict.read_xml({})".format(filepath)) t_start = time() tree = ET.parse(filepath) root = tree.getroot() stats = { "skipped": [], "duplicates": [] } for s in root.iter("s"): s_id = s.attrib[XML_ID] tokens = {} for token in s: if token.tag == "linkGrp": continue if token.tag == "w": tokens[token.attrib[XML_ID]] = { "msd": token.attrib["msd"], "lemma": token.attrib["lemma"], "word": token.text } elif token.tag == "c": tokens[token.attrib[XML_ID]] = { "word": token.text } else: # <S /> pass linkGrps = s.findall("linkGrp") if len(linkGrps) < 2: # Take only entries with both deep and shallow # syntactic annotation stats["skipped"].append(s_id) continue linkG = {} for el in linkGrps: if el.attrib["type"] == "dep": linkG["dep"] = el elif el.attrib["type"] == "SRL": linkG["SRL"] = el else: raise KeyError("Unknown linkGrp.") if s_id in self.entries: stats["duplicates"].append(s_id) self.entries[s_id] = SsjEntry( s_id, s.attrib["n"], tokens, create_edge_dict(linkG["dep"]), create_edge_dict(linkG["SRL"]) ) t_end = time() log.info("Time: {}s.".format(t_end - t_start)) log.info( "{} duplicates, skipped {} elements (missing linkGrp).".format( len(stats["duplicates"]), len(stats["skipped"])) ) """ def read_xml_v2(self, filepath): NS_DICT = { "tei": "http://www.tei-c.org/ns/1.0", "xml": "http://www.w3.org/XML/1998/namespace", } def ns_prefix(ns): return "{" + NS_DICT[ns] + "}" def helper_get_sentence(tree_s): # all w and pc elements ret = [] for el in tree_s.iter(): if ( el.tag == ns_prefix("tei") + "w" or el.tag == ns_prefix("tei") + "pc" ): ret.append(el) return ret def helper_get_functor_links(tree_s): # links for SRL linkGrp lg = None for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT): if linkGrp.attrib["type"] == "SRL": lg = linkGrp break else: return [] ret = [] for link in lg: ret.append(link) return ret def helper_gen_deep_links(link_list): deep_links = [] for link in link_list: deep_links.append({ "from": link.attrib["target"].split(" ")[0][1:], "to": link.attrib["target"].split(" ")[1][1:], "functor": link.attrib["ana"].split(":")[1] }) return deep_links log.info("SsjDict.read_xml({})".format(filepath)) t_start = time() stats = { "total_count": 0, "deep_roles_count": 0, "duplicated_sid": 0, } tree = ET.parse(filepath) root = tree.getroot() for s in root.findall(".//tei:s", NS_DICT): stats["total_count"] += 1 s_id = s.attrib[ns_prefix("xml") + "id"] # get_functors (deep semantic roles) functor_links = helper_get_functor_links(s) if len(functor_links) == 0: continue stats["deep_roles_count"] += 1 # get_sentence tokens = {} for token in helper_get_sentence(s): tid = token.attrib[ns_prefix("xml") + "id"] if token.tag == ns_prefix("tei") + "w": tokens[tid] = { "msd": token.attrib["ana"].split(":")[1], "lemma": token.attrib["lemma"], "word": token.text } elif token.tag == ns_prefix("tei") + "pc": tokens[tid] = { "word": token.text } else: log.warning("Unrecognized sentence element: " + token.tag) exit(1) if s_id in self.entries: log.warning("duplicated sentence: " + s_id) stats["duplicated_sid"] += 1 continue self.entries[s_id] = SsjEntry( s_id, tokens, helper_gen_deep_links(functor_links) ) t_end = time() log.info("Time: {}s.".format(t_end - t_start)) log.info(str(stats)) if __name__ == "__main__": # testing log.setLevel(logging.DEBUG) ch = logging.StreamHandler(sys.stdout) log.addHandler(ch) # Load fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml" ssj = SsjDict() ssj.read_xml_v2(fpath) with open("ssj_test.pickle", "wb") as file: pickle.dump(ssj, file)