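"""Parse the ssj500k TEI XML corpus into an in-memory dictionary of sentences.

SsjDict.read_xml_v2() keeps only sentences that carry SRL (semantic role)
links and stores, per sentence, its tokens plus the deep (functor) links
between them. Run as a script, the module parses the corpus and pickles the
resulting SsjDict.
"""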
import xml.etree.ElementTree as ET
from copy import deepcopy as DC
from time import time
import re
import logging
import sys
import pickle

log = logging.getLogger(__name__)

ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace")
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


# |$ for a default empty match
re_int = re.compile(r"t\d+|$")


# For sorting a "s" section in ssj; returns key as integer.
# example: "S123.t34" --> 34
def re_lmbd(el):
    s = re_int.findall(el)[0]
    if len(s) == 0:
        return 0
    else:
        return int(s[1:])
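
# Illustrative use of re_lmbd as a sort key (hypothetical token IDs):
#   sorted(["S12.t10", "S12.t2"], key=re_lmbd)  -->  ["S12.t2", "S12.t10"]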


class SsjEntry:
    def __init__(self, ssj_id, s, deep_links):
        # See ssj xml structure.
        self.id = ssj_id
        self.s = DC(s)
        self.deep_links = DC(deep_links)


class SsjDict:
    def __init__(self):
        self.entries = {}
"""
|
||
|
def read_xml(self, filepath):
|
||
|
# No data loss.
|
||
|
log.info("SsjDict.read_xml({})".format(filepath))
|
||
|
t_start = time()
|
||
|
tree = ET.parse(filepath)
|
||
|
root = tree.getroot()
|
||
|
stats = {
|
||
|
"skipped": [],
|
||
|
"duplicates": []
|
||
|
}
|
||
|
|
||
|
for s in root.iter("s"):
|
||
|
s_id = s.attrib[XML_ID]
|
||
|
tokens = {}
|
||
|
for token in s:
|
||
|
if token.tag == "linkGrp":
|
||
|
continue
|
||
|
|
||
|
if token.tag == "w":
|
||
|
tokens[token.attrib[XML_ID]] = {
|
||
|
"msd": token.attrib["msd"],
|
||
|
"lemma": token.attrib["lemma"],
|
||
|
"word": token.text
|
||
|
}
|
||
|
elif token.tag == "c":
|
||
|
tokens[token.attrib[XML_ID]] = {
|
||
|
"word": token.text
|
||
|
}
|
||
|
else:
|
||
|
# <S />
|
||
|
pass
|
||
|
|
||
|
linkGrps = s.findall("linkGrp")
|
||
|
if len(linkGrps) < 2:
|
||
|
# Take only entries with both deep and shallow
|
||
|
# syntactic annotation
|
||
|
stats["skipped"].append(s_id)
|
||
|
continue
|
||
|
|
||
|
linkG = {}
|
||
|
for el in linkGrps:
|
||
|
if el.attrib["type"] == "dep":
|
||
|
linkG["dep"] = el
|
||
|
elif el.attrib["type"] == "SRL":
|
||
|
linkG["SRL"] = el
|
||
|
else:
|
||
|
raise KeyError("Unknown linkGrp.")
|
||
|
|
||
|
if s_id in self.entries:
|
||
|
stats["duplicates"].append(s_id)
|
||
|
self.entries[s_id] = SsjEntry(
|
||
|
s_id,
|
||
|
s.attrib["n"],
|
||
|
tokens,
|
||
|
create_edge_dict(linkG["dep"]),
|
||
|
create_edge_dict(linkG["SRL"])
|
||
|
)
|
||
|
|
||
|
t_end = time()
|
||
|
log.info("Time: {}s.".format(t_end - t_start))
|
||
|
log.info(
|
||
|
"{} duplicates, skipped {} elements (missing linkGrp).".format(
|
||
|
len(stats["duplicates"]), len(stats["skipped"]))
|
||
|
)
|
||
|
"""
|
||
|
|
||
|

    def read_xml_v2(self, filepath):
        # Parse a TEI-encoded ssj500k file; keep only sentences with SRL links.
        NS_DICT = {
            "tei": "http://www.tei-c.org/ns/1.0",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }

        def ns_prefix(ns):
            return "{" + NS_DICT[ns] + "}"

        def helper_get_sentence(tree_s):
            # all w and pc elements
            ret = []
            for el in tree_s.iter():
                if (
                    el.tag == ns_prefix("tei") + "w" or
                    el.tag == ns_prefix("tei") + "pc"
                ):
                    ret.append(el)
            return ret

        def helper_get_functor_links(tree_s):
            # links from the SRL linkGrp
            lg = None
            for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT):
                if linkGrp.attrib["type"] == "SRL":
                    lg = linkGrp
                    break
            else:
                # no SRL linkGrp in this sentence
                return []
            ret = []
            for link in lg:
                ret.append(link)
            return ret

        def helper_gen_deep_links(link_list):
            deep_links = []
            for link in link_list:
                deep_links.append({
                    "from": link.attrib["target"].split(" ")[0][1:],
                    "to": link.attrib["target"].split(" ")[1][1:],
                    "functor": link.attrib["ana"].split(":")[1]
                })
            return deep_links

        log.info("SsjDict.read_xml_v2({})".format(filepath))
        t_start = time()
        stats = {
            "total_count": 0,
            "deep_roles_count": 0,
            "duplicated_sid": 0,
        }
        tree = ET.parse(filepath)
        root = tree.getroot()

        for s in root.findall(".//tei:s", NS_DICT):
            stats["total_count"] += 1
            s_id = s.attrib[ns_prefix("xml") + "id"]

            # get_functors (deep semantic roles)
            functor_links = helper_get_functor_links(s)
            if len(functor_links) == 0:
                continue
            stats["deep_roles_count"] += 1

            # get_sentence
            tokens = {}
            for token in helper_get_sentence(s):
                tid = token.attrib[ns_prefix("xml") + "id"]
                if token.tag == ns_prefix("tei") + "w":
                    tokens[tid] = {
                        "msd": token.attrib["ana"].split(":")[1],
                        "lemma": token.attrib["lemma"],
                        "word": token.text
                    }
                elif token.tag == ns_prefix("tei") + "pc":
                    tokens[tid] = {
                        "word": token.text
                    }
                else:
                    log.warning("Unrecognized sentence element: " + token.tag)
                    sys.exit(1)

            if s_id in self.entries:
                log.warning("duplicated sentence: " + s_id)
                stats["duplicated_sid"] += 1
                continue

            self.entries[s_id] = SsjEntry(
                s_id,
                tokens,
                helper_gen_deep_links(functor_links)
            )

        t_end = time()
        log.info("Time: {}s.".format(t_end - t_start))
        log.info(str(stats))


if __name__ == "__main__":
    # testing
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler(sys.stdout)
    log.addHandler(ch)

    # Load
    fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    ssj = SsjDict()
    ssj.read_xml_v2(fpath)
    with open("ssj_test.pickle", "wb") as file:
        pickle.dump(ssj, file)