frontend_devops fix

2019-03-20 17:49:34 +01:00
parent fbe9eb7b0f
commit aab075a291
96 changed files with 4 additions and 4 deletions
@@ -0,0 +1,218 @@
+import xml.etree.ElementTree as ET
+from copy import deepcopy as DC
+from time import time
+import re
+import logging
+import sys
+import pickle
+
+log = logging.getLogger(__name__)
+
+ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace")
+XML_ID = "{http://www.w3.org/XML/1998/namespace}id"
+
+
+# |$ for a default empty match
+re_int = re.compile(r"t\d+|$")
+
+
+# For sorting a "s" section in ssj; returns key as integer.
+# example: "S123.t34" --> 34
+def re_lmbd(el):
+    s = re_int.findall(el)[0]
+    if len(s) == 0:
+        return 0
+    else:
+        return int(s[1:])
+
+
+class SsjEntry:
+    def __init__(self, ssj_id, s, deep_links):
+        # See ssj xml structure.
+        self.id = ssj_id
+        self.s = DC(s)
+        self.deep_links = DC(deep_links)
+
+
+class SsjDict:
+    def __init__(self):
+        self.entries = {}
+
+    """
+    def read_xml(self, filepath):
+        # No data loss.
+        log.info("SsjDict.read_xml({})".format(filepath))
+        t_start = time()
+        tree = ET.parse(filepath)
+        root = tree.getroot()
+        stats = {
+            "skipped": [],
+            "duplicates": []
+        }
+
+        for s in root.iter("s"):
+            s_id = s.attrib[XML_ID]
+            tokens = {}
+            for token in s:
+                if token.tag == "linkGrp":
+                    continue
+
+                if token.tag == "w":
+                    tokens[token.attrib[XML_ID]] = {
+                        "msd": token.attrib["msd"],
+                        "lemma": token.attrib["lemma"],
+                        "word": token.text
+                    }
+                elif token.tag == "c":
+                    tokens[token.attrib[XML_ID]] = {
+                        "word": token.text
+                    }
+                else:
+                    # <S />
+                    pass
+
+            linkGrps = s.findall("linkGrp")
+            if len(linkGrps) < 2:
+                # Take only entries with both deep and shallow
+                # syntactic annotation
+                stats["skipped"].append(s_id)
+                continue
+
+            linkG = {}
+            for el in linkGrps:
+                if el.attrib["type"] == "dep":
+                    linkG["dep"] = el
+                elif el.attrib["type"] == "SRL":
+                    linkG["SRL"] = el
+                else:
+                    raise KeyError("Unknown linkGrp.")
+
+            if s_id in self.entries:
+                stats["duplicates"].append(s_id)
+            self.entries[s_id] = SsjEntry(
+                s_id,
+                s.attrib["n"],
+                tokens,
+                create_edge_dict(linkG["dep"]),
+                create_edge_dict(linkG["SRL"])
+            )
+
+        t_end = time()
+        log.info("Time: {}s.".format(t_end - t_start))
+        log.info(
+            "{} duplicates, skipped {} elements (missing linkGrp).".format(
+                len(stats["duplicates"]), len(stats["skipped"]))
+        )
+    """
+
+    def read_xml_v2(self, filepath):
+        NS_DICT = {
+            "tei": "http://www.tei-c.org/ns/1.0",
+            "xml": "http://www.w3.org/XML/1998/namespace",
+        }
+
+        def ns_prefix(ns):
+            return "{" + NS_DICT[ns] + "}"
+
+        def helper_get_sentence(tree_s):
+            # all w and pc elements
+            ret = []
+            for el in tree_s.iter():
+                if (
+                    el.tag == ns_prefix("tei") + "w" or
+                    el.tag == ns_prefix("tei") + "pc"
+                ):
+                    ret.append(el)
+            return ret
+
+        def helper_get_functor_links(tree_s):
+            # links for SRL linkGrp
+            lg = None
+            for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT):
+                if linkGrp.attrib["type"] == "SRL":
+                    lg = linkGrp
+                    break
+            else:
+                return []
+            ret = []
+            for link in lg:
+                ret.append(link)
+            return ret
+
+        def helper_gen_deep_links(link_list):
+            deep_links = []
+            for link in link_list:
+                deep_links.append({
+                    "from": link.attrib["target"].split(" ")[0][1:],
+                    "to": link.attrib["target"].split(" ")[1][1:],
+                    "functor": link.attrib["ana"].split(":")[1]
+                })
+            return deep_links
+
+        log.info("SsjDict.read_xml({})".format(filepath))
+        t_start = time()
+        stats = {
+            "total_count": 0,
+            "deep_roles_count": 0,
+            "duplicated_sid": 0,
+        }
+        tree = ET.parse(filepath)
+        root = tree.getroot()
+
+        for s in root.findall(".//tei:s", NS_DICT):
+            stats["total_count"] += 1
+            s_id = s.attrib[ns_prefix("xml") + "id"]
+
+            # get_functors (deep semantic roles)
+            functor_links = helper_get_functor_links(s)
+            if len(functor_links) == 0:
+                continue
+            stats["deep_roles_count"] += 1
+
+            # get_sentence
+            tokens = {}
+            for token in helper_get_sentence(s):
+                tid = token.attrib[ns_prefix("xml") + "id"]
+                if token.tag == ns_prefix("tei") + "w":
+                    tokens[tid] = {
+                        "msd": token.attrib["ana"].split(":")[1],
+                        "lemma": token.attrib["lemma"],
+                        "word": token.text
+                    }
+                elif token.tag == ns_prefix("tei") + "pc":
+                    tokens[tid] = {
+                        "word": token.text
+                    }
+                else:
+                    log.warning("Unrecognized sentence element: " + token.tag)
+                    exit(1)
+
+            if s_id in self.entries:
+                log.warning("duplicated sentence: " + s_id)
+                stats["duplicated_sid"] += 1
+                continue
+
+            self.entries[s_id] = SsjEntry(
+                s_id,
+                tokens,
+                helper_gen_deep_links(functor_links)
+            )
+
+        t_end = time()
+        log.info("Time: {}s.".format(t_end - t_start))
+        log.info(str(stats))
+
+
+if __name__ == "__main__":
+    # testing
+    log.setLevel(logging.DEBUG)
+
+    ch = logging.StreamHandler(sys.stdout)
+    log.addHandler(ch)
+
+    # Load
+    fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
+    ssj = SsjDict()
+    ssj.read_xml_v2(fpath)
+    with open("ssj_test.pickle", "wb") as file:
+        pickle.dump(ssj, file)