# cjvt-srl-tagging/tools/parser/parser.py

from lxml import etree
import re
from parser.msd.msdmap import Msdmap

class Parser:
    """Read TEI XML corpora (SSJ / Kres) and export sentences as CoNLL-2009.

    ``parse_tei`` reads a TEI xml file and returns a dictionary:

        { <sentence_id>: {
              "sid": <sentence_id>,  # serves as index in MongoDB
              "text": <sentence text>,
              "tokens": <list of token tuples>,
              "links": <dict of dependency links, or None for SSJ>,
        }}

    Token tuples come in two shapes:

    * word:  ``("w", <int id>, <text>, <lemma>, <msd>)``
    * char:  ``("c", <id string or "none">, <text>)``
    """

    def __init__(self):
        self.msdmap = Msdmap()
        self.W_TAGS = ['w']
        self.C_TAGS = ['c']
        self.S_TAGS = ['S', 'pc']

    @staticmethod
    def _parse_links(s_el):
        """Return ``{dep: (afun, dep, from)}`` from the first <links> group of a sentence.

        ``dep`` and ``from`` are the integers after the last "." of the
        respective attribute values.

        Raises:
            IOError: if the sentence element contains no <links> element.
        """
        lgrps = s_el.findall(".//links")
        if not lgrps:
            raise IOError("Can't find links.")
        res_links = {}
        for link in lgrps[0]:
            dep = int(link.get("dep").split(".")[-1])
            res_links[dep] = (
                link.get("afun"),
                dep,
                int(link.get("from").split(".")[-1]),
            )
        return res_links

    def _parse_sentence(self, s_el, guess_corpus):
        """Collect the plain text and the token tuples of one <s> element.

        Returns:
            ``(sentence_text, sentence_tokens)``.
        """
        text_parts = []
        sentence_tokens = []
        for el in s_el.iter():
            if el.tag in self.W_TAGS:
                el_id = el.get("id").split(".")[-1]
                if el_id.startswith("t"):
                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                text_parts.append(el.text)
                if guess_corpus == "KRES":
                    msd = el.get("msd")
                else:
                    msd = el.get("ana").split(":")[-1]
                sentence_tokens.append(
                    ("w", int(el_id), el.text, el.get("lemma"), msd)
                )
            elif el.tag in self.C_TAGS:
                # only Kres' C_TAGS have ids
                el_id = (el.get("id") or "none").split(".")[-1]
                text_parts.append(el.text)
                sentence_tokens.append(("c", el_id, el.text))
            elif el.tag in self.S_TAGS:
                # Kres' <S /> doesn't contain .text
                text_parts.append(" ")
            # anything else (links, linkGroups, the <s> itself) is skipped
        return "".join(text_parts), sentence_tokens

    def parse_tei(self, filepath):
        """Parse a TEI file into a dictionary of sentence entries.

        Args:
            filepath: object with an ``open("rb")`` method (e.g. a
                ``pathlib.Path``) pointing at a UTF-8 encoded TEI file.

        Returns:
            Dict mapping ``"<file id>.<paragraph id>.<sentence id>"`` to an
            entry with the keys ``sid``, ``text``, ``tokens`` and ``links``.
            ``links`` is only parsed for Kres files; it is ``None`` for SSJ.

        Raises:
            KeyError: if two sentences produce the same sentence id.
            IOError: if a Kres sentence has no <links> element.
        """
        # The context manager closes the file; no explicit close() is needed.
        with filepath.open("rb") as fp:
            utf8str = fp.read().decode("utf-8")

        # remove namespaces so elements/attributes can be addressed by bare name
        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
        utf8str = re.sub(' xml:', ' ', utf8str)
        root = etree.XML(utf8str.encode("utf-8"))

        # in ssj, there are divs, in Kres, there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")

        res_dict = {}
        for div in divs:
            f_id = div.get("id")
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text, sentence_tokens = self._parse_sentence(
                        s, guess_corpus
                    )
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            self._parse_links(s)
                            if guess_corpus == "KRES" else None
                        ),
                    }
        return res_dict

    def to_conll_2009_SRL(self, sentence_entry, napreds=100):
        """Render the word tokens of a sentence entry as CoNLL-2009 rows.

        Works with Kres entries, i.e. entries whose ``links`` were parsed.
        Only ``"w"`` tokens are emitted. Predicted columns (plemma, ppos,
        pfeat, phead, pdeprel) repeat the corresponding gold column.

        Args:
            sentence_entry: one value of the dictionary returned by
                ``parse_tei``.
            napreds: number of empty ("_") APRED columns appended to each row.

        Returns:
            The rows (14 columns + ``napreds`` APRED columns each) followed by
            one empty line.
        """

        def fillpred(pos, feat):
            # a token is a predicate when it is a main verb
            return pos == "V" and "main" in feat.split("|")

        # Both are identical for every row, so build them once.
        apreds = "\t_" * napreds
        row_fmt = "\t".join(["{}"] * 14) + "{}\n"

        rows = []
        for token in sentence_entry["tokens"]:
            if token[0] != "w":
                continue

            t_id = token[1]
            pos = self.msdmap.slo_msd_to_eng_pos(token[4])
            feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
            fprd = fillpred(pos, feat)
            link = sentence_entry["links"][t_id]  # (deprel, dep, head)

            rows.append(row_fmt.format(
                t_id,  # id
                token[2],  # form
                token[3],  # lemma
                token[3],  # plemma
                pos,  # pos
                pos,  # ppos
                feat,  # feat
                feat,  # pfeat
                link[2],  # head
                link[2],  # phead
                link[0],  # deprel
                link[0],  # pdeprel
                "Y" if fprd else "_",  # fillpred
                token[3] if fprd else "_",  # pred
                apreds,
            ))
        return "".join(rows) + "\n"

    def to_conll_2009_full(self, sentence_entry):
        """Render every token of a sentence entry as ``<id>\\t<form>`` rows.

        Returns:
            One row per token (words and characters alike) followed by one
            empty line.
        """
        rows = [
            "{}\t{}\n".format(token[1], token[2])  # id, form
            for token in sentence_entry["tokens"]
        ]
        return "".join(rows) + "\n"