from lxml import etree
import re

W_TAGS = ['w']
C_TAGS = ['c']
S_TAGS = ['S', 'pc']

# reads a TEI xml file and returns a dictionary:
# { <sentence_id>: {
#       sid: <sentence_id>,  # serves as index in MongoDB
#       text: <plain sentence text>,
#       tokens: <list of token tuples>,
#       links: <dependency links, Kres only>,
# }}
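#
# An illustrative entry (ids and values are invented for illustration only):
# "F0006347.5.1": {
#     "sid": "F0006347.5.1",
#     "text": "Pes teče. ",
#     "tokens": [("w", 1, "Pes", "pes", "Ncmsn"), ...],
#     "links": {1: ("Sb", 1, 2), ...},
# }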


def parse_tei(filepath):
    guess_corpus = None  # SSJ | KRES
    res_dict = {}
    with open(filepath, "r") as fp:
        # remove namespaces
        xmlstr = fp.read()
        xmlstr = re.sub(r'\sxmlns="[^"]+"', '', xmlstr, count=1)
        xmlstr = re.sub(' xml:', ' ', xmlstr)
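        # e.g. '<TEI xmlns="http://www.tei-c.org/ns/1.0">' becomes '<TEI>'
        # and ' xml:id="..."' becomes ' id="..."', so element lookups and
        # attribute reads below need no namespace prefixes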

        root = etree.XML(xmlstr.encode("utf-8"))

        divs = []  # in ssj, there are divs; in Kres, there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")

        # parse divs
        for div in divs:
            f_id = div.get("id")

            # parse paragraphs
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]

                # parse sentences
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text = ""
                    sentence_tokens = []

                    # parse tokens
                    for el in s.iter():
                        if el.tag in W_TAGS:
                            el_id = el.get("id").split(".")[-1]
                            if el_id[0] == 't':
                                el_id = el_id[1:]  # ssj W_TAG ids start with t
                            sentence_text += el.text
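                            # word token tuple: (tag, id, form, lemma, msd);
                            # Kres stores the MSD tag in the "msd" attribute,
                            # ssj in the "ana" attribute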
                            sentence_tokens += [(
                                "w",
                                int(el_id),
                                el.text,
                                el.get("lemma"),
                                (el.get("msd") if guess_corpus == "KRES"
                                    else el.get("ana").split(":")[-1]),
                            )]
                        elif el.tag in C_TAGS:
                            # only Kres' C_TAGS have ids
                            el_id = el.get("id") or "none"
                            el_id = el_id.split(".")[-1]
                            sentence_text += el.text
                            sentence_tokens += [("c", el_id, el.text,)]
                        elif el.tag in S_TAGS:
                            # Kres' <S /> doesn't contain .text
                            sentence_text += " "
                        else:
                            # pass links and linkGroups
                            pass
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            parse_links(s) if guess_corpus == "KRES" else None
                        )
                    }
    return res_dict


def parse_links(s_el):
    # maps dependent token id -> (afun, dependent id, head id)
    lgrps = s_el.findall(".//links")
    if len(lgrps) < 1:
        raise IOError("Can't find links.")
    res_links = {}
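    # each <link> carries JOS-style attributes, roughly
    # <link afun="Sb" from="F0006347.5.1.2" dep="F0006347.5.1.1"/>
    # (attribute values above are invented for illustration); only the
    # numeric suffix after the last "." is kept for ids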
    for link in lgrps[0]:
        dep = int(link.get("dep").split(".")[-1])
        res_links[dep] = (
            link.get("afun"),
            dep,
            int(link.get("from").split(".")[-1]),
        )
    return res_links


def to_conll09(sentence_entry):
    # works with Kres entries that have parsed links
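    # emits the first 12 CoNLL-2009 columns (ID..PDEPREL); the predicate
    # columns (FILLPRED, PRED, APREDs) are left out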
    out_str = ""
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
        print(token)  # debug output
        print(sentence_entry["links"])  # debug output
        t_id = token[1]
        print(t_id)  # debug output
        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,  # id
            token[2],  # form
            token[3],  # lemma
            token[3],  # plemma
            "todo",  # pos (TODO)
            "todo",  # ppos (TODO)
            "todo",  # feat (TODO)
            "todo",  # pfeat (TODO)
            sentence_entry["links"][t_id][2],  # head
            sentence_entry["links"][t_id][2],  # phead
            sentence_entry["links"][t_id][0],  # deprel (afun)
            sentence_entry["links"][t_id][0],  # pdeprel
        )
    out_str += "\n"
    return out_str
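

# Minimal usage sketch (assumption: invoked as a script with a TEI file path
# as the first argument; this guard is illustrative, not part of the
# original pipeline).
if __name__ == "__main__":
    import sys

    parsed = parse_tei(sys.argv[1])
    for sid, entry in parsed.items():
        print("{}\t{}".format(sid, entry["text"]))
        # CoNLL-09 conversion only works for Kres entries with parsed links
        if entry["links"] is not None:
            print(to_conll09(entry))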