# cjvt-srl-tagging/tools/parser/parser.py

import re

from lxml import etree

W_TAGS = ['w']
C_TAGS = ['c']
S_TAGS = ['S', 'pc']

# Reads a TEI XML file and returns a dictionary:
# { <sentence_id>: {
#     sid: <sentence_id>,  # serves as index in MongoDB
#     text: <plain sentence text>,
#     tokens: <list of token tuples>,
#     links: <dependency links, KRES only (see parse_links)>,
# }}
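#
# For illustration only, a hypothetical KRES-style entry (all ids, words
# and labels below are made up):
#
#   res_dict["F0000001.1.1"] == {
#       "sid": "F0000001.1.1",
#       "text": "Danes sije sonce.",
#       "tokens": [("w", 1, "Danes", "danes", "Rgp"), ...],
#       "links": {1: ("dol", 1, 2), ...},
#   }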


def parse_tei(filepath):
    guess_corpus = None  # SSJ | KRES
    res_dict = {}
    with open(filepath, "r") as fp:
        # remove namespaces
        xmlstr = fp.read()
        xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
        xmlstr = re.sub(' xml:', ' ', xmlstr)
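        # For example (hypothetical input), the first substitution turns
        #   <TEI xmlns="http://www.tei-c.org/ns/1.0">  into  <TEI>
        # and the second rewrites attributes such as xml:id="..." to
        # id="...", so elements and attributes can be looked up below
        # without namespace prefixes.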
        root = etree.XML(xmlstr.encode("utf-8"))

        divs = []  # in SSJ there are divs; in Kres there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")
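        # Illustration (hypothetical file): a Kres file such as
        # F0000001.xml has <TEI id="F0000001"> as its root element,
        # while SSJ bundles many <div id=...> elements in one file.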

        # parse divs
        for div in divs:
            f_id = div.get("id")

            # parse paragraphs
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]

                # parse sentences
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text = ""
                    sentence_tokens = []

                    # parse tokens
                    for el in s.iter():
                        if el.tag in W_TAGS:
                            el_id = el.get("id").split(".")[-1]
                            if el_id[0] == 't':
                                el_id = el_id[1:]  # ssj W_TAG ids start with t
                            sentence_text += el.text
                            # Kres carries the MSD tag in msd=...; SSJ stores
                            # it after the colon in ana=...
                            sentence_tokens += [(
                                "w",
                                int(el_id),
                                el.text,
                                el.get("lemma"),
                                (el.get("msd") if guess_corpus == "KRES"
                                    else el.get("ana").split(":")[-1]),
                            )]
                        elif el.tag in C_TAGS:
                            # only Kres' C_TAGS have ids
                            el_id = el.get("id") or "none"
                            el_id = el_id.split(".")[-1]
                            sentence_text += el.text
                            sentence_tokens += [("c", el_id, el.text,)]
                        elif el.tag in S_TAGS:
                            # Kres' <S /> doesn't contain .text
                            sentence_text += " "
                        else:
                            # skip <links> and <linkGroups>; dependency links
                            # are handled separately by parse_links()
                            pass

                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            parse_links(s) if guess_corpus == "KRES" else None
                        ),
                    }
    return res_dict


def parse_links(s_el):
    lgrps = s_el.findall(".//links")
    if len(lgrps) < 1:
        raise IOError("Can't find links.")
    res_links = {}
    for link in lgrps[0]:
        # map each dependent's token id to (afun, dep, head)
        dep = int(link.get("dep").split(".")[-1])
        res_links[dep] = (
            link.get("afun"),
            dep,
            int(link.get("from").split(".")[-1]),
        )
    return res_links
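
# For illustration, a hypothetical <link> element and the entry it produces
# (token ids and the afun label are made up):
#   <link afun="dol" from="F0000001.1.1.2" dep="F0000001.1.1.1"/>
#   -> res_links[1] == ("dol", 1, 2)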


def to_conll09(sentence_entry):
    # works with Kres entries, whose links have been parsed
    out_str = ""
    for token in sentence_entry["tokens"]:
        # skip non-word tokens; only "w" tokens carry dependency links
        if token[0] != "w":
            continue
        t_id = token[1]
        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,                              # id
            token[2],                          # form
            token[3],                          # lemma
            token[3],                          # plemma
            "todo",                            # pos (TODO)
            "todo",                            # ppos (TODO)
            "todo",                            # feat (TODO)
            "todo",                            # pfeat (TODO)
            sentence_entry["links"][t_id][2],  # head
            sentence_entry["links"][t_id][2],  # phead
            sentence_entry["links"][t_id][0],  # deprel (afun)
            sentence_entry["links"][t_id][0],  # pdeprel (afun)
        )
    # CoNLL sentences are separated by an empty line
    out_str += "\n"
    return out_str
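

# A minimal usage sketch (file paths are hypothetical); it assumes a single
# Kres TEI file, since to_conll09 needs the parsed links:
if __name__ == "__main__":
    res = parse_tei("/path/to/kres/F0000001.xml")
    with open("/tmp/out.conll09", "w") as out_fp:
        for entry in res.values():
            out_fp.write(to_conll09(entry))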