cjvt-srl-tagging/tools/parser/parser.py

from lxml import etree
import re
from parser.msd.msdmap import Msdmap
import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys

class Parser:
    """Read Kres / ssj500k TEI XML and render sentences as CoNLL-2009 rows.

    ``parse_tei`` reads a TEI xml file and returns a dictionary:
    { <sentence_id>: {
          sid: <sentence_id>,  # serves as index in MongoDB
          text: <sentence text>,
          tokens: <list of token tuples>,
          links: <dict of dependency links for Kres, None otherwise>,
    }}
    """

    def __init__(self):
        """Create the MSD mapper, the tag groups and load the fillpred model.

        Prints a hint and exits the process with status 1 when the pickled
        model cannot be read.
        """
        self.msdmap = Msdmap()
        self.W_TAGS = ['w']  # stored as ("w", id, form, lemma, msd)
        self.C_TAGS = ['c']  # stored as ("c", id, form)
        self.S_TAGS = ['S', 'pc']  # contribute a single space to the text
        try:
            # NOTE(security): pickle.load can execute arbitrary code, so the
            # model file must be trusted (the hint below suggests it is
            # produced locally via make).
            with Path("./fillpred_model/model.pickle").open("rb") as fp:
                self.fillpred_model = pickle.load(fp)
        except IOError:
            print("Generate the model first: $ make tools/fillpred_model/model.pickle")
            sys.exit(1)

    @staticmethod
    def _parse_links(s_el):
        """Return {dep: (afun, dep, from)} built from the first <links> group.

        ``dep`` and ``from`` are the integers after the last "." of the
        corresponding attributes.

        Raises:
            IOError: if the sentence element contains no <links> element.
        """
        lgrps = s_el.findall(".//links")
        if not lgrps:
            raise IOError("Can't find links.")
        res_links = {}
        for link in lgrps[0]:
            dep = int(link.get("dep").split(".")[-1])
            res_links[dep] = (
                link.get("afun"),
                dep,
                int(link.get("from").split(".")[-1]),
            )
        return res_links

    def _parse_tokens(self, s_el, guess_corpus):
        """Return (sentence_text, sentence_tokens) for one <s> element."""
        text_parts = []
        tokens = []
        for el in s_el.iter():
            if el.tag in self.W_TAGS:
                el_id = el.get("id").split(".")[-1]
                if el_id.startswith("t"):
                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                form = el.text or ""  # an empty element has .text == None
                if guess_corpus == "KRES":
                    msd = el.get("msd")
                else:
                    msd = el.get("ana").split(":")[-1]
                text_parts.append(form)
                tokens.append((
                    "w",
                    int(el_id),
                    form,
                    el.get("lemma"),
                    msd,
                ))
            elif el.tag in self.C_TAGS:
                # only Kres' C_TAGS have ids
                el_id = (el.get("id") or "none").split(".")[-1]
                form = el.text or ""
                text_parts.append(form)
                tokens.append(("c", el_id, form,))
            elif el.tag in self.S_TAGS:
                # Kres' <S /> doesn't contain .text
                text_parts.append(" ")
            # everything else (links, linkGroups, the <s> itself) is skipped
        return "".join(text_parts), tokens

    def parse_tei(self, filepath):
        """Parse a TEI XML file into a dictionary of sentences.

        Args:
            filepath: object with an ``open("rb")`` method (e.g. a
                ``pathlib.Path``) pointing to a UTF-8 encoded TEI file.

        Returns:
            dict keyed by "<div/file id>.<paragraph id>.<sentence id>"; each
            value holds "sid", "text", "tokens" and "links" ("links" is only
            parsed when the file is recognised as Kres, otherwise None).

        Raises:
            KeyError: if two sentences resolve to the same id.
            IOError: if a Kres sentence has no <links> element.
        """
        with filepath.open("rb") as fp:
            bstr = fp.read()

        # remove namespaces so plain tag / attribute names can be used below
        utf8str = bstr.decode("utf-8")
        utf8str = re.sub(r'\sxmlns="[^"]+"', '', utf8str, count=1)
        utf8str = re.sub(r' xml:', ' ', utf8str)

        root = etree.XML(utf8str.encode("utf-8"))

        # in ssj, there are divs, in Kres, there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")

        res_dict = {}
        for div in divs:
            f_id = div.get("id")
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text, sentence_tokens = self._parse_tokens(
                        s, guess_corpus)
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError(
                            "duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            self._parse_links(s)
                            if guess_corpus == "KRES" else None
                        ),
                    }
        return res_dict

    def _fillpred(self, tsv_row):
        """Ask the fillpred model whether the row's token is a predicate.

        The last element of the model row is dropped before prediction
        (presumably the label column -- confirm in fillpred_model.step1).
        """
        mrow = build_model_row(tsv_row)
        x = mrow[:-1]
        y = self.fillpred_model.predict([x])
        return y[0]  # bool

    def to_conll_2009_SRL(self, sentence_entry):
        """Render one sentence entry as CoNLL-2009 rows for SRL tagging.

        Works with Kres entries, i.e. entries whose "links" were parsed;
        "links" is only consulted for "w" tokens.

        Each row is tab separated and the newline is joined as the last
        field, so every line ends with a tab before the newline (kept
        as-is; presumably the empty APREDs column -- confirm with the
        consumer). The sentence is terminated by an empty line.

        Args:
            sentence_entry: a value of the dictionary returned by
                ``parse_tei``.

        Returns:
            str with one line per token plus the terminating empty line.
        """
        rows = []
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            form = token[2]

            # handle stop signs: the form fills columns 2-8, head is 0
            if token[0] != "w":
                rows.append('\t'.join(
                    [t_id] +
                    [form] * 7 +
                    ["0", "0", "modra", "modra", "_", "_"] +
                    ["\n"]
                ))
                continue

            lemma = token[3]
            pos = self.msdmap.slo_msd_to_eng_pos(token[4])
            feat = "|".join(
                self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
            link = sentence_entry["links"][t_id]  # (afun, dep, from)
            head = link[2]
            deprel = link[0]

            row_list = [
                t_id,
                form,
                lemma,  # lemma
                lemma,  # plemma
                pos,  # pos
                pos,  # ppos
                feat,  # feat
                feat,  # pfeat
                head,  # head
                head,  # phead
                deprel,  # deprel
                deprel,  # pdeprel
                "_",  # fillpred
                "_",  # pred
                "\n",
            ]
            fprd = self._fillpred(row_list)
            row_list[12] = "Y" if fprd else "_"
            row_list[13] = lemma if fprd else "_"

            # format: 14 + apreds
            rows.append('\t'.join(map(str, row_list)))
        rows.append("\n")  # newline at the end of sentence
        return "".join(rows)

    def to_conll_2009_full(self, sentence_entry):
        """Render one sentence entry as "<id>\\t<form>" lines.

        Only the id and form columns are written; the sentence is
        terminated by an empty line.
        """
        rows = [
            "{}\t{}\n".format(token[1], token[2])  # id, form
            for token in sentence_entry["tokens"]
        ]
        rows.append("\n")
        return "".join(rows)
parser.py can read kres and/or ssj500k 2019-02-03 21:54:26 +00:00			`from lxml import etree`
			`import re`
Parser() class 2019-02-15 08:09:11 +00:00			`from parser.msd.msdmap import Msdmap`
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`import pickle`
			`from pathlib import Path`
			`from fillpred_model.step1 import build_model_row`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`import sys`
Parser() class 2019-02-15 08:09:11 +00:00
			`class Parser:`
			`# reads a TEI xml file and returns a dictionary:`
			`# { <sentence_id>: {`
			`# sid: <sentence_id>, # serves as index in MongoDB`
			`# text: ,`
			`# tokens: ,`
			`# }}`

			`def __init__(self):`
			`self.msdmap = Msdmap()`
			`self.W_TAGS = ['w']`
			`self.C_TAGS = ['c']`
			`self.S_TAGS = ['S', 'pc']`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`try:`
			`fp = Path("./fillpred_model/model.pickle").open("rb")`
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`self.fillpred_model = pickle.load(fp)`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`except IOError:`
			`print("Generate the model first: $ make tools/fillpred_mode/model.pickle")`
			`sys.exit(1)`
Parser() class 2019-02-15 08:09:11 +00:00
			`def parse_tei(self, filepath):`

			`def parse_links(s_el):`
			`lgrps = s_el.findall(".//links")`
			`if len(lgrps) < 1:`
			`raise IOError("Can't find links.")`
			`res_links = {}`
			`for link in lgrps[0]:`
			`dep = int(link.get("dep").split(".")[-1])`
			`res_links[dep] = (`
			`link.get("afun"),`
			`dep,`
			`int(link.get("from").split(".")[-1]),`
			`)`
			`return res_links`

			`guess_corpus = None # SSJ \| KRES`
			`res_dict = {}`
finished parse + tag toolchain -> TODO: tagger error 2019-02-18 07:49:04 +00:00			`with filepath.open("rb") as fp:`
Parser() class 2019-02-15 08:09:11 +00:00			`# remove namespaces`
			`bstr = fp.read()`

			`utf8str = bstr.decode("utf-8")`
			`utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)`
			`utf8str = re.sub(' xml:', ' ', utf8str)`

			`root = etree.XML(utf8str.encode("utf-8"))`

			`divs = [] # in ssj, there are divs, in Kres, there are separate files`
			`if "id" in root.keys():`
			`# Kres files start with <TEI id=...>`
			`guess_corpus = "KRES"`
			`divs = [root]`
			`else:`
			`guess_corpus = "SSJ"`
			`divs = root.findall(".//div")`

			`# parse divs`
			`for div in divs:`
			`f_id = div.get("id")`

			`# parse paragraphs`
			`for p in div.findall(".//p"):`
			`p_id = p.get("id").split(".")[-1]`

			`# parse sentences`
			`for s in p.findall(".//s"):`
			`s_id = s.get("id").split(".")[-1]`
			`sentence_text = ""`
			`sentence_tokens = []`

			`# parse tokens`
			`for el in s.iter():`
			`if el.tag in self.W_TAGS:`
			`el_id = el.get("id").split(".")[-1]`
			`if el_id[0] == 't':`
			`el_id = el_id[1:] # ssj W_TAG ids start with t`
			`sentence_text += el.text`
			`sentence_tokens += [(`
			`"w",`
			`int(el_id),`
			`el.text,`
			`el.get("lemma"),`
			`(el.get("msd") if guess_corpus == "KRES"`
			`else el.get("ana").split(":")[-1]),`
			`)]`
			`elif el.tag in self.C_TAGS:`
			`# only Kres' C_TAGS have ids`
			`el_id = el.get("id") or "none"`
			`el_id = el_id.split(".")[-1]`
			`sentence_text += el.text`
			`sentence_tokens += [("c", el_id, el.text,)]`
			`elif el.tag in self.S_TAGS:`
			`# Kres' <S /> doesn't contain .text`
			`sentence_text += " "`
			`else:`
			`# pass links and linkGroups`
			`pass`
			`sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)`
			`if sentence_id in res_dict:`
			`raise KeyError("duplicated id: {}".format(sentence_id))`
			`res_dict[sentence_id] = {`
			`"sid": sentence_id,`
			`"text": sentence_text,`
			`"tokens": sentence_tokens,`
			`"links": (`
			`parse_links(s) if guess_corpus == "KRES" else None`
			`)`
			`}`
			`fp.close()`
			`return res_dict`


tmp 2019-02-27 15:58:04 +00:00			`def to_conll_2009_SRL(self, sentence_entry):`
Parser() class 2019-02-15 08:09:11 +00:00
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`def fillpred(tsv_row):`
			`mrow = build_model_row(tsv_row)`
			`x = mrow[:-1]`
			`y = self.fillpred_model.predict([x])`
			`return y[0] # bool`
Parser() class 2019-02-15 08:09:11 +00:00
			`# works with kres, with parsed links`
			`out_str = ""`
			`for token in sentence_entry["tokens"]:`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`t_id = token[1]`
			`form = token[2]`

			`# handle stop signs`
Parser() class 2019-02-15 08:09:11 +00:00			`if token[0] != "w":`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`out_str += '\t'.join(`
			`[t_id] +`
			`[form for x in range(7)] +`
			`["0", "0", "modra", "modra", "_", "_"] +`
tmp 2019-02-27 15:58:04 +00:00			`["\n"]`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`)`
			`continue`
Parser() class 2019-02-15 08:09:11 +00:00
ready for SRL tagging 2019-02-16 10:41:39 +00:00			`pos = self.msdmap.slo_msd_to_eng_pos(token[4])`
			`feat = "\|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))`
Parser() class 2019-02-15 08:09:11 +00:00
			`"""`
			`print(t_id)`
			`print("msd:")`
			`print(msd)`
			`print(token)`
			`print(sentence_entry["links"])`
			`"""`
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`row_list = [`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`t_id,`
			`form,`
			`token[3], # lemma`
			`token[3], # plemma`
			`pos, # pos`
			`pos, # ppos`
			`feat, # feat`
			`feat, # pfeat`
			`sentence_entry["links"][t_id][2], # head`
			`sentence_entry["links"][t_id][2], # phead`
			`sentence_entry["links"][t_id][0], # deprel`
			`sentence_entry["links"][t_id][0], # pdeprel`
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`"_", # fillpred`
			`"_", # pred`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`"\n",`
srl taggin pipeline (output in .tsv) 2019-02-24 21:23:32 +00:00			`]`
			`fprd = fillpred(row_list)`
			`row_list[12] = "Y" if fprd else "_"`
			`row_list[13] = token[3] if fprd else "_"`

			`# format: 14 + apreds`
			`out_str += '\t'.join(map(str,`
			`row_list`
mate-tools tags the corpus. Need to specify predicates better 2019-02-20 06:38:26 +00:00			`))`
			`out_str += "\n" # newline at the end of sentence`
Parser() class 2019-02-15 08:09:11 +00:00			`return out_str`


			`def to_conll_2009_full(self, sentence_entry):`
			`out_str = ""`
			`for token in sentence_entry["tokens"]:`
			`t_id = token[1]`
			`# 1 3`
			`out_str += "{}\t{}\n".format(`
			`t_id, # id`
			`token[2], # form`
			`)`
			`out_str += "\n"`
			`return out_str`