from lxml import etree
import re
from parser.msd.msdmap import Msdmap


class Parser:
    # Reads a TEI XML file and returns a dictionary:
    # { <sentence_id>: {
    #       sid: <sentence_id>,  # serves as index in MongoDB
    #       text: <plain text of the sentence>,
    #       tokens: <list of token tuples, see below>,
    #       links: <dependency links (KRES only)>,
    # }}
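    #
    # One returned entry might look like this (hypothetical ids and values):
    # "F0006347.5.1": {
    #     "sid": "F0006347.5.1",  # "<file>.<paragraph>.<sentence>"
    #     "text": "Mama je doma. ",
    #     "tokens": [("w", 1, "Mama", "mama", "<msd>"), ...],
    #     "links": {1: ("Sb", 1, 2), ...},  # KRES only; None for SSJ
    # }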

    def __init__(self):
        self.msdmap = Msdmap()
        self.W_TAGS = ['w']        # word tokens
        self.C_TAGS = ['c']        # punctuation tokens
        self.S_TAGS = ['S', 'pc']  # handled as whitespace below

    def parse_tei(self, filepath):

        def parse_links(s_el):
            # map each dependent token's index to (afun, dep index, head index)
            lgrps = s_el.findall(".//links")
            if len(lgrps) < 1:
                raise IOError("Can't find links.")
            res_links = {}
            for link in lgrps[0]:
                dep = int(link.get("dep").split(".")[-1])
                res_links[dep] = (
                    link.get("afun"),
                    dep,
                    int(link.get("from").split(".")[-1]),
                )
            return res_links
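
        # parse_links turns e.g. (hypothetical KRES markup)
        #   <link afun="Sb" from="F0006347.5.1.2" dep="F0006347.5.1.1"/>
        # into
        #   res_links[1] = ("Sb", 1, 2)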

        guess_corpus = None  # SSJ | KRES
        res_dict = {}
        with open(filepath, "rb") as fp:
            bstr = fp.read()
            utf8str = bstr.decode("utf-8")

            # remove namespaces so the element and attribute lookups below
            # can use bare names
            utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
            utf8str = re.sub(' xml:', ' ', utf8str)
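            # e.g. '<TEI xmlns="http://www.tei-c.org/ns/1.0">' becomes '<TEI>'
            # and ' xml:id="..."' becomes ' id="..."'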

            root = etree.XML(utf8str.encode("utf-8"))

            divs = []  # in SSJ there are divs; in Kres, each text is a separate file
            if "id" in root.keys():
                # Kres files start with <TEI id=...>
                guess_corpus = "KRES"
                divs = [root]
            else:
                guess_corpus = "SSJ"
                divs = root.findall(".//div")
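
            # e.g. (hypothetical ids) a Kres file holds a single text:
            #   <TEI id="F0006347"> <p>...</p> </TEI>
            # while an SSJ file bundles several texts in divs:
            #   <TEI> ... <div id="ssj1">...</div> <div id="ssj2">...</div> </TEI>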

            # parse divs
            for div in divs:
                f_id = div.get("id")

                # parse paragraphs
                for p in div.findall(".//p"):
                    p_id = p.get("id").split(".")[-1]

                    # parse sentences
                    for s in p.findall(".//s"):
                        s_id = s.get("id").split(".")[-1]
                        sentence_text = ""
                        sentence_tokens = []

                        # parse tokens
                        for el in s.iter():
                            if el.tag in self.W_TAGS:
                                el_id = el.get("id").split(".")[-1]
                                if el_id[0] == 't':
                                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                                sentence_text += el.text
                                sentence_tokens += [(
                                    "w",
                                    int(el_id),
                                    el.text,
                                    el.get("lemma"),
                                    (el.get("msd") if guess_corpus == "KRES"
                                        else el.get("ana").split(":")[-1]),
                                )]
                            elif el.tag in self.C_TAGS:
                                # only Kres' C_TAGS have ids
                                el_id = el.get("id") or "none"
                                el_id = el_id.split(".")[-1]
                                sentence_text += el.text
                                sentence_tokens += [("c", el_id, el.text)]
                            elif el.tag in self.S_TAGS:
                                # Kres' <S /> doesn't contain .text
                                sentence_text += " "
                            else:
                                # skip links and linkGroups; parse_links reads them below
                                pass
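
                        # sentence_tokens now holds tuples such as (hypothetical):
                        #   ("w", 3, "doma", "dom", "<msd>")  # word
                        #   ("c", "4", ".")                   # punctuation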

                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                        if sentence_id in res_dict:
                            raise KeyError("duplicated id: {}".format(sentence_id))
                        res_dict[sentence_id] = {
                            "sid": sentence_id,
                            "text": sentence_text,
                            "tokens": sentence_tokens,
                            "links": (
                                parse_links(s) if guess_corpus == "KRES" else None
                            )
                        }
        # the with block closes the file; no explicit fp.close() is needed
        return res_dict

    def to_conll_2009_SRL(self, sentence_entry):

        def fillpred(pos, feat):
            # a token can fill a predicate slot if it is a main verb
            return pos == "V" and "main" in feat.split("|")
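
        # e.g. fillpred("V", "main|indicative|present") -> True
        #      fillpred("N", "common|singular") -> False   (hypothetical feats)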

        # works with kres only: requires the parsed dependency links
        out_str = ""
        for token in sentence_entry["tokens"]:
            if token[0] != "w":
                continue

            t_id = token[1]
            pos = self.msdmap.slo_msd_to_eng_pos(token[4])
            feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
            fprd = fillpred(pos, feat)

            # CoNLL-2009 columns 1-14: id form lemma plemma pos ppos
            # feat pfeat head phead deprel pdeprel fillpred pred
            out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                t_id,  # id
                token[2],  # form
                token[3],  # lemma
                token[3],  # plemma
                pos,  # pos
                pos,  # ppos
                feat,  # feat
                feat,  # pfeat
                sentence_entry["links"][t_id][2],  # head
                sentence_entry["links"][t_id][2],  # phead
                sentence_entry["links"][t_id][0],  # deprel
                sentence_entry["links"][t_id][0],  # pdeprel
                "Y" if fprd else "_",  # fillpred
                token[3] if fprd else "_",  # pred
            )
        out_str += "\n"
        return out_str

    def to_conll_2009_full(self, sentence_entry):
        # minimal CoNLL-2009 output: only the id and form columns are filled
        out_str = ""
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            out_str += "{}\t{}\n".format(
                t_id,  # id
                token[2],  # form
            )
        out_str += "\n"
        return out_str
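

# Usage sketch (hypothetical file path), assuming a parsed Kres TEI file:
if __name__ == "__main__":
    parser = Parser()
    sentences = parser.parse_tei("data/kres/F0006347.xml")
    for sid, entry in sentences.items():
        print(sid, entry["text"])
        if entry["links"] is not None:  # links are only parsed for KRES input
            print(parser.to_conll_2009_SRL(entry))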