import re

from lxml import etree

from parser.msd.msdmap import Msdmap


class Parser:
    # reads a TEI xml file and returns a dictionary:
    # { <sentence_id>: {
    #     sid: <sentence_id>,  # serves as index in MongoDB
    #     text: <sentence text>,
    #     tokens: <list of tokens>,
    #     links: <dependency links, Kres only>,
    # }}

    def __init__(self):
        self.msdmap = Msdmap()
        self.W_TAGS = ['w']
        self.C_TAGS = ['c']
        self.S_TAGS = ['S', 'pc']

    def parse_tei(self, filepath):

        def parse_links(s_el):
            lgrps = s_el.findall(".//links")
            if len(lgrps) < 1:
                raise IOError("Can't find links.")
            res_links = {}
            for link in lgrps[0]:
                dep = int(link.get("dep").split(".")[-1])
                res_links[dep] = (
                    link.get("afun"),
                    dep,
                    int(link.get("from").split(".")[-1]),
                )
            return res_links

        guess_corpus = None  # SSJ | KRES
        res_dict = {}
        with filepath.open("rb") as fp:
            # remove namespaces
            bstr = fp.read()
            utf8str = bstr.decode("utf-8")
            utf8str = re.sub(r'\sxmlns="[^"]+"', '', utf8str, count=1)
            utf8str = re.sub(' xml:', ' ', utf8str)

            root = etree.XML(utf8str.encode("utf-8"))

            divs = []  # in SSJ there are divs; in Kres there are separate files
            if "id" in root.keys():
                # Kres files start with a root element that carries the id
                guess_corpus = "KRES"
                divs = [root]
            else:
                guess_corpus = "SSJ"
                divs = root.findall(".//div")

            # parse divs
            for div in divs:
                f_id = div.get("id")

                # parse paragraphs
                for p in div.findall(".//p"):
                    p_id = p.get("id").split(".")[-1]

                    # parse sentences
                    for s in p.findall(".//s"):
                        s_id = s.get("id").split(".")[-1]
                        sentence_text = ""
                        sentence_tokens = []

                        # parse tokens
                        for el in s.iter():
                            if el.tag in self.W_TAGS:
                                el_id = el.get("id").split(".")[-1]
                                if el_id[0] == 't':
                                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                                sentence_text += el.text
                                sentence_tokens += [(
                                    "w",
                                    int(el_id),
                                    el.text,
                                    el.get("lemma"),
                                    (el.get("msd") if guess_corpus == "KRES"
                                     else el.get("ana").split(":")[-1]),
                                )]
                            elif el.tag in self.C_TAGS:
                                # only Kres' C_TAGS have ids
                                el_id = el.get("id") or "none"
                                el_id = el_id.split(".")[-1]
                                sentence_text += el.text
                                sentence_tokens += [("c", el_id, el.text,)]
                            elif el.tag in self.S_TAGS:
                                # Kres' S_TAGS carry no .text; they mark spaces
                                sentence_text += " "
                            else:
                                # skip links and linkGroups
                                pass

                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                        if sentence_id in res_dict:
                            raise KeyError("duplicated id: {}".format(sentence_id))
                        res_dict[sentence_id] = {
                            "sid": sentence_id,
                            "text": sentence_text,
                            "tokens": sentence_tokens,
                            "links": (
                                parse_links(s) if guess_corpus == "KRES" else None
                            ),
                        }
        return res_dict

    def to_conll_2009_SRL(self, sentence_entry, napreds=9):

        def fillpred(pos, feat):
            # TODO (decision tree or bayes on mate training data)
            if pos == "V" and "main" in feat.split("|"):
                return True
            return False

        apreds_string = '\t'.join(["_"] * napreds)

        # works only for Kres entries, which have parsed links
        out_str = ""
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            form = token[2]

            # handle stop signs (non-word tokens)
            if token[0] != "w":
                out_str += '\t'.join(
                    [str(t_id)] +
                    [form for x in range(7)] +
                    ["0", "0", "modra", "modra", "_", "_", apreds_string]
                ) + "\n"
                continue

            pos = self.msdmap.slo_msd_to_eng_pos(token[4])
            feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
            fprd = fillpred(pos, feat)

            # format: 14 columns + apreds
            out_str += '\t'.join(map(str, [
                t_id,
                form,
                token[3],  # lemma
                token[3],  # plemma
                pos,       # pos
                pos,       # ppos
                feat,      # feat
                feat,      # pfeat
                sentence_entry["links"][t_id][2],  # head
                sentence_entry["links"][t_id][2],  # phead
                sentence_entry["links"][t_id][0],  # deprel
                sentence_entry["links"][t_id][0],  # pdeprel
                "Y" if fprd else "_",              # fillpred
                token[3] if fprd else "_",         # pred
                apreds_string,
            ])) + "\n"

        out_str += "\n"  # newline at the end of the sentence
        return out_str

    def to_conll_2009_full(self, sentence_entry):
        out_str = ""
        for token in sentence_entry["tokens"]:
            t_id = token[1]
            # id and form columns only
            out_str += "{}\t{}\n".format(
                t_id,      # id
                token[2],  # form
            )
        out_str += "\n"
        return out_str
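

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The input path is
    # hypothetical; parse_tei() expects a pathlib.Path-like object, since it
    # calls filepath.open("rb"). CoNLL-2009 SRL output needs dependency links,
    # which are only parsed for Kres files.
    from pathlib import Path

    parser = Parser()
    sentences = parser.parse_tei(Path("data/kres/F0000001.xml"))  # hypothetical path

    # entries are keyed by "<file_id>.<paragraph_id>.<sentence_id>"
    for sid, entry in sentences.items():
        print(parser.to_conll_2009_SRL(entry))
        break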