from lxml import etree
import re
from parser.msdmap import Msdmap

W_TAGS = ['w']
C_TAGS = ['c']
S_TAGS = ['S', 'pc']


def _parse_sentence(s_el, guess_corpus):
    """Collect the surface text and the token tuples of one sentence element.

    Args:
        s_el: the sentence element whose descendants are scanned.
        guess_corpus: "KRES" or "SSJ"; selects where the MSD tag is read
            from (the ``msd`` attribute vs. the last ``:``-separated part
            of ``ana``).

    Returns:
        A ``(text, tokens)`` pair.  ``tokens`` holds
        ``("w", int_id, form, lemma, msd)`` tuples for W_TAGS elements and
        ``("c", id_str, form)`` tuples for C_TAGS elements.
    """
    text_parts = []
    tokens = []
    for el in s_el.iter():
        if el.tag in W_TAGS:
            el_id = el.get("id").split(".")[-1]
            if el_id.startswith("t"):
                el_id = el_id[1:]  # ssj W_TAG ids start with t
            if guess_corpus == "KRES":
                msd = el.get("msd")
            else:
                msd = el.get("ana").split(":")[-1]
            # An element without text yields None; treat it as an empty
            # form instead of failing on str + None.
            form = el.text or ""
            text_parts.append(form)
            tokens.append(("w", int(el_id), form, el.get("lemma"), msd))
        elif el.tag in C_TAGS:
            # only Kres' C_TAGS have ids
            el_id = (el.get("id") or "none").split(".")[-1]
            form = el.text or ""
            text_parts.append(form)
            tokens.append(("c", el_id, form))
        elif el.tag in S_TAGS:
            # these elements contribute a single space, their .text is unused
            text_parts.append(" ")
        # everything else (e.g. links) is skipped here
    return "".join(text_parts), tokens


def parse_tei(filepath):
    """Read a TEI XML file and return its sentences keyed by sentence id.

    The corpus type is guessed from the root element: a root carrying an
    ``id`` attribute is treated as a Kres file (the root is the only
    "div"), otherwise the file is treated as ssj and all ``div``
    descendants are parsed.

    Args:
        filepath: path of the UTF-8 encoded TEI XML file.

    Returns:
        A dict ``{sentence_id: entry}`` where ``sentence_id`` is
        ``"<div id>.<paragraph no>.<sentence no>"`` and ``entry`` is::

            {
                "sid": sentence_id,   # serves as index in MongoDB
                "text": <sentence text>,
                "tokens": <list of token tuples>,
                "links": <dict from parse_links> for Kres, None for ssj,
            }

    Raises:
        KeyError: if two sentences produce the same sentence id.
        IOError: if a Kres sentence has no ``links`` element.
    """
    with open(filepath, "rb") as fp:
        utf8str = fp.read().decode("utf-8")

    # Remove namespaces so elements and attributes can be addressed by
    # their bare names (``xml:id`` becomes ``id``).
    utf8str = re.sub(r'\sxmlns="[^"]+"', '', utf8str, count=1)
    utf8str = re.sub(' xml:', ' ', utf8str)
    root = etree.XML(utf8str.encode("utf-8"))

    # in ssj, there are divs, in Kres, there are separate files
    if "id" in root.keys():
        guess_corpus = "KRES"
        divs = [root]
    else:
        guess_corpus = "SSJ"
        divs = root.findall(".//div")

    res_dict = {}
    for div in divs:
        f_id = div.get("id")
        for p in div.findall(".//p"):
            p_id = p.get("id").split(".")[-1]
            for s in p.findall(".//s"):
                s_id = s.get("id").split(".")[-1]
                sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                if sentence_id in res_dict:
                    raise KeyError("duplicated id: {}".format(sentence_id))
                sentence_text, sentence_tokens = _parse_sentence(
                    s, guess_corpus)
                res_dict[sentence_id] = {
                    "sid": sentence_id,
                    "text": sentence_text,
                    "tokens": sentence_tokens,
                    "links": (
                        parse_links(s) if guess_corpus == "KRES" else None
                    ),
                }
    return res_dict


def parse_links(s_el):
    """Parse the dependency links of a sentence element.

    Only the first ``links`` element found below ``s_el`` is used.

    Args:
        s_el: sentence element containing a ``links`` descendant.

    Returns:
        A dict mapping the dependent token number to a tuple
        ``(afun, dep, from)``, where ``dep`` and ``from`` are the integer
        suffixes (after the last ``.``) of the corresponding attributes.

    Raises:
        IOError: if ``s_el`` has no ``links`` descendant.
    """
    lgrps = s_el.findall(".//links")
    if not lgrps:
        raise IOError("Can't find links.")
    res_links = {}
    for link in lgrps[0]:
        dep = int(link.get("dep").split(".")[-1])
        res_links[dep] = (
            link.get("afun"),
            dep,
            int(link.get("from").split(".")[-1]),
        )
    return res_links


def to_conll09(sentence_entry):
    """Render one parsed sentence as a CoNLL-2009 style block.

    Works with Kres entries, i.e. entries whose ``"links"`` were parsed;
    only ``"w"`` tokens are emitted.  Columns per line, tab separated:
    ID, FORM, LEMMA, PLEMMA, POS, PPOS, FEAT, PFEAT, HEAD, PHEAD, DEPREL,
    PDEPREL, FILLPRED, PRED, APRED.  POS/PPOS/FEAT/PFEAT/APRED are still
    ``"todo"`` placeholders.

    Args:
        sentence_entry: one value of the dict returned by ``parse_tei``.

    Returns:
        The token lines followed by one empty line, as a single string.

    Raises:
        KeyError: if a word token has no entry in ``sentence_entry["links"]``.
        TypeError: if ``sentence_entry["links"]`` is None (ssj entries).
    """

    def fillpred(pos, feat):
        # TODO: decide whether the token is a predicate ("Y"); until then
        # every token is marked as a non-predicate.
        return "_"

    msdm = Msdmap()
    links = sentence_entry["links"]

    rows = []
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
        # NOTE(review): the translated MSD is not emitted yet; presumably it
        # will feed the POS/FEAT placeholder columns - confirm.  The call is
        # kept so that any error it raises still surfaces.
        msd = msdm.msd_from_slo(token[4])
        fprd = fillpred("todo", "todo")

        t_id = token[1]
        # parse_links stores (afun, dep, from): the label is at index 0 and
        # the head at index 2 (index 1 is the token's own id).
        afun, _dep, head = links[t_id]

        columns = (
            t_id,      # id
            token[2],  # form
            token[3],  # lemma
            token[3],  # plemma
            "todo",    # pos (TODO)
            "todo",    # ppos (TODO)
            "todo",    # feat (TODO)
            "todo",    # pfeat (TODO)
            head,      # head
            head,      # phead
            afun,      # deprel
            afun,      # pdeprel
            fprd,      # fillpred
            (token[3] if fprd == "Y" else "_"),  # pred
            "todo",    # apredn...
        )
        # Joining the columns keeps the number of fields in sync with the
        # tuple above; a fixed format string silently drops extra values.
        rows.append("\t".join(str(col) for col in columns) + "\n")

    return "".join(rows) + "\n"