from lxml import etree
import re

W_TAGS = ['w']
C_TAGS = ['c']
S_TAGS = ['S', 'pc']

# reads a TEI xml file and returns a dictionary:
# { <sentence_id>: {
#       sid: <sentence_id>,  # serves as index in MongoDB
#       text: <plain sentence text>,
#       tokens: <list of token tuples>,
#       links: <dependency links, Kres only>,
# }}
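#
# An illustrative entry (ids and values are invented for illustration only):
# "F0006347.5.1": {
#     "sid": "F0006347.5.1",
#     "text": "Pes teče. ",
#     "tokens": [("w", 1, "Pes", "pes", "Ncmsn"), ...],
#     "links": {1: ("Sb", 1, 2), ...},
# }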


def parse_tei(filepath):
    guess_corpus = None  # SSJ | KRES
    res_dict = {}
    with open(filepath, "r") as fp:
        # remove namespaces
        xmlstr = fp.read()
        xmlstr = re.sub(r'\sxmlns="[^"]+"', '', xmlstr, count=1)
        xmlstr = re.sub(' xml:', ' ', xmlstr)
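        # e.g. '<TEI xmlns="http://www.tei-c.org/ns/1.0">' becomes '<TEI>'
        # and ' xml:id="..."' becomes ' id="..."', so element lookups and
        # attribute reads below need no namespace prefixes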

        root = etree.XML(xmlstr.encode("utf-8"))

        divs = []  # in ssj, there are divs; in Kres, there are separate files
        if "id" in root.keys():
            # Kres files start with <TEI id=...>
            guess_corpus = "KRES"
            divs = [root]
        else:
            guess_corpus = "SSJ"
            divs = root.findall(".//div")

        # parse divs
        for div in divs:
            f_id = div.get("id")

            # parse paragraphs
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]

                # parse sentences
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text = ""
                    sentence_tokens = []

                    # parse tokens
                    for el in s.iter():
                        if el.tag in W_TAGS:
                            el_id = el.get("id").split(".")[-1]
                            if el_id[0] == 't':
                                el_id = el_id[1:]  # ssj W_TAG ids start with t
                            sentence_text += el.text
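                            # word token tuple: (tag, id, form, lemma, msd);
                            # Kres stores the MSD tag in the "msd" attribute,
                            # ssj in the "ana" attribute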
                            sentence_tokens += [(
                                "w",
                                int(el_id),
                                el.text,
                                el.get("lemma"),
                                (el.get("msd") if guess_corpus == "KRES"
                                    else el.get("ana").split(":")[-1]),
                            )]
                        elif el.tag in C_TAGS:
                            # only Kres' C_TAGS have ids
                            el_id = el.get("id") or "none"
                            el_id = el_id.split(".")[-1]
                            sentence_text += el.text
                            sentence_tokens += [("c", el_id, el.text,)]
                        elif el.tag in S_TAGS:
                            # Kres' <S /> doesn't contain .text
                            sentence_text += " "
                        else:
                            # pass links and linkGroups
                            pass
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "links": (
                            parse_links(s) if guess_corpus == "KRES" else None
                        )
                    }
    return res_dict


def parse_links(s_el):
    # maps dependent token id -> (afun, dependent id, head id)
    lgrps = s_el.findall(".//links")
    if len(lgrps) < 1:
        raise IOError("Can't find links.")
    res_links = {}
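    # each <link> carries JOS-style attributes, roughly
    # <link afun="Sb" from="F0006347.5.1.2" dep="F0006347.5.1.1"/>
    # (attribute values above are invented for illustration); only the
    # numeric suffix after the last "." is kept for ids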
    for link in lgrps[0]:
        dep = int(link.get("dep").split(".")[-1])
        res_links[dep] = (
            link.get("afun"),
            dep,
            int(link.get("from").split(".")[-1]),
        )
    return res_links


def to_conll09(sentence_entry):
    # works with Kres entries that have parsed links
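    # emits the first 12 CoNLL-2009 columns (ID..PDEPREL); the predicate
    # columns (FILLPRED, PRED, APREDs) are left out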
    out_str = ""
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
        print(token)  # debug output
        print(sentence_entry["links"])  # debug output
        t_id = token[1]
        print(t_id)  # debug output
        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,  # id
            token[2],  # form
            token[3],  # lemma
            token[3],  # plemma
            "todo",  # pos (TODO)
            "todo",  # ppos (TODO)
            "todo",  # feat (TODO)
            "todo",  # pfeat (TODO)
            sentence_entry["links"][t_id][2],  # head
            sentence_entry["links"][t_id][2],  # phead
            sentence_entry["links"][t_id][0],  # deprel (afun)
            sentence_entry["links"][t_id][0],  # pdeprel
        )
    out_str += "\n"
    return out_str
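

# Minimal usage sketch (assumption: invoked as a script with a TEI file path
# as the first argument; this guard is illustrative, not part of the
# original pipeline).
if __name__ == "__main__":
    import sys

    parsed = parse_tei(sys.argv[1])
    for sid, entry in parsed.items():
        print("{}\t{}".format(sid, entry["text"]))
        # CoNLL-09 conversion only works for Kres entries with parsed links
        if entry["links"] is not None:
            print(to_conll09(entry))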