forked from kristjan/cjvt-valency
		
	bug: matching srl_json files with kres_xml files
This commit is contained in:
		
							parent
							
								
									1654548310
								
							
						
					
					
						commit
						55c07f88ca
					
				
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,3 +1,3 @@ | ||||
| data/samples/ | ||||
| */pycache/ | ||||
| */__pycache__/ | ||||
| *egg-info/ | ||||
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| from corpusparser import Sentence | ||||
| from pathlib import Path | ||||
| import re | ||||
| import json | ||||
| from lxml import etree | ||||
| 
 | ||||
| # Read input file(.xml, .json; kres or ssj500k).   | ||||
| @ -26,15 +27,8 @@ class Parser(): | ||||
|         if self.corpus == "kres": | ||||
|             return self.parse_jos_links_kres(sent_el) | ||||
|         else: | ||||
|             return self.parse_jos_links_ssj(sent_el) | ||||
| 
 | ||||
|     def parse_ssj_target_arg(self, text): | ||||
|         # from: 0, to: 6 | ||||
|         # <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/> | ||||
|         # from: 6, to: 7 | ||||
|         # <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/> | ||||
|         lst = [x.split(".")[-1] for x in text.split(" ")] | ||||
|         return [int(x[1:] if x[0] == "t" else 0) for x in lst] | ||||
|             # 'syntax' is the linkgroup we're looking for | ||||
|             return self.parse_any_links_ssj(sent_el, "syntax") | ||||
| 
 | ||||
|     def parse_jos_links_kres(self, sent_el): | ||||
|         lgrps = sent_el.findall(".//links") | ||||
| @ -49,103 +43,138 @@ class Parser(): | ||||
|             }] | ||||
|         return res_links | ||||
| 
 | ||||
|     def parse_jos_links_ssj(self, sent_el): | ||||
|     def parse_ssj_target_arg(self, text): | ||||
|         # from: 0, to: 6 | ||||
|         # <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/> | ||||
|         # from: 6, to: 7 | ||||
|         # <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/> | ||||
|         lst = [x.split(".")[-1] for x in text.split(" ")] | ||||
|         return [int(x[1:] if x[0] == "t" else 0) for x in lst] | ||||
| 
 | ||||
|     def parse_any_links_ssj(self, sent_el, links_type): | ||||
|         lgrps = sent_el.findall(".//linkGrp") | ||||
|         if len(lgrps) < 1: | ||||
|             # print(etree.tostring(sent_el)) | ||||
|             raise IOError("Can't find links.") | ||||
|         links = [x for x in lgrps if x.get("type") == links_type][0] | ||||
|         res_links = [] | ||||
|         for link in lgrps[0]: | ||||
|             print(link) | ||||
|         for link in links: | ||||
|             tar = self.parse_ssj_target_arg(link.get("target")) | ||||
|             res_links += [{ | ||||
|                 "from": tar[0], | ||||
|                 "afun": link.get("ana").split(":")[1], | ||||
|                 "to": [1], | ||||
|                 "to": tar[1], | ||||
|             }] | ||||
|         return res_links | ||||
| 
 | ||||
|     def parse_srl_links(self, sent_el, xml_file=None): | ||||
|         if self.corpus == "kres": | ||||
|             return self.parse_srl_links_kres(sent_el, xml_file) | ||||
|         else: | ||||
|             return self.parse_any_links_ssj(sent_el, "SRL") | ||||
| 
 | ||||
|     def parse_srl_links_kres(self, sent_el, sent_srl_dict): | ||||
|         print(sent_srl_dict) | ||||
|         # find the corresponding json file with srl links | ||||
|         return "TODO" | ||||
| 
 | ||||
|     def parse(self): | ||||
|         if self.corpus == "kres": | ||||
|             print("parse kres: TODO") | ||||
|             for xml_file in self.kres_folder.iterdir(): | ||||
|                 self.parse_xml_file(xml_file) | ||||
|                 break  # TODO dev break | ||||
|         else: | ||||
|             self.parse_xml_file(self.ssj_file) | ||||
| 
 | ||||
|     def parse_xml_file(self, filepath): | ||||
|         res_dict = {} | ||||
|         with filepath.open("rb") as fp: | ||||
|     def parse_xml_file(self, xml_file): | ||||
|         srl_dict = {} | ||||
|         if self.corpus == "kres":   | ||||
|             # in case of kres, read the SRL links from a separate json file | ||||
|             file_id = xml_file.name.split(".")[0] | ||||
|             json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json") | ||||
|             with json_file.open("r") as fp: | ||||
|                 srl_dict = json.loads(fp.read()) | ||||
| 
 | ||||
|         with xml_file.open("rb") as fp: | ||||
|             # remove namespaces | ||||
|             bstr = fp.read() | ||||
| 
 | ||||
|             utf8str = bstr.decode("utf-8") | ||||
|             utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) | ||||
|             utf8str = re.sub(' xml:', ' ', utf8str) | ||||
|         utf8str = bstr.decode("utf-8") | ||||
|         utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) | ||||
|         utf8str = re.sub(' xml:', ' ', utf8str) | ||||
| 
 | ||||
|             root = etree.XML(utf8str.encode("utf-8")) | ||||
|         root = etree.XML(utf8str.encode("utf-8")) | ||||
| 
 | ||||
|             divs = []  # in ssj, there are divs, in Kres, there are separate files | ||||
|             if self.corpus == "kres": | ||||
|                 divs = [root] | ||||
|             else: | ||||
|                 divs = root.findall(".//div") | ||||
|         divs = []  # in ssj, there are divs, in Kres, there are separate files | ||||
|         if self.corpus == "kres": | ||||
|             divs = [root] | ||||
|         else: | ||||
|             divs = root.findall(".//div") | ||||
| 
 | ||||
|             # parse divs | ||||
|             for div in divs: | ||||
|                 f_id = div.get("id") | ||||
|         res_dict = []  # TODO: try making an iterator instead | ||||
| 
 | ||||
|                 # parse paragraphs | ||||
|                 for p in div.findall(".//p"): | ||||
|                     p_id = p.get("id").split(".")[-1] | ||||
|         # parse divs | ||||
|         for div in divs: | ||||
|             f_id = div.get("id") | ||||
| 
 | ||||
|                     # parse sentences | ||||
|                     for s in p.findall(".//s"): | ||||
|                         s_id = s.get("id").split(".")[-1] | ||||
|                         sentence_text = "" | ||||
|                         sentence_tokens = [] | ||||
|             # parse paragraphs | ||||
|             for p in div.findall(".//p"): | ||||
|                 p_id = p.get("id").split(".")[-1] | ||||
| 
 | ||||
|                         # parse tokens | ||||
|                         for el in s.iter(): | ||||
|                             if el.tag in self.W_TAGS: | ||||
|                                 el_id = el.get("id").split(".")[-1] | ||||
|                                 if el_id[0] == 't': | ||||
|                                     el_id = el_id[1:]  # ssj W_TAG ids start with t | ||||
|                                 sentence_text += el.text | ||||
|                                 sentence_tokens += [{ | ||||
|                                     "word": True, | ||||
|                                     "tid": int(el_id), | ||||
|                                     "text": el.text, | ||||
|                                     "lemma": el.get("lemma"), | ||||
|                                     "msd": (el.get("msd") if self.corpus == "kres" | ||||
|                                         else el.get("ana").split(":")[-1]), | ||||
|                                 }] | ||||
|                             elif el.tag in self.C_TAGS: | ||||
|                                 # only Kres' C_TAGS have ids | ||||
|                                 el_id = el.get("id") or "none" | ||||
|                                 el_id = el_id.split(".")[-1] | ||||
|                                 sentence_text += el.text | ||||
|                                 sentence_tokens += [{ | ||||
|                                     "word": False, | ||||
|                                     "tid": el_id, | ||||
|                                     "text": el.text, | ||||
|                                 }] | ||||
|                             elif el.tag in self.S_TAGS: | ||||
|                                 # Kres' <S /> doesn't contain .text | ||||
|                                 sentence_text += " " | ||||
|                             else: | ||||
|                                 # pass links and linkGroups | ||||
|                                 pass | ||||
|                         sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) | ||||
|                 # parse sentences | ||||
|                 for s in p.findall(".//s"): | ||||
|                     s_id = s.get("id").split(".")[-1] | ||||
|                     sentence_text = "" | ||||
|                     sentence_tokens = [] | ||||
| 
 | ||||
|                         # make a generator instead of holding the whole corpus in memory | ||||
|                         if sentence_id in res_dict: | ||||
|                             raise KeyError("duplicated id: {}".format(sentence_id)) | ||||
|                         res_dict[sentence_id] = { | ||||
|                             "sid": sentence_id, | ||||
|                             "text": sentence_text, | ||||
|                             "tokens": sentence_tokens, | ||||
|                             "jos_links": self.parse_jos_links(s) | ||||
|                         } | ||||
|                     # parse tokens | ||||
|                     for el in s.iter(): | ||||
|                         if el.tag in self.W_TAGS: | ||||
|                             el_id = el.get("id").split(".")[-1] | ||||
|                             if el_id[0] == 't': | ||||
|                                 el_id = el_id[1:]  # ssj W_TAG ids start with t | ||||
|                             sentence_text += el.text | ||||
|                             sentence_tokens += [{ | ||||
|                                 "word": True, | ||||
|                                 "tid": int(el_id), | ||||
|                                 "text": el.text, | ||||
|                                 "lemma": el.get("lemma"), | ||||
|                                 "msd": (el.get("msd") if self.corpus == "kres" | ||||
|                                     else el.get("ana").split(":")[-1]), | ||||
|                             }] | ||||
|                         elif el.tag in self.C_TAGS: | ||||
|                             # only Kres' C_TAGS have ids | ||||
|                             el_id = el.get("id") or "none" | ||||
|                             el_id = el_id.split(".")[-1] | ||||
|                             sentence_text += el.text | ||||
|                             sentence_tokens += [{ | ||||
|                                 "word": False, | ||||
|                                 "tid": el_id, | ||||
|                                 "text": el.text, | ||||
|                             }] | ||||
|                         elif el.tag in self.S_TAGS: | ||||
|                             # Kres' <S /> doesn't contain .text | ||||
|                             sentence_text += " " | ||||
|                         else: | ||||
|                             # pass links and linkGroups | ||||
|                             pass | ||||
|                     sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) | ||||
| 
 | ||||
|                         print(res_dict[sentence_id]) | ||||
|                         break | ||||
|                     # make a generator instead of holding the whole corpus in memory | ||||
|                     # TODO -- match ids | ||||
|                     print("---") | ||||
|                     print(sorted(srl_dict.keys(), key=lambda x: x.split(".")[1])[:100]) | ||||
|                     print(sentence_id) | ||||
|                     print(srl_dict.get(str(sentence_id))) | ||||
|                     print("---") | ||||
|                     if sentence_id in res_dict: | ||||
|                         raise KeyError("duplicated id: {}".format(sentence_id)) | ||||
|                     res_dict[sentence_id] = { | ||||
|                         "sid": sentence_id, | ||||
|                         "text": sentence_text, | ||||
|                         "tokens": sentence_tokens, | ||||
|                         "jos_links": self.parse_jos_links(s), | ||||
|                         "srl_links": self.parse_srl_links(s, srl_dict[sentence_id]), | ||||
|                     } | ||||
| 
 | ||||
|                     print(res_dict[sentence_id]) | ||||
|                     print("------------------------------------------------- END") | ||||
|                     return  # TODO dev break | ||||
|         return res_dict | ||||
|  | ||||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -9,8 +9,17 @@ if __name__ == "__main__": | ||||
| 	args = parser.parse_args()	 | ||||
| 
 | ||||
| 	# parse ssj | ||||
| 	""" | ||||
| 	ssj_parser = Parser( | ||||
| 		corpus="ssj", | ||||
| 		infiles=[args.ssj_file] | ||||
| 	) | ||||
| 	ssj_parser.parse() | ||||
| 	""" | ||||
| 
 | ||||
| 	# parse kres | ||||
| 	kres_parser = Parser( | ||||
| 		corpus="kres", | ||||
| 		infiles=[args.kres_folder, args.kres_srl_folder] | ||||
| 	) | ||||
| 	kres_parser.parse() | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user