forked from kristjan/cjvt-valency
		
	bug: matching srl_json files with kres_xml files
This commit is contained in:
		
							parent
							
								
									1654548310
								
							
						
					
					
						commit
						55c07f88ca
					
				
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,3 +1,3 @@ | |||||||
| data/samples/ | data/samples/ | ||||||
| */pycache/ | */__pycache__/ | ||||||
| *egg-info/ | *egg-info/ | ||||||
|  | |||||||
| @ -1,6 +1,7 @@ | |||||||
| from corpusparser import Sentence | from corpusparser import Sentence | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import re | import re | ||||||
|  | import json | ||||||
| from lxml import etree | from lxml import etree | ||||||
| 
 | 
 | ||||||
| # Read input file(.xml, .json; kres or ssj500k).   | # Read input file(.xml, .json; kres or ssj500k).   | ||||||
| @ -26,15 +27,8 @@ class Parser(): | |||||||
|         if self.corpus == "kres": |         if self.corpus == "kres": | ||||||
|             return self.parse_jos_links_kres(sent_el) |             return self.parse_jos_links_kres(sent_el) | ||||||
|         else: |         else: | ||||||
|             return self.parse_jos_links_ssj(sent_el) |             # 'syntax' is the linkgroup we're looking for | ||||||
| 
 |             return self.parse_any_links_ssj(sent_el, "syntax") | ||||||
|     def parse_ssj_target_arg(self, text): |  | ||||||
|         # from: 0, to: 6 |  | ||||||
|         # <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/> |  | ||||||
|         # from: 6, to: 7 |  | ||||||
|         # <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/> |  | ||||||
|         lst = [x.split(".")[-1] for x in text.split(" ")] |  | ||||||
|         return [int(x[1:] if x[0] == "t" else 0) for x in lst] |  | ||||||
| 
 | 
 | ||||||
|     def parse_jos_links_kres(self, sent_el): |     def parse_jos_links_kres(self, sent_el): | ||||||
|         lgrps = sent_el.findall(".//links") |         lgrps = sent_el.findall(".//links") | ||||||
| @ -49,103 +43,138 @@ class Parser(): | |||||||
|             }] |             }] | ||||||
|         return res_links |         return res_links | ||||||
| 
 | 
 | ||||||
|     def parse_jos_links_ssj(self, sent_el): |     def parse_ssj_target_arg(self, text): | ||||||
|  |         # from: 0, to: 6 | ||||||
|  |         # <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/> | ||||||
|  |         # from: 6, to: 7 | ||||||
|  |         # <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/> | ||||||
|  |         lst = [x.split(".")[-1] for x in text.split(" ")] | ||||||
|  |         return [int(x[1:] if x[0] == "t" else 0) for x in lst] | ||||||
|  | 
 | ||||||
|  |     def parse_any_links_ssj(self, sent_el, links_type): | ||||||
|         lgrps = sent_el.findall(".//linkGrp") |         lgrps = sent_el.findall(".//linkGrp") | ||||||
|         if len(lgrps) < 1: |         links = [x for x in lgrps if x.get("type") == links_type][0] | ||||||
|             # print(etree.tostring(sent_el)) |  | ||||||
|             raise IOError("Can't find links.") |  | ||||||
|         res_links = [] |         res_links = [] | ||||||
|         for link in lgrps[0]: |         for link in links: | ||||||
|             print(link) |  | ||||||
|             tar = self.parse_ssj_target_arg(link.get("target")) |             tar = self.parse_ssj_target_arg(link.get("target")) | ||||||
|             res_links += [{ |             res_links += [{ | ||||||
|                 "from": tar[0], |                 "from": tar[0], | ||||||
|                 "afun": link.get("ana").split(":")[1], |                 "afun": link.get("ana").split(":")[1], | ||||||
|                 "to": [1], |                 "to": tar[1], | ||||||
|             }] |             }] | ||||||
|         return res_links |         return res_links | ||||||
| 
 | 
 | ||||||
|  |     def parse_srl_links(self, sent_el, xml_file=None): | ||||||
|  |         if self.corpus == "kres": | ||||||
|  |             return self.parse_srl_links_kres(sent_el, xml_file) | ||||||
|  |         else: | ||||||
|  |             return self.parse_any_links_ssj(sent_el, "SRL") | ||||||
|  | 
 | ||||||
|  |     def parse_srl_links_kres(self, sent_el, sent_srl_dict): | ||||||
|  |         print(sent_srl_dict) | ||||||
|  |         # find the corresponding json file with srl links | ||||||
|  |         return "TODO" | ||||||
|  | 
 | ||||||
|     def parse(self): |     def parse(self): | ||||||
|         if self.corpus == "kres": |         if self.corpus == "kres": | ||||||
|             print("parse kres: TODO") |             for xml_file in self.kres_folder.iterdir(): | ||||||
|  |                 self.parse_xml_file(xml_file) | ||||||
|  |                 break  # TODO dev break | ||||||
|         else: |         else: | ||||||
|             self.parse_xml_file(self.ssj_file) |             self.parse_xml_file(self.ssj_file) | ||||||
| 
 | 
 | ||||||
|     def parse_xml_file(self, filepath): |     def parse_xml_file(self, xml_file): | ||||||
|         res_dict = {} |         srl_dict = {} | ||||||
|         with filepath.open("rb") as fp: |         if self.corpus == "kres":   | ||||||
|  |             # in case of kres, read the SRL links from a separate json file | ||||||
|  |             file_id = xml_file.name.split(".")[0] | ||||||
|  |             json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json") | ||||||
|  |             with json_file.open("r") as fp: | ||||||
|  |                 srl_dict = json.loads(fp.read()) | ||||||
|  | 
 | ||||||
|  |         with xml_file.open("rb") as fp: | ||||||
|             # remove namespaces |             # remove namespaces | ||||||
|             bstr = fp.read() |             bstr = fp.read() | ||||||
| 
 | 
 | ||||||
|             utf8str = bstr.decode("utf-8") |         utf8str = bstr.decode("utf-8") | ||||||
|             utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) |         utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) | ||||||
|             utf8str = re.sub(' xml:', ' ', utf8str) |         utf8str = re.sub(' xml:', ' ', utf8str) | ||||||
| 
 | 
 | ||||||
|             root = etree.XML(utf8str.encode("utf-8")) |         root = etree.XML(utf8str.encode("utf-8")) | ||||||
| 
 | 
 | ||||||
|             divs = []  # in ssj, there are divs, in Kres, there are separate files |         divs = []  # in ssj, there are divs, in Kres, there are separate files | ||||||
|             if self.corpus == "kres": |         if self.corpus == "kres": | ||||||
|                 divs = [root] |             divs = [root] | ||||||
|             else: |         else: | ||||||
|                 divs = root.findall(".//div") |             divs = root.findall(".//div") | ||||||
| 
 | 
 | ||||||
|             # parse divs |         res_dict = []  # TODO: try making an iterator instead | ||||||
|             for div in divs: |  | ||||||
|                 f_id = div.get("id") |  | ||||||
| 
 | 
 | ||||||
|                 # parse paragraphs |         # parse divs | ||||||
|                 for p in div.findall(".//p"): |         for div in divs: | ||||||
|                     p_id = p.get("id").split(".")[-1] |             f_id = div.get("id") | ||||||
| 
 | 
 | ||||||
|                     # parse sentences |             # parse paragraphs | ||||||
|                     for s in p.findall(".//s"): |             for p in div.findall(".//p"): | ||||||
|                         s_id = s.get("id").split(".")[-1] |                 p_id = p.get("id").split(".")[-1] | ||||||
|                         sentence_text = "" |  | ||||||
|                         sentence_tokens = [] |  | ||||||
| 
 | 
 | ||||||
|                         # parse tokens |                 # parse sentences | ||||||
|                         for el in s.iter(): |                 for s in p.findall(".//s"): | ||||||
|                             if el.tag in self.W_TAGS: |                     s_id = s.get("id").split(".")[-1] | ||||||
|                                 el_id = el.get("id").split(".")[-1] |                     sentence_text = "" | ||||||
|                                 if el_id[0] == 't': |                     sentence_tokens = [] | ||||||
|                                     el_id = el_id[1:]  # ssj W_TAG ids start with t |  | ||||||
|                                 sentence_text += el.text |  | ||||||
|                                 sentence_tokens += [{ |  | ||||||
|                                     "word": True, |  | ||||||
|                                     "tid": int(el_id), |  | ||||||
|                                     "text": el.text, |  | ||||||
|                                     "lemma": el.get("lemma"), |  | ||||||
|                                     "msd": (el.get("msd") if self.corpus == "kres" |  | ||||||
|                                         else el.get("ana").split(":")[-1]), |  | ||||||
|                                 }] |  | ||||||
|                             elif el.tag in self.C_TAGS: |  | ||||||
|                                 # only Kres' C_TAGS have ids |  | ||||||
|                                 el_id = el.get("id") or "none" |  | ||||||
|                                 el_id = el_id.split(".")[-1] |  | ||||||
|                                 sentence_text += el.text |  | ||||||
|                                 sentence_tokens += [{ |  | ||||||
|                                     "word": False, |  | ||||||
|                                     "tid": el_id, |  | ||||||
|                                     "text": el.text, |  | ||||||
|                                 }] |  | ||||||
|                             elif el.tag in self.S_TAGS: |  | ||||||
|                                 # Kres' <S /> doesn't contain .text |  | ||||||
|                                 sentence_text += " " |  | ||||||
|                             else: |  | ||||||
|                                 # pass links and linkGroups |  | ||||||
|                                 pass |  | ||||||
|                         sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) |  | ||||||
| 
 | 
 | ||||||
|                         # make a generator instead of holding the whole corpus in memory |                     # parse tokens | ||||||
|                         if sentence_id in res_dict: |                     for el in s.iter(): | ||||||
|                             raise KeyError("duplicated id: {}".format(sentence_id)) |                         if el.tag in self.W_TAGS: | ||||||
|                         res_dict[sentence_id] = { |                             el_id = el.get("id").split(".")[-1] | ||||||
|                             "sid": sentence_id, |                             if el_id[0] == 't': | ||||||
|                             "text": sentence_text, |                                 el_id = el_id[1:]  # ssj W_TAG ids start with t | ||||||
|                             "tokens": sentence_tokens, |                             sentence_text += el.text | ||||||
|                             "jos_links": self.parse_jos_links(s) |                             sentence_tokens += [{ | ||||||
|                         } |                                 "word": True, | ||||||
|  |                                 "tid": int(el_id), | ||||||
|  |                                 "text": el.text, | ||||||
|  |                                 "lemma": el.get("lemma"), | ||||||
|  |                                 "msd": (el.get("msd") if self.corpus == "kres" | ||||||
|  |                                     else el.get("ana").split(":")[-1]), | ||||||
|  |                             }] | ||||||
|  |                         elif el.tag in self.C_TAGS: | ||||||
|  |                             # only Kres' C_TAGS have ids | ||||||
|  |                             el_id = el.get("id") or "none" | ||||||
|  |                             el_id = el_id.split(".")[-1] | ||||||
|  |                             sentence_text += el.text | ||||||
|  |                             sentence_tokens += [{ | ||||||
|  |                                 "word": False, | ||||||
|  |                                 "tid": el_id, | ||||||
|  |                                 "text": el.text, | ||||||
|  |                             }] | ||||||
|  |                         elif el.tag in self.S_TAGS: | ||||||
|  |                             # Kres' <S /> doesn't contain .text | ||||||
|  |                             sentence_text += " " | ||||||
|  |                         else: | ||||||
|  |                             # pass links and linkGroups | ||||||
|  |                             pass | ||||||
|  |                     sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) | ||||||
| 
 | 
 | ||||||
|                         print(res_dict[sentence_id]) |                     # make a generator instead of holding the whole corpus in memory | ||||||
|                         break |                     # TODO -- match ids | ||||||
|  |                     print("---") | ||||||
|  |                     print(sorted(srl_dict.keys(), key=lambda x: x.split(".")[1])[:100]) | ||||||
|  |                     print(sentence_id) | ||||||
|  |                     print(srl_dict.get(str(sentence_id))) | ||||||
|  |                     print("---") | ||||||
|  |                     if sentence_id in res_dict: | ||||||
|  |                         raise KeyError("duplicated id: {}".format(sentence_id)) | ||||||
|  |                     res_dict[sentence_id] = { | ||||||
|  |                         "sid": sentence_id, | ||||||
|  |                         "text": sentence_text, | ||||||
|  |                         "tokens": sentence_tokens, | ||||||
|  |                         "jos_links": self.parse_jos_links(s), | ||||||
|  |                         "srl_links": self.parse_srl_links(s, srl_dict[sentence_id]), | ||||||
|  |                     } | ||||||
|  | 
 | ||||||
|  |                     print(res_dict[sentence_id]) | ||||||
|  |                     print("------------------------------------------------- END") | ||||||
|  |                     return  # TODO dev break | ||||||
|         return res_dict |         return res_dict | ||||||
|  | |||||||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -9,8 +9,17 @@ if __name__ == "__main__": | |||||||
| 	args = parser.parse_args()	 | 	args = parser.parse_args()	 | ||||||
| 
 | 
 | ||||||
| 	# parse ssj | 	# parse ssj | ||||||
|  | 	""" | ||||||
| 	ssj_parser = Parser( | 	ssj_parser = Parser( | ||||||
| 		corpus="ssj", | 		corpus="ssj", | ||||||
| 		infiles=[args.ssj_file] | 		infiles=[args.ssj_file] | ||||||
| 	) | 	) | ||||||
| 	ssj_parser.parse() | 	ssj_parser.parse() | ||||||
|  | 	""" | ||||||
|  | 
 | ||||||
|  | 	# parse kres | ||||||
|  | 	kres_parser = Parser( | ||||||
|  | 		corpus="kres", | ||||||
|  | 		infiles=[args.kres_folder, args.kres_srl_folder] | ||||||
|  | 	) | ||||||
|  | 	kres_parser.parse() | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user