diff --git a/.gitignore b/.gitignore index ecb8742..c57e05a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ data/samples/ -*/pycache/ +*/__pycache__/ *egg-info/ diff --git a/README.md b/README.md index a956d10..c35bf79 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# cjvt-vezljivost +# cjvt-valency ## Components diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py index dc93554..9fb0e91 100644 --- a/src/pkg/corpusparser/corpusparser/Parser.py +++ b/src/pkg/corpusparser/corpusparser/Parser.py @@ -1,6 +1,7 @@ from corpusparser import Sentence from pathlib import Path import re +import json from lxml import etree # Read input file(.xml, .json; kres or ssj500k). @@ -26,15 +27,8 @@ class Parser(): if self.corpus == "kres": return self.parse_jos_links_kres(sent_el) else: - return self.parse_jos_links_ssj(sent_el) - - def parse_ssj_target_arg(self, text): - # from: 0, to: 6 - # - # from: 6, to: 7 - # - lst = [x.split(".")[-1] for x in text.split(" ")] - return [int(x[1:] if x[0] == "t" else 0) for x in lst] + # 'syntax' is the linkgroup we're looking for + return self.parse_any_links_ssj(sent_el, "syntax") def parse_jos_links_kres(self, sent_el): lgrps = sent_el.findall(".//links") @@ -49,103 +43,138 @@ class Parser(): }] return res_links - def parse_jos_links_ssj(self, sent_el): + def parse_ssj_target_arg(self, text): + # from: 0, to: 6 + # + # from: 6, to: 7 + # + lst = [x.split(".")[-1] for x in text.split(" ")] + return [int(x[1:] if x[0] == "t" else 0) for x in lst] + + def parse_any_links_ssj(self, sent_el, links_type): lgrps = sent_el.findall(".//linkGrp") - if len(lgrps) < 1: - # print(etree.tostring(sent_el)) - raise IOError("Can't find links.") + links = [x for x in lgrps if x.get("type") == links_type][0] res_links = [] - for link in lgrps[0]: - print(link) + for link in links: tar = self.parse_ssj_target_arg(link.get("target")) res_links += [{ "from": tar[0], "afun": 
link.get("ana").split(":")[1], - "to": [1], + "to": tar[1], }] return res_links + def parse_srl_links(self, sent_el, xml_file=None): + if self.corpus == "kres": + return self.parse_srl_links_kres(sent_el, xml_file) + else: + return self.parse_any_links_ssj(sent_el, "SRL") + + def parse_srl_links_kres(self, sent_el, sent_srl_dict): + print(sent_srl_dict) + # find the corresponding json file with srl links + return "TODO" + def parse(self): if self.corpus == "kres": - print("parse kres: TODO") + for xml_file in self.kres_folder.iterdir(): + self.parse_xml_file(xml_file) + break # TODO dev break else: self.parse_xml_file(self.ssj_file) - def parse_xml_file(self, filepath): - res_dict = {} - with filepath.open("rb") as fp: + def parse_xml_file(self, xml_file): + srl_dict = {} + if self.corpus == "kres": + # in case of kres, read the SRL links from a separate json file + file_id = xml_file.name.split(".")[0] + json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json") + with json_file.open("r") as fp: + srl_dict = json.loads(fp.read()) + + with xml_file.open("rb") as fp: # remove namespaces bstr = fp.read() - utf8str = bstr.decode("utf-8") - utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) - utf8str = re.sub(' xml:', ' ', utf8str) - - root = etree.XML(utf8str.encode("utf-8")) - - divs = [] # in ssj, there are divs, in Kres, there are separate files - if self.corpus == "kres": - divs = [root] - else: - divs = root.findall(".//div") - - # parse divs - for div in divs: - f_id = div.get("id") - - # parse paragraphs - for p in div.findall(".//p"): - p_id = p.get("id").split(".")[-1] - - # parse sentences - for s in p.findall(".//s"): - s_id = s.get("id").split(".")[-1] - sentence_text = "" - sentence_tokens = [] - - # parse tokens - for el in s.iter(): - if el.tag in self.W_TAGS: - el_id = el.get("id").split(".")[-1] - if el_id[0] == 't': - el_id = el_id[1:] # ssj W_TAG ids start with t - sentence_text += el.text - sentence_tokens += [{ - "word": 
True, - "tid": int(el_id), - "text": el.text, - "lemma": el.get("lemma"), - "msd": (el.get("msd") if self.corpus == "kres" - else el.get("ana").split(":")[-1]), - }] - elif el.tag in self.C_TAGS: - # only Kres' C_TAGS have ids - el_id = el.get("id") or "none" - el_id = el_id.split(".")[-1] - sentence_text += el.text - sentence_tokens += [{ - "word": False, - "tid": el_id, - "text": el.text, - }] - elif el.tag in self.S_TAGS: - # Kres' doesn't contain .text - sentence_text += " " - else: - # pass links and linkGroups - pass - sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) - - # make a generator instead of holding the whole corpus in memory - if sentence_id in res_dict: - raise KeyError("duplicated id: {}".format(sentence_id)) - res_dict[sentence_id] = { - "sid": sentence_id, - "text": sentence_text, - "tokens": sentence_tokens, - "jos_links": self.parse_jos_links(s) - } - - print(res_dict[sentence_id]) - break + utf8str = bstr.decode("utf-8") + utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) + utf8str = re.sub(' xml:', ' ', utf8str) + + root = etree.XML(utf8str.encode("utf-8")) + + divs = [] # in ssj, there are divs, in Kres, there are separate files + if self.corpus == "kres": + divs = [root] + else: + divs = root.findall(".//div") + + res_dict = {} # TODO: try making an iterator instead + + # parse divs + for div in divs: + f_id = div.get("id") + + # parse paragraphs + for p in div.findall(".//p"): + p_id = p.get("id").split(".")[-1] + + # parse sentences + for s in p.findall(".//s"): + s_id = s.get("id").split(".")[-1] + sentence_text = "" + sentence_tokens = [] + + # parse tokens + for el in s.iter(): + if el.tag in self.W_TAGS: + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + sentence_tokens += [{ + "word": True, + "tid": int(el_id), + "text": el.text, + "lemma": el.get("lemma"), + "msd": (el.get("msd") if self.corpus == "kres" + else 
el.get("ana").split(":")[-1]), + }] + elif el.tag in self.C_TAGS: + # only Kres' C_TAGS have ids + el_id = el.get("id") or "none" + el_id = el_id.split(".")[-1] + sentence_text += el.text + sentence_tokens += [{ + "word": False, + "tid": el_id, + "text": el.text, + }] + elif el.tag in self.S_TAGS: + # Kres' doesn't contain .text + sentence_text += " " + else: + # pass links and linkGroups + pass + sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) + + # make a generator instead of holding the whole corpus in memory + # TODO -- match ids + print("---") + print(sorted(srl_dict.keys(), key=lambda x: x.split(".")[1])[:100]) + print(sentence_id) + print(srl_dict.get(str(sentence_id))) + print("---") + if sentence_id in res_dict: + raise KeyError("duplicated id: {}".format(sentence_id)) + res_dict[sentence_id] = { + "sid": sentence_id, + "text": sentence_text, + "tokens": sentence_tokens, + "jos_links": self.parse_jos_links(s), + "srl_links": self.parse_srl_links(s, srl_dict[sentence_id]), + } + + print(res_dict[sentence_id]) + print("------------------------------------------------- END") + return # TODO dev break return res_dict diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc deleted file mode 100644 index 98e401c..0000000 Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc and /dev/null differ diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc deleted file mode 100644 index c0ae9f7..0000000 Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc and /dev/null differ diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc deleted file mode 100644 index a365c8e..0000000 Binary files 
a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc and /dev/null differ diff --git a/src/preflight/main_parse.py b/src/preflight/main_parse.py index d696cfb..35bb2bb 100644 --- a/src/preflight/main_parse.py +++ b/src/preflight/main_parse.py @@ -9,8 +9,17 @@ if __name__ == "__main__": args = parser.parse_args() # parse ssj + """ ssj_parser = Parser( corpus="ssj", infiles=[args.ssj_file] ) ssj_parser.parse() + """ + + # parse kres + kres_parser = Parser( + corpus="kres", + infiles=[args.kres_folder, args.kres_srl_folder] + ) + kres_parser.parse()