From 55c07f88ca798c45eb3a08f8d25633d1849ec1db Mon Sep 17 00:00:00 2001
From: voje
Date: Mon, 11 Mar 2019 00:17:43 +0100
Subject: [PATCH] bug: matching srl_json files with kres_xml files

---
 .gitignore                                         |   2 +-
 README.md                                          |   2 +-
 src/pkg/corpusparser/corpusparser/Parser.py        | 211 ++++++++++--------
 .../__pycache__/Parser.cpython-35.pyc              | Bin 4322 -> 0 bytes
 .../__pycache__/Sentence.cpython-35.pyc            | Bin 448 -> 0 bytes
 .../__pycache__/__init__.cpython-35.pyc            | Bin 249 -> 0 bytes
 src/preflight/main_parse.py                        |   9 +
 7 files changed, 131 insertions(+), 93 deletions(-)
 delete mode 100644 src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc
 delete mode 100644 src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc
 delete mode 100644 src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc

diff --git a/.gitignore b/.gitignore
index ecb8742..c57e05a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 data/samples/
-*/pycache/
+*/__pycache__/
 *egg-info/
diff --git a/README.md b/README.md
index a956d10..c35bf79 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# cjvt-vezljivost
+# cjvt-valency
 
 ## Components
diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py
index dc93554..9fb0e91 100644
--- a/src/pkg/corpusparser/corpusparser/Parser.py
+++ b/src/pkg/corpusparser/corpusparser/Parser.py
@@ -1,6 +1,7 @@
 from corpusparser import Sentence
 from pathlib import Path
 import re
+import json
 from lxml import etree
 
 # Read input file(.xml, .json; kres or ssj500k).
@@ -26,15 +27,8 @@ class Parser():
         if self.corpus == "kres":
             return self.parse_jos_links_kres(sent_el)
         else:
-            return self.parse_jos_links_ssj(sent_el)
-
-    def parse_ssj_target_arg(self, text):
-        # from: 0, to: 6
-        #
-        # from: 6, to: 7
-        #
-        lst = [x.split(".")[-1] for x in text.split(" ")]
-        return [int(x[1:] if x[0] == "t" else 0) for x in lst]
+            # 'syntax' is the linkGrp we're looking for
+            return self.parse_any_links_ssj(sent_el, "syntax")
 
     def parse_jos_links_kres(self, sent_el):
         lgrps = sent_el.findall(".//links")
         if len(lgrps) < 1:
@@ -49,103 +43,138 @@ class Parser():
             }]
         return res_links
 
-    def parse_jos_links_ssj(self, sent_el):
+    def parse_ssj_target_arg(self, text):
+        # from: 0, to: 6   e.g. target="#ssj1.1.1 #ssj1.1.1.t6"
+        # from: 6, to: 7   e.g. target="#ssj1.1.1.t6 #ssj1.1.1.t7"
+        lst = [x.split(".")[-1] for x in text.split(" ")]
+        return [int(x[1:] if x[0] == "t" else 0) for x in lst]
+
+    def parse_any_links_ssj(self, sent_el, links_type):
         lgrps = sent_el.findall(".//linkGrp")
-        if len(lgrps) < 1:
-            # print(etree.tostring(sent_el))
-            raise IOError("Can't find links.")
+        links = [x for x in lgrps if x.get("type") == links_type][0]
         res_links = []
-        for link in lgrps[0]:
-            print(link)
+        for link in links:
             tar = self.parse_ssj_target_arg(link.get("target"))
             res_links += [{
                 "from": tar[0],
                 "afun": link.get("ana").split(":")[1],
-                "to": [1],
+                "to": tar[1],
             }]
         return res_links
 
+    def parse_srl_links(self, sent_el, sent_srl_dict=None):
+        if self.corpus == "kres":
+            return self.parse_srl_links_kres(sent_el, sent_srl_dict)
+        else:
+            return self.parse_any_links_ssj(sent_el, "SRL")
+
+    def parse_srl_links_kres(self, sent_el, sent_srl_dict):
+        print(sent_srl_dict)
+        # find the corresponding srl links in the json dict
+        return "TODO"
+
     def parse(self):
         if self.corpus == "kres":
-            print("parse kres: TODO")
+            for xml_file in self.kres_folder.iterdir():
+                self.parse_xml_file(xml_file)
+                break  # TODO dev break
         else:
             self.parse_xml_file(self.ssj_file)
 
-    def parse_xml_file(self, filepath):
-        res_dict = {}
-        with filepath.open("rb") as fp:
+    def parse_xml_file(self, xml_file):
+        srl_dict = {}
+        if self.corpus == "kres":
+            # in case of kres, read the SRL links from a separate json file
+            file_id = xml_file.name.split(".")[0]
+            json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json")
+            with json_file.open("r") as fp:
+                srl_dict = json.loads(fp.read())
+
+        with xml_file.open("rb") as fp:
             # remove namespaces
             bstr = fp.read()
-        utf8str = bstr.decode("utf-8")
-        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
-        utf8str = re.sub(' xml:', ' ', utf8str)
-
-        root = etree.XML(utf8str.encode("utf-8"))
-
-        divs = []  # in ssj, there are divs, in Kres, there are separate files
-        if self.corpus == "kres":
-            divs = [root]
-        else:
-            divs = root.findall(".//div")
-
-        # parse divs
-        for div in divs:
-            f_id = div.get("id")
-
-            # parse paragraphs
-            for p in div.findall(".//p"):
-                p_id = p.get("id").split(".")[-1]
-
-                # parse sentences
-                for s in p.findall(".//s"):
-                    s_id = s.get("id").split(".")[-1]
-                    sentence_text = ""
-                    sentence_tokens = []
-
-                    # parse tokens
-                    for el in s.iter():
-                        if el.tag in self.W_TAGS:
-                            el_id = el.get("id").split(".")[-1]
-                            if el_id[0] == 't':
-                                el_id = el_id[1:]  # ssj W_TAG ids start with t
-                            sentence_text += el.text
-                            sentence_tokens += [{
-                                "word": True,
-                                "tid": int(el_id),
-                                "text": el.text,
-                                "lemma": el.get("lemma"),
-                                "msd": (el.get("msd") if self.corpus == "kres"
-                                        else el.get("ana").split(":")[-1]),
-                            }]
-                        elif el.tag in self.C_TAGS:
-                            # only Kres' C_TAGS have ids
-                            el_id = el.get("id") or "none"
-                            el_id = el_id.split(".")[-1]
-                            sentence_text += el.text
-                            sentence_tokens += [{
-                                "word": False,
-                                "tid": el_id,
-                                "text": el.text,
-                            }]
-                        elif el.tag in self.S_TAGS:
-                            # Kres' doesn't contain .text
-                            sentence_text += " "
-                        else:
-                            # pass links and linkGroups
-                            pass
-                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
-
-                    # make a generator instead of holding the whole corpus in memory
-                    if sentence_id in res_dict:
-                        raise KeyError("duplicated id: {}".format(sentence_id))
-                    res_dict[sentence_id] = {
-                        "sid": sentence_id,
-                        "text": sentence_text,
-                        "tokens": sentence_tokens,
-                        "jos_links": self.parse_jos_links(s)
-                    }
-
-                    print(res_dict[sentence_id])
-                    break
+            utf8str = bstr.decode("utf-8")
+            utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+            utf8str = re.sub(' xml:', ' ', utf8str)
+
+            root = etree.XML(utf8str.encode("utf-8"))
+
+            divs = []  # in ssj, there are divs; in kres, there are separate files
+            if self.corpus == "kres":
+                divs = [root]
+            else:
+                divs = root.findall(".//div")
+
+            res_dict = {}  # TODO: try making an iterator instead
+
+            # parse divs
+            for div in divs:
+                f_id = div.get("id")
+
+                # parse paragraphs
+                for p in div.findall(".//p"):
+                    p_id = p.get("id").split(".")[-1]
+
+                    # parse sentences
+                    for s in p.findall(".//s"):
+                        s_id = s.get("id").split(".")[-1]
+                        sentence_text = ""
+                        sentence_tokens = []
+
+                        # parse tokens
+                        for el in s.iter():
+                            if el.tag in self.W_TAGS:
+                                el_id = el.get("id").split(".")[-1]
+                                if el_id[0] == 't':
+                                    el_id = el_id[1:]  # ssj W_TAG ids start with t
+                                sentence_text += el.text
+                                sentence_tokens += [{
+                                    "word": True,
+                                    "tid": int(el_id),
+                                    "text": el.text,
+                                    "lemma": el.get("lemma"),
+                                    "msd": (el.get("msd") if self.corpus == "kres"
+                                            else el.get("ana").split(":")[-1]),
+                                }]
+                            elif el.tag in self.C_TAGS:
+                                # only Kres' C_TAGS have ids
+                                el_id = el.get("id") or "none"
+                                el_id = el_id.split(".")[-1]
+                                sentence_text += el.text
+                                sentence_tokens += [{
+                                    "word": False,
+                                    "tid": el_id,
+                                    "text": el.text,
+                                }]
+                            elif el.tag in self.S_TAGS:
+                                # Kres' S_TAGS don't contain .text
+                                sentence_text += " "
+                            else:
+                                # pass links and linkGroups
+                                pass
+                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+
+                        # TODO: make a generator instead of holding the whole corpus in memory
+                        # TODO -- match ids
+                        print("---")
+                        print(sorted(srl_dict.keys(), key=lambda x: x.split(".")[1])[:100])
+                        print(sentence_id)
+                        print(srl_dict.get(str(sentence_id)))
+                        print("---")
+                        if sentence_id in res_dict:
+                            raise KeyError("duplicated id: {}".format(sentence_id))
+                        res_dict[sentence_id] = {
+                            "sid": sentence_id,
+                            "text": sentence_text,
+                            "tokens": sentence_tokens,
+                            "jos_links": self.parse_jos_links(s),
+                            "srl_links": self.parse_srl_links(s, srl_dict.get(sentence_id)),
+                        }
+
+                        print(res_dict[sentence_id])
+                        print("------------------------------------------------- END")
+                        return  # TODO dev break
         return res_dict
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc
deleted file mode 100644
index 98e401c96437b8d431d85e4a9f50989bd02061b5..0000000000000000000000000000000000000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc and /dev/null differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc
deleted file mode 100644
index c0ae9f75396887f2b01bd1afd2afe7d6e563d333..0000000000000000000000000000000000000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc and /dev/null differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc
deleted file mode 100644
index a365c8e295dbed9a0472980368c09666df7d3ef2..0000000000000000000000000000000000000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc and /dev/null differ
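
Notes on the parsing logic (illustrative sketches, not part of the diff):

`parse_ssj_target_arg` decodes a link's `target` attribute into a `[from, to]` pair of token ids; a component without a trailing `.tN` part (the sentence root) maps to 0. A minimal standalone version, with assumed example id strings:

    def parse_ssj_target_arg(text):
        # "#ssj1.1.1 #ssj1.1.1.t6"    -> [0, 6]  (sentence root -> token 6)
        # "#ssj1.1.1.t6 #ssj1.1.1.t7" -> [6, 7]  (token 6 -> token 7)
        lst = [x.split(".")[-1] for x in text.split(" ")]
        return [int(x[1:]) if x[0] == "t" else 0 for x in lst]

    assert parse_ssj_target_arg("#ssj1.1.1 #ssj1.1.1.t6") == [0, 6]
    assert parse_ssj_target_arg("#ssj1.1.1.t6 #ssj1.1.1.t7") == [6, 7]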
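
The kres xml/json matching introduced in `parse_xml_file` pairs each xml file with a json file of the same id in `kres_srl_folder`. A sketch of the lookup, assuming a file name like "F0006347.xml" (the exact kres naming scheme is an assumption):

    from pathlib import Path

    def srl_json_for(xml_file, kres_srl_folder):
        file_id = xml_file.name.split(".")[0]  # "F0006347.xml" -> "F0006347"
        return kres_srl_folder / Path(file_id).with_suffix(".srl.json")

    # srl_json_for(Path("F0006347.xml"), Path("data/kres_srl"))
    # -> PosixPath('data/kres_srl/F0006347.srl.json')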
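
The two `re.sub` calls in `parse_xml_file` strip the default namespace and the `xml:` prefix from the raw document before it is handed to lxml, so the lookups (`.//div`, `.//p`, `.//s`) need no namespace handling. On a toy input (the TEI namespace URL is an assumption):

    import re

    s = '<TEI xmlns="http://www.tei-c.org/ns/1.0"><w xml:id="ssj1.1.1.t1">Beseda</w></TEI>'
    s = re.sub('\\sxmlns="[^"]+"', '', s, count=1)  # drop the default namespace declaration
    s = re.sub(' xml:', ' ', s)                     # xml:id -> id
    # s == '<TEI><w id="ssj1.1.1.t1">Beseda</w></TEI>'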