From f9f7fae76a52dc52efb101ad61d92dd61a35b32b Mon Sep 17 00:00:00 2001 From: voje Date: Wed, 13 Feb 2019 16:49:45 +0100 Subject: [PATCH] added formatter for mate-tools parse_full.sh --- tools/main.py | 6 +++++- .../parser/__pycache__/__init__.cpython-37.pyc | Bin 0 -> 129 bytes tools/parser/__pycache__/msdmap.cpython-37.pyc | Bin 0 -> 9043 bytes tools/parser/__pycache__/parser.cpython-37.pyc | Bin 0 -> 3198 bytes tools/parser/parser.py | 16 +++++++++++++++- 5 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 tools/parser/__pycache__/__init__.cpython-37.pyc create mode 100644 tools/parser/__pycache__/msdmap.cpython-37.pyc create mode 100644 tools/parser/__pycache__/parser.cpython-37.pyc diff --git a/tools/main.py b/tools/main.py index 5fb8fc8..eeb21e6 100644 --- a/tools/main.py +++ b/tools/main.py @@ -17,7 +17,11 @@ if __name__ == "__main__": # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" kres_dir = "../data/kres_example/" for kres_file in os.listdir(kres_dir): + out_file = "" res_dict = parser.parse_tei(join(kres_dir, kres_file)) for _, sentence in res_dict.items(): - parser.to_conll09(sentence) + out_file += parser.to_conll_2009_full(sentence) + with open(join(kres_dir, kres_file + ".tsv"), "wb+") as fp: + fp.write(out_file.encode("utf-8")) + fp.close() print("end parsing kres") diff --git a/tools/parser/__pycache__/__init__.cpython-37.pyc b/tools/parser/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b49a1ebc35c5467384b158794c4ef7f27d2ab9d GIT binary patch literal 129 zcmZ?b<>g`kf>K!yVl7qb9~6oz01O-8?!3`HPe1o2BtKRK(cM7Ovo zN4F#~Jv}ooUB4thKc`r~AhD=8wMaicJ~J<~BtBlRpz;=nO>TZlX-=vg$c$njW&i*J CV;vs= literal 0 HcmV?d00001 diff --git a/tools/parser/__pycache__/msdmap.cpython-37.pyc b/tools/parser/__pycache__/msdmap.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82d62dc9145da39670dd9193fe6bd0c936376dab GIT binary patch literal 9043 zcmd^FTbJ9^6_$LFHJ(YJgj=AOK)D&13E@)Ol7@r~q!btefdoW?(Rgc&#Ub+r%Vv44Um@+Zz~AG-41C!V^$ePl}(Gb=x!p53j z2bERp+^t?Us0G#~>zjH|y=q^wZR`94XDo6~S*=Eg+nIeTe07w+>7!f(mS>aYRfyZY zYh4#^{bQ|4HchH@*pYEc&>BsaF`YHX44$wVxFa4bEqx>Q}hdZntn;YqF>W9beMiaN9b8PO2_CqI!>$f zJiS0C=(qGD{f=Ium+2K+qgUx9y+*IoDSCt6q|(n8l zfHtU0?^8&d)T4;{6jMS2I!}sH8q$cy^Z{LTDCuf@RX2~(AW6evn8d1@wL18oMbiMYS7wQ8Qz52zmu)$J_h zExi~I110tV{UW0=LXw{HH?Ljw$G)~-^_GsUud84j`bNduEVRTzaWERIYgfO!62ytZ z=+W-$K^oEsR=m}6y@s#+^|Tji+t>ApKM2z~vpLSgZaCC7>Dmj4>Z{wh4ZzleG@zJa z>*{`fG!Js3V8~t8cBCHU$|=b01Q9zGsE)3W75i8H2z_dFnFk@Zt7~s;4Fa`K)aVNN z>H#zlgDHuU@2@ z(?IPPkb07T9Ekmr3vCZ{{j|gw z#vP+z`4n~SYA-ZIi9#U?D8(~`Wpo*$OcxUbeQXfAIu1$ncukJ1jcHy8I1{7;zEeh@ zu9F&dO}y!Pk_PLhciKsVP7;&0AJ@*wK&9+fgHT7I9OkHFD1_?gaD}X|R#6zLc^Glq z1q!B}k04^WG<7ZJA7i$Mujy-UMpXD8$dB9D> zxI2n`rJe^!2sKFc5+3{zCN6jxbvYt)!7d1KK?#w$U>AhApoGX=urngJ3@#`kGZ*B@ z-Cd9)UxbB5r_9(qGpl7hdjtiD*R{SM@_g1eA)m4K3aV#EdIs_z>!xxD{8pQPMG;l$MIFI0MOCTwL)Mp5>BDe9IGiNm5)ka<4Vr&4ok^c>sZ z`nffF4tN64%u$Pc9S3F)fz1%~>P9+D2JxmZ5ushPhvSv{GpKY{gRTmKX=LowfWrBK zEPoZ@3OE&@K!o!e7Grs;m1jVqKr*OM*$g>U&ECb#Lo%i9Gtil`A9Ho~*lP>5s}nk7j|03}J&m2#UfSCu4%X!$iJnQlq_E5_ zY_RTPA#YDQT2ExfHRM&US+5H4Zm#7+03I*hWD{9uGnvsMbux5K2<(pCi;l%8a*P@7yunoA`y82@tV}G+IdL{d!;#_u z-zer-Ik)1a6?!LtLhm^Ec}s1a_At~HdZz|>Ag;_)(0M$|C9^)c+$ZDSoNICKQYR^R zS8xUI;>?V*gx}9y^urS+1&z|IAn)Z{J_@M~)YvzVWnT=>>iR~=2R_<9WcJ0zrz)K_ zknf;}cs4?@io%Q@&rxBY_io1K`?+Fn0C{zl^!?b>J_wi@LfYj-Vdgv`D^C?JpE+@e zat4&(EvAVv@VPi*MSXxGmozb=N|KT$N|ITcC`n41C`o2%VqS92%5pEVG~x15oG&pa z0hz}fyO3i)9vWlHV=x^=yrh~NPGah#GgHN>*>HfvW(d(EqjoQZO9&HqJh>w5HbXN^ zJP1l8ID9C{=VoTsI*Z3yFkHa3G%do?nJqj!g+e4V-b<3gdr30my(B5Tmn1XZ=OyQP zFCh%?Do30{2TN_gi)W^7?aDy~(sIrm%;Q6|Bu%rKSgHz?s|i}j0n+~OXU{~^wC^np z!vPlL6zOc1k{0k*rI+B;=RNcWD(A60Rr1Rg9L;1qU4$d9rag0@i`up%8lZhA4^NOAyNeS@iC#Mq;-F$YOVAfW%1& zkj2T4fH~e&326A1*GbpDEiiBn7L+vQ8L6TdIq|gl5UB0PU7qUsh7ZqLHt(#1Xyad& z`i$}4KXkCOIUX*jDq0@;-7YTz4-S(gN)Ph7gkNGD?8`gPmBE&F+ikppX}7D4OYgL~ ztXAxu{#>{6fBl)TfCpNt@qp-_3qaG3G3LiT_-Ca(iY>Nw5Q3 z@Tv26X46}IbK&94rUMsi+cknsw%AnLuJCo$T+i&Q;C21b-j>(){5!p`%5#39KC{2J zb{(TsDR(@D_u(%_o#WH{t!qS6rdK zMsWoN21J1x89&@0Xt@WpV9^6w?qph~f0J7Z^kl*v%j z{npjW#mY9Dx8M50`po*w-l*We8ZNtB%kKV!C04CVHit!JySh=~x8}~;bqu=e{I#w_ z#q2*3a06zLV$yrL=vv#>xw|exlhv!L8uWfHny*&>QaNAa)+*oI%j&Nz+p^VP@Ke#1 zmB-}Rh%fmZKH~^cl#}v(^KcZVL+o3Qtbh}!cI?T684LZC+8ex1N8VXEjKkn4J;E1L z3@iH+lue_P<%N4)^M^wvdN`Q>2A!fLbvK3~t#svlo`Qob1~{`&z4kl!X(q zc3U^w?S4X|2-i#P_IZ90WNOqG(1kt;3+f-Z$^${J&$pv+ zfhxS~Xt%xkDc;g6&pA1WC;k4UabmDc%l*k(%MnTSPsa0Vj?d5AY*(uO&iCriD0b7W Z-NUByN2CJ?!@W@d1m{^!z)rPk{|~4V?JxiU literal 0 HcmV?d00001 diff --git a/tools/parser/__pycache__/parser.cpython-37.pyc b/tools/parser/__pycache__/parser.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..732b7528f05f6067bf33be15eb24a2c33884bba2 GIT binary patch literal 3198 zcmbtWL312O74GionVlW&u69?l9VNyHg;cUdEQu5lV~mT$G880{iIqZTh}l|i&u-S5 zomusag1nx&Aah9-pum-)cB?qRjWfT3;y71Mf;$J7tKfURvSdd+bj7bieod zy|3r(YSm--{r#%?@cna){goz9e*%MVp(S@f6jMB8L+*1~r5% zHmIWa4{JJ4kg9Y+~fRm#AT9 zRPdh^yj$=sUyQn4PHOlqpjFXspd~+mjf&-bz;cnY3^pE!L-EV?+{&y8pR!zLaw3$N z^4w0X`%+m-X0TE@KbORrNChi1dum~3Pn^u2lt>%;PM_r^RnDA=J7qhLPL-GQ%<1#I z6xGxmjVJI3nx#X&|EH|9!4CP|Kjv=cPCQixzYP9wF)OR;lpElx?uPf6MDx{eCsA?Kp>Af0EVDyw%7W zs zQx5wt%1O~m|3R#Pt7 zxBF!G$u?6jP8PCqHkVa#Ud^nT3G5|wJzZ2ULDHP^1O8JEKKwbhUFg@+b6MqpN0hA> ze`6&kY9bwmkTjazI^BQ%`M;B+WOwuSzTLGgOA4& zJyLr0=%4tRS)yr)f~Q)d_w6tk4m*W4Oq6eRqRwM})t6Bm1=@yq>^{s_&41ibd!umB z?WBQP9;mC!5A&Avtz@7InFjl5;iU0S5G92j4x*i;b)k^)D2NKFgN`a36?9`26ha4h zXYZb-D)+5B@4Z_%L1aX7CwPz)rQRS?oiHq{evlS+f-TcR4$?puR@&(o&UX8Qx4yHb z8{~sEh_LSs!zrAtXH!VbOx7Mg%w%M`n=t zGQq&FCP9?Kv!G4E@#jycvnYum^z9%d)wvVdK>dN~Re1LJwYBd3kJHPE4ll#&{vhhF zrEwf4YsM9Q+^vot6y=$7X)qv&NjD?#u?A4?S&g2B z>XN9g*SBXi$#tlKFJ7CaQ0sauwqE2zO7)4iG_QY``XiEKj}4sG1+0EXr*T#X_BOFD*-5+pry1nW$Y@r|N@~H}4$YaZK-k4; z4QOrcUc&iX>Ku(zWX%D-88Qo)dVaG@a6>=p3I4Xw62d>lp&&L3-Zp2{623KxQN+EV zlYZvt3}S>M9Vkt2n8!Cls-B5o&PtY0RfydMvu!X9bd7%jJ`oj@*7 zT0hWVqt_%}8y2N_FKrXl+vYyQP@_P5ZMJ6nv4b;9g}XJzJkRy?STBCn6;#Si&$?Hxj|%u z$n$VkG@p0Bw-<&Js3mm}x`%4Gb;##BeAC|suT}A7h>sTY1)wj0<5y>w+A77rGcsqW zKM#Ta0+AIWbki7MG|@2&J!6TniO{uhuMK0h7Y5fU5fZ96g4YU6aJcKao9?{3{x45; B>bC#@ literal 0 HcmV?d00001 diff --git a/tools/parser/parser.py b/tools/parser/parser.py index dacc8c5..2409089 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -90,6 +90,7 @@ def parse_tei(filepath): parse_links(s) if guess_corpus == "KRES" else None ) } + fp.close() return res_dict @@ -108,7 +109,7 @@ def parse_links(s_el): return res_links -def to_conll09(sentence_entry): +def to_conll_2009_SRL(sentence_entry): def fillpred(pos, feat): if False: @@ -154,3 +155,16 @@ def to_conll09(sentence_entry): out_str += "\n" print(out_str) return out_str + + +def to_conll_2009_full(sentence_entry): + out_str = "" + for token in sentence_entry["tokens"]: + t_id = token[1] + # 1 3 + out_str += "{}\t{}\n".format( + t_id, # id + token[2], # form + ) + out_str += "\n" + return out_str \ No newline at end of file