forked from kristjan/cjvt-srl-tagging
finished parse + tag toolchain -> TODO: tagger error
This commit is contained in:
11
tools/README.md
Normal file
11
tools/README.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# Usage
|
||||
|
||||
## parse.py
|
||||
Reads corpora and creates various `tsv` output formats in `../data`.
|
||||
|
||||
Edit the file and run `$ python3 parse.py`.
|
||||
|
||||
## Tagging
|
||||
Change directory to `srl-20131216`.
|
||||
Run `./scripts/learn_mod.sh` to train a model -> output is `srl-ger.model`.
|
||||
Run `./scripts/parse_srl_only_mod.sh` to tag the input file with the trained model -> output is `ger-eval.out`.
|
||||
@@ -1,43 +0,0 @@
|
||||
from parser.parser import Parser
|
||||
import os
|
||||
from os.path import join
|
||||
import re
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
|
||||
def main():
    """Parse the SSJ sample and convert the KRES example files to tsv.

    The SSJ sample file is parsed with ``Parser.parse_tei`` (the result is
    not used further). Then every file in ``../data/kres_example/`` whose
    name matches ``F*.xml.parsed.xml`` is parsed, each sentence is rendered
    with ``Parser.to_conll_2009_SRL`` and the concatenated result is written
    next to the input as ``<input name>.tsv``.

    All paths are relative to the current working directory.
    """
    # make sure you sanitize every input into unicode

    # The constant SSJ500K_2_1 = 27829 (number of sentences) was only
    # referenced by the disabled sanity check below.
    par = Parser()

    print("parsing ssj")
    ssj_file = "../data/ssj500k-sl.sample.xml"
    par.parse_tei(ssj_file)
    # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    print("end parsing ssj")

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence. Compiled once, outside the loop.
    kres_name_re = re.compile(r"^F.+\.xml\.parsed\.xml$")

    for kres_file in os.listdir(kres_dir):

        # since there will be processed files in the same folder
        if kres_name_re.match(kres_file) is None:
            continue

        print("Processing file: " + kres_file)
        res_dict = par.parse_tei(join(kres_dir, kres_file))
        out_file = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
        # The context manager closes the file; no explicit close() needed.
        with open(join(kres_dir, kres_file + ".tsv"), "wb") as fp:
            fp.write(out_file.encode("utf-8"))
    print("end parsing kres")


if __name__ == "__main__":
    # Run under the profiler, sorted by total time spent in each function.
    cProfile.run("main()", sort="tottime")
    # main()
|
||||
45
tools/parse.py
Normal file
45
tools/parse.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from parser.parser import Parser
|
||||
import os
|
||||
from os.path import join, dirname
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
|
||||
def main():
    """Convert the KRES example files to tsv input for the SRL tagger.

    Every ``*.xml`` file in ``../data/kres_example/`` (relative to the
    current working directory) is parsed with ``Parser.parse_tei``. Each
    parsed sentence is rendered with ``Parser.to_conll_2009_SRL`` and the
    concatenated result is written, UTF-8 encoded, to
    ``../data/kres_example_out/`` under the input name with its last suffix
    replaced by ``.tsv``.
    """
    # make sure you sanitize every input into unicode

    par = Parser()

    # Parsing of the SSJ sample is currently disabled. It used to be:
    #   print("parsing ssj")
    #   ssj_file = "../data/ssj500k-sl.sample.xml"
    #   ssj_dict = par.parse_tei(ssj_file)
    #   # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    #   print("end parsing ssj")
    # (27829 was recorded as SSJ500K_2_1, the number of sentences.)

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = Path("../data/kres_example/").resolve()
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)

    # Only feed XML files to the TEI parser (the input folder may still hold
    # stray non-XML files, e.g. old ".tsv" output), and process them in a
    # deterministic order.
    kres_files = sorted(
        x for x in kres_dir.iterdir() if x.is_file() and x.suffix == ".xml"
    )

    for kres_file in kres_files:
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)
        out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
        # Write bytes so the line endings produced by the parser are kept
        # exactly as generated; the file is closed automatically.
        out_path = (kres_out_dir / kres_file.name).with_suffix(".tsv")
        out_path.write_bytes(out_str.encode("utf-8"))
    print("end parsing kres")


if __name__ == "__main__":
    # cProfile.run("main()", sort="tottime")
    main()
|
||||
@@ -2,5 +2,30 @@
|
||||
Help conversion between english and slovenian MSD.
|
||||
Hardcoded values from online documentation (html tables).
|
||||
|
||||
## Tagging
|
||||
Go to `./srl-20131216` and run `./scripts/{learn_mod.sh,parse_srl_only_mod.sh}`.
|
||||
Change paths in the scripts.
|
||||
|
||||
## ERR
|
||||
Running the tagger on the generated `tsv` file currently fails with the following error:
|
||||
```bash
|
||||
Executing: java -cp srl.jar:lib/liblinear-1.51-with-deps.jar:lib/anna.jar -Xmx2g se.lth.cs.srl.Parse ger ./../../data/kres_example_out/F0006347.xml.parsed.tsv ./srl-ger.model -nopi ger-eval.out
|
||||
Loading pipeline from ./srl-ger.model
|
||||
Writing corpus to ger-eval.out...
|
||||
Opening reader for ./../../data/kres_example_out/F0006347.xml.parsed.tsv...
|
||||
Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 33, Size: 32
|
||||
at java.util.ArrayList.rangeCheck(ArrayList.java:657)
|
||||
at java.util.ArrayList.get(ArrayList.java:433)
|
||||
at se.lth.cs.srl.corpus.Sentence.buildDependencyTree(Sentence.java:61)
|
||||
at se.lth.cs.srl.corpus.Sentence.newSRLOnlySentence(Sentence.java:182)
|
||||
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.readNextSentence(SRLOnlyCoNLL09Reader.java:23)
|
||||
at se.lth.cs.srl.io.AbstractCoNLL09Reader.open(AbstractCoNLL09Reader.java:43)
|
||||
at se.lth.cs.srl.io.AbstractCoNLL09Reader.<init>(AbstractCoNLL09Reader.java:26)
|
||||
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.<init>(SRLOnlyCoNLL09Reader.java:11)
|
||||
at se.lth.cs.srl.Parse.main(Parse.java:36)
|
||||
root@9f69d66a0d39:/cjvt-srl-tagging/tools/srl-20131216#
|
||||
|
||||
```
|
||||
|
||||
## Sources
|
||||
[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf
|
||||
@@ -34,7 +34,7 @@ class Parser:
|
||||
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
with open(filepath, "rb") as fp:
|
||||
with filepath.open("rb") as fp:
|
||||
# remove namespaces
|
||||
bstr = fp.read()
|
||||
|
||||
@@ -135,8 +135,10 @@ class Parser:
|
||||
print(sentence_entry["links"])
|
||||
"""
|
||||
|
||||
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
|
||||
more_underscores = "".join(["\t_" for x in range(9)])
|
||||
|
||||
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15 n
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
|
||||
t_id, # id
|
||||
token[2], # form
|
||||
token[3], # lemma
|
||||
@@ -151,6 +153,7 @@ class Parser:
|
||||
sentence_entry["links"][t_id][0], # pdeprel
|
||||
"Y" if fprd else "_", # fillpred
|
||||
token[3] if fprd else "_", # pred
|
||||
more_underscores,
|
||||
)
|
||||
out_str += "\n"
|
||||
# print(out_str)
|
||||
|
||||
0
tools/srl-20131216/ger-eval.out
Normal file
0
tools/srl-20131216/ger-eval.out
Normal file
21409
tools/srl-20131216/ger-eval.out.tmp
Normal file
21409
tools/srl-20131216/ger-eval.out.tmp
Normal file
File diff suppressed because it is too large
Load Diff
2
tools/srl-20131216/scripts/learn_mod.sh
Normal file → Executable file
2
tools/srl-20131216/scripts/learn_mod.sh
Normal file → Executable file
@@ -11,7 +11,7 @@
|
||||
##################################################
|
||||
## (1) The following needs to be set appropriately
|
||||
##################################################
|
||||
CORPUS=./../../data/sl.train.mate
|
||||
CORPUS=./../../data/mate_train/sl.all.mate
|
||||
Lang="ger"
|
||||
MODEL="srl-$Lang.model"
|
||||
|
||||
|
||||
@@ -11,9 +11,9 @@
|
||||
##################################################
|
||||
## (1) The following needs to be set appropriately
|
||||
##################################################
|
||||
INPUT=./../../data/sl.test.mate
|
||||
Lang="ger"
|
||||
MODEL="./srl-ger.model"
|
||||
INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
|
||||
Lang="spa"
|
||||
MODEL="./srl-spa.model"
|
||||
OUTPUT="${Lang}-eval.out"
|
||||
|
||||
##################################################
|
||||
|
||||
8
tools/srl-20131216/scripts/parse_srl_only_mod.sh
Normal file → Executable file
8
tools/srl-20131216/scripts/parse_srl_only_mod.sh
Normal file → Executable file
@@ -11,9 +11,11 @@
|
||||
##################################################
|
||||
## (1) The following needs to be set appropriately
|
||||
##################################################
|
||||
INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
|
||||
Lang="spa"
|
||||
MODEL="./srl-spa.model"
|
||||
|
||||
# INPUT=./../../data/mate_train/sl.test.mate
|
||||
INPUT=./../../data/kres_example_out/F0006347.xml.parsed.tsv
|
||||
Lang="ger"
|
||||
MODEL="./srl-ger.model"
|
||||
OUTPUT="${Lang}-eval.out"
|
||||
|
||||
##################################################
|
||||
|
||||
BIN
tools/srl-20131216/srl-ger.model
Normal file
BIN
tools/srl-20131216/srl-ger.model
Normal file
Binary file not shown.
Reference in New Issue
Block a user