forked from kristjan/cjvt-srl-tagging
finished parse + tag toolchain -> TODO: tagger error
This commit is contained in:
@@ -2,5 +2,30 @@
|
||||
Helps with conversion between English and Slovenian MSD.
|
||||
Hardcoded values taken from the online documentation (HTML tables).
|
||||
|
||||
## Tagging
|
||||
Go to `./srl-29...` and run `./scripts/{learn...,parse...}`.
|
||||
Change paths in the scripts.
|
||||
|
||||
## ERR
|
||||
The parse step currently fails with the following error:
|
||||
```bash
|
||||
Executing: java -cp srl.jar:lib/liblinear-1.51-with-deps.jar:lib/anna.jar -Xmx2g se.lth.cs.srl.Parse ger ./../../data/kres_example_out/F0006347.xml.parsed.tsv ./srl-ger.model -nopi ger-eval.out
|
||||
Loading pipeline from ./srl-ger.model
|
||||
Writing corpus to ger-eval.out...
|
||||
Opening reader for ./../../data/kres_example_out/F0006347.xml.parsed.tsv...
|
||||
Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 33, Size: 32
|
||||
at java.util.ArrayList.rangeCheck(ArrayList.java:657)
|
||||
at java.util.ArrayList.get(ArrayList.java:433)
|
||||
at se.lth.cs.srl.corpus.Sentence.buildDependencyTree(Sentence.java:61)
|
||||
at se.lth.cs.srl.corpus.Sentence.newSRLOnlySentence(Sentence.java:182)
|
||||
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.readNextSentence(SRLOnlyCoNLL09Reader.java:23)
|
||||
at se.lth.cs.srl.io.AbstractCoNLL09Reader.open(AbstractCoNLL09Reader.java:43)
|
||||
at se.lth.cs.srl.io.AbstractCoNLL09Reader.<init>(AbstractCoNLL09Reader.java:26)
|
||||
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.<init>(SRLOnlyCoNLL09Reader.java:11)
|
||||
at se.lth.cs.srl.Parse.main(Parse.java:36)
|
||||
root@9f69d66a0d39:/cjvt-srl-tagging/tools/srl-20131216#
|
||||
|
||||
```
|
||||
|
||||
## Sources
|
||||
[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf
|
||||
@@ -34,7 +34,7 @@ class Parser:
|
||||
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
with open(filepath, "rb") as fp:
|
||||
with filepath.open("rb") as fp:
|
||||
# remove namespaces
|
||||
bstr = fp.read()
|
||||
|
||||
@@ -135,8 +135,10 @@ class Parser:
|
||||
print(sentence_entry["links"])
|
||||
"""
|
||||
|
||||
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
|
||||
more_underscores = "".join(["\t_" for x in range(9)])
|
||||
|
||||
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15 n
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
|
||||
t_id, # id
|
||||
token[2], # form
|
||||
token[3], # lemma
|
||||
@@ -151,6 +153,7 @@ class Parser:
|
||||
sentence_entry["links"][t_id][0], # pdeprel
|
||||
"Y" if fprd else "_", # fillpred
|
||||
token[3] if fprd else "_", # pred
|
||||
more_underscores,
|
||||
)
|
||||
out_str += "\n"
|
||||
# print(out_str)
|
||||
|
||||
Reference in New Issue
Block a user