finished parse + tag toolchain -> TODO: tagger error

This commit is contained in:
voje
2019-02-18 08:49:04 +01:00
parent 9e9e1910a0
commit f251c912e3
20 changed files with 26489 additions and 54 deletions

11
tools/README.md Normal file
View File

@@ -0,0 +1,11 @@
# Usage
## parse.py
Reads corpora and creates various `tsv` output formats in `../data`.
Edit the file and run `$ python3 parse.py`.
## Tagging
`cd` into `srl-20131216/`.
Run `./scripts/learn_mod.sh` to train a model -> output is `srl-ger.model`.
Run `./scripts/parse_srl_only_mod.sh` to tag the `INPUT` file with the trained model -> output is `ger-eval.out`.

View File

@@ -1,43 +0,0 @@
from parser.parser import Parser
import os
from os.path import join
import re
import sys
import cProfile
def main():
    """Parse the sample corpora and convert the KRES files to CoNLL-2009 SRL.

    First parses the ssj500k sample (the result is currently not used any
    further). Then every ``F*.xml.parsed.xml`` file found in
    ``../data/kres_example/`` is parsed, each sentence is rendered with
    ``Parser.to_conll_2009_SRL`` and the concatenated text is written, UTF-8
    encoded, next to the input file as ``<input name>.tsv``.
    """
    # make sure you sanitize every input into unicode
    SSJ500K_2_1 = 27829  # number of sentences
    par = Parser()

    print("parsing ssj")
    ssj_file = "../data/ssj500k-sl.sample.xml"
    ssj_dict = par.parse_tei(ssj_file)
    # Disabled sanity check:
    # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    print("end parsing ssj")

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = "../data/kres_example/"

    # Raw string: "\." is an invalid escape sequence in a normal literal.
    # Compiled once instead of being re-parsed for every directory entry.
    kres_input_re = re.compile(r"^F.+\.xml\.parsed\.xml$")

    for kres_file in os.listdir(kres_dir):
        # since there will be processed files in the same folder
        if kres_input_re.match(kres_file) is None:
            continue

        print("Processing file: " + kres_file)
        res_dict = par.parse_tei(join(kres_dir, kres_file))
        # Join once instead of growing the string sentence by sentence.
        out_file = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )

        # The with-statement closes the file; no explicit close() is needed.
        with open(join(kres_dir, kres_file + ".tsv"), "wb") as fp:
            fp.write(out_file.encode("utf-8"))

    print("end parsing kres")


if __name__ == "__main__":
    # Profile the whole run, slowest functions (by own time) first.
    cProfile.run("main()", sort="tottime")
    # main()

45
tools/parse.py Normal file
View File

@@ -0,0 +1,45 @@
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
def main():
    """Convert every file in ``../data/kres_example/`` to a CoNLL-2009 SRL tsv.

    Each regular file in the input directory is parsed with
    ``Parser.parse_tei``; its sentences are rendered with
    ``Parser.to_conll_2009_SRL`` and written, UTF-8 encoded, to the sibling
    directory ``../data/kres_example_out/`` under the same file name with the
    last suffix replaced by ``.tsv``. The output directory is created if it
    does not exist yet.
    """
    # make sure you sanitize every input into unicode
    par = Parser()

    # The ssj500k step is currently disabled. Kept for reference:
    #
    #   SSJ500K_2_1 = 27829  # number of sentences
    #   print("parsing ssj")
    #   ssj_file = "../data/ssj500k-sl.sample.xml"
    #   ssj_dict = par.parse_tei(ssj_file)
    #   # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
    #   print("end parsing ssj")
    #
    # NOTE(review): parse_tei appears to call ``filepath.open(...)`` now, so
    # ``ssj_file`` would probably have to be a ``Path`` -- confirm before
    # re-enabling.

    print("parsing kres")
    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
    kres_dir = Path("../data/kres_example/").resolve()
    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
    kres_out_dir.mkdir(exist_ok=True)

    for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
        print("Processing file: " + str(kres_file))
        res_dict = par.parse_tei(kres_file)
        # Join once instead of growing the string sentence by sentence.
        out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )

        # with_suffix only swaps the last suffix:
        # F0006347.xml.parsed.xml -> F0006347.xml.parsed.tsv
        out_path = (kres_out_dir / kres_file.name).with_suffix(".tsv")
        # The with-statement closes the file; no explicit close() is needed.
        with out_path.open("wb") as fp:
            fp.write(out_str.encode("utf-8"))

    print("end parsing kres")


if __name__ == "__main__":
    # cProfile.run("main()", sort="tottime")
    main()

View File

@@ -2,5 +2,30 @@
Help conversion between english and slovenian MSD.
Hardcoded values from online documentation (html tables).
## Tagging
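Go to `./srl-20131216` and run `./scripts/learn_mod.sh` (training) or `./scripts/parse_srl_only_mod.sh` (tagging).
Change the paths (`CORPUS`, `INPUT`, `MODEL`) in the scripts to match your setup.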
## ERR
Running `./scripts/parse_srl_only_mod.sh` on the generated `.tsv` currently fails with this error (unresolved):
```bash
Executing: java -cp srl.jar:lib/liblinear-1.51-with-deps.jar:lib/anna.jar -Xmx2g se.lth.cs.srl.Parse ger ./../../data/kres_example_out/F0006347.xml.parsed.tsv ./srl-ger.model -nopi ger-eval.out
Loading pipeline from ./srl-ger.model
Writing corpus to ger-eval.out...
Opening reader for ./../../data/kres_example_out/F0006347.xml.parsed.tsv...
Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 33, Size: 32
at java.util.ArrayList.rangeCheck(ArrayList.java:657)
at java.util.ArrayList.get(ArrayList.java:433)
at se.lth.cs.srl.corpus.Sentence.buildDependencyTree(Sentence.java:61)
at se.lth.cs.srl.corpus.Sentence.newSRLOnlySentence(Sentence.java:182)
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.readNextSentence(SRLOnlyCoNLL09Reader.java:23)
at se.lth.cs.srl.io.AbstractCoNLL09Reader.open(AbstractCoNLL09Reader.java:43)
at se.lth.cs.srl.io.AbstractCoNLL09Reader.<init>(AbstractCoNLL09Reader.java:26)
at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.<init>(SRLOnlyCoNLL09Reader.java:11)
at se.lth.cs.srl.Parse.main(Parse.java:36)
root@9f69d66a0d39:/cjvt-srl-tagging/tools/srl-20131216#
```
## Sources
[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf

View File

@@ -34,7 +34,7 @@ class Parser:
guess_corpus = None # SSJ | KRES
res_dict = {}
with open(filepath, "rb") as fp:
with filepath.open("rb") as fp:
# remove namespaces
bstr = fp.read()
@@ -135,8 +135,10 @@ class Parser:
print(sentence_entry["links"])
"""
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
more_underscores = "".join(["\t_" for x in range(9)])
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15 n
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
t_id, # id
token[2], # form
token[3], # lemma
@@ -151,6 +153,7 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
more_underscores,
)
out_str += "\n"
# print(out_str)

View File

File diff suppressed because it is too large Load Diff

2
tools/srl-20131216/scripts/learn_mod.sh Normal file → Executable file
View File

@@ -11,7 +11,7 @@
##################################################
## (1) The following needs to be set appropriately
##################################################
CORPUS=./../../data/sl.train.mate
CORPUS=./../../data/mate_train/sl.all.mate
Lang="ger"
MODEL="srl-$Lang.model"

View File

@@ -11,9 +11,9 @@
##################################################
## (1) The following needs to be set appropriately
##################################################
INPUT=./../../data/sl.test.mate
Lang="ger"
MODEL="./srl-ger.model"
INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
Lang="spa"
MODEL="./srl-spa.model"
OUTPUT="${Lang}-eval.out"
##################################################

8
tools/srl-20131216/scripts/parse_srl_only_mod.sh Normal file → Executable file
View File

@@ -11,9 +11,11 @@
##################################################
## (1) The following needs to be set appropriately
##################################################
INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
Lang="spa"
MODEL="./srl-spa.model"
# INPUT=./../../data/mate_train/sl.test.mate
INPUT=./../../data/kres_example_out/F0006347.xml.parsed.tsv
Lang="ger"
MODEL="./srl-ger.model"
OUTPUT="${Lang}-eval.out"
##################################################

Binary file not shown.