finished parse + tag toolchain -> TODO: tagger error

2019-02-18 08:49:04 +01:00
parent 9e9e1910a0
commit f251c912e3
20 changed files with 26489 additions and 54 deletions
@@ -0,0 +1,11 @@
+# Usage
+
+## parse.py
+Reads corpora and creates various `tsv` output formats in `../data`.  
+
+Edit the file and run `$ python3 parse.py`.  
+
+## Tagging
+Change directory (`cd`) into `srl-20131216`.  
+Run `./scripts/learn_mod.sh` to train a model -> output is `srl-ger.model`.   
+Run `./scripts/parse_srl_only_mod.sh` to tag an input file with the trained model.  
@@ -1,43 +0,0 @@
-from parser.parser import Parser
-import os
-from os.path import join
-import re
-import sys
-import cProfile
-
-
-def main():
-    # make sure you sanitize every input into unicode
-
-    SSJ500K_2_1 = 27829  # number of sentences
-    par = Parser()
-
-    print("parsing ssj")
-    ssj_file = "../data/ssj500k-sl.sample.xml"
-    ssj_dict = par.parse_tei(ssj_file)
-    # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
-    print("end parsing ssj")
-
-    print("parsing kres")
-    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
-    kres_dir = "../data/kres_example/"
-    for kres_file in os.listdir(kres_dir):
-
-        # since there will be processed files in the same folder
-        if re.match("^F.+\.xml\.parsed\.xml$", kres_file) is None:
-            continue
-
-        print("Processing file: " + kres_file)
-        out_file = ""
-        res_dict = par.parse_tei(join(kres_dir, kres_file))
-        for _, sentence in res_dict.items():
-            out_file += par.to_conll_2009_SRL(sentence)
-        with open(join(kres_dir, kres_file + ".tsv"), "wb+") as fp:
-            fp.write(out_file.encode("utf-8"))
-            fp.close()
-    print("end parsing kres")
-
-
-if __name__ == "__main__":
-    cProfile.run("main()", sort="tottime")
-    # main()
@@ -0,0 +1,45 @@
+from parser.parser import Parser
+import os
+from os.path import join, dirname
+from pathlib import Path
+import re
+import sys
+import cProfile
+
+
+def main():
+    # make sure you sanitize every input into unicode
+
+    SSJ500K_2_1 = 27829  # number of sentences
+    par = Parser()
+
+    """
+    print("parsing ssj")
+    ssj_file = "../data/ssj500k-sl.sample.xml"
+    ssj_dict = par.parse_tei(ssj_file)
+    # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
+    print("end parsing ssj")
+    """
+
+    print("parsing kres")
+    # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
+    kres_dir = Path("../data/kres_example/").resolve()
+    kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
+    kres_out_dir.mkdir(exist_ok=True)
+
+    for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
+
+        print("Processing file: " + str(kres_file))
+        out_str = ""
+        res_dict = par.parse_tei(kres_file)
+        for _, sentence in res_dict.items():
+            out_str += par.to_conll_2009_SRL(sentence)
+        with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
+            fp.write(out_str.encode("utf-8"))
+            fp.close()
+    print("end parsing kres")
+
+
+if __name__ == "__main__":
+    # cProfile.run("main()", sort="tottime")
+    main()
@@ -2,5 +2,30 @@
 Help conversion between english and slovenian MSD.  
 Hardcoded values from online documentation (html tables).  

+## Tagging
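+Go to `./srl-20131216` and run `./scripts/learn_mod.sh` (training) or `./scripts/parse_srl_only_mod.sh` (tagging).  
+Adjust the paths (`CORPUS`, `INPUT`, `MODEL`) in the scripts before running them.  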
+
+## ERR
+Running the tagger on the generated Kres `.tsv` output currently fails with this error:
+```bash
+Executing: java -cp srl.jar:lib/liblinear-1.51-with-deps.jar:lib/anna.jar -Xmx2g se.lth.cs.srl.Parse ger ./../../data/kres_example_out/F0006347.xml.parsed.tsv ./srl-ger.model  -nopi ger-eval.out
+Loading pipeline from ./srl-ger.model
+Writing corpus to ger-eval.out...
+Opening reader for ./../../data/kres_example_out/F0006347.xml.parsed.tsv...
+Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 33, Size: 32
+	at java.util.ArrayList.rangeCheck(ArrayList.java:657)
+	at java.util.ArrayList.get(ArrayList.java:433)
+	at se.lth.cs.srl.corpus.Sentence.buildDependencyTree(Sentence.java:61)
+	at se.lth.cs.srl.corpus.Sentence.newSRLOnlySentence(Sentence.java:182)
+	at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.readNextSentence(SRLOnlyCoNLL09Reader.java:23)
+	at se.lth.cs.srl.io.AbstractCoNLL09Reader.open(AbstractCoNLL09Reader.java:43)
+	at se.lth.cs.srl.io.AbstractCoNLL09Reader.<init>(AbstractCoNLL09Reader.java:26)
+	at se.lth.cs.srl.io.SRLOnlyCoNLL09Reader.<init>(SRLOnlyCoNLL09Reader.java:11)
+	at se.lth.cs.srl.Parse.main(Parse.java:36)
+root@9f69d66a0d39:/cjvt-srl-tagging/tools/srl-20131216# 
+
+```
+
 ## Sources
 [1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf
@@ -34,7 +34,7 @@ class Parser:

        guess_corpus = None  # SSJ | KRES
        res_dict = {}
-        with open(filepath, "rb") as fp:
+        with filepath.open("rb") as fp:
            # remove namespaces
            bstr = fp.read()

@@ -135,8 +135,10 @@ class Parser:
            print(sentence_entry["links"])
            """

-            #            1   3   4   5   6   7   8   9  10  11  12  13  14  15
-            out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+            more_underscores = "".join(["\t_" for x in range(9)])
+
+            #            1   3   4   5   6   7   8   9  10  11  12  13  14  15 n
+            out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
                t_id,  # id
                token[2],  # form
                token[3],  # lemma
@@ -151,6 +153,7 @@ class Parser:
                sentence_entry["links"][t_id][0],  # pdeprel
                "Y" if fprd else "_",  # fillpred
                token[3] if fprd else "_",  # pred
+                more_underscores,
            )
        out_str += "\n"
        # print(out_str)
@@ -11,7 +11,7 @@
 ##################################################
 ## (1) The following needs to be set appropriately
 ##################################################
-CORPUS=./../../data/sl.train.mate
+CORPUS=./../../data/mate_train/sl.all.mate
 Lang="ger"
 MODEL="srl-$Lang.model"

@@ -11,9 +11,9 @@
 ##################################################
 ## (1) The following needs to be set appropriately
 ##################################################
-INPUT=./../../data/sl.test.mate
-Lang="ger"
-MODEL="./srl-ger.model"
+INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
+Lang="spa"
+MODEL="./srl-spa.model"
 OUTPUT="${Lang}-eval.out"

 ##################################################
@@ -11,9 +11,11 @@
 ##################################################
 ## (1) The following needs to be set appropriately
 ##################################################
-INPUT=~/corpora/conll09/spa/CoNLL2009-ST-evaluation-Spanish-SRLonly.txt
-Lang="spa"
-MODEL="./srl-spa.model"
+
+# INPUT=./../../data/mate_train/sl.test.mate
+INPUT=./../../data/kres_example_out/F0006347.xml.parsed.tsv
+Lang="ger"
+MODEL="./srl-ger.model"
 OUTPUT="${Lang}-eval.out"

 ##################################################