This commit is contained in:
voje
2019-02-19 08:07:03 +01:00
parent f251c912e3
commit 142ad22ba3
18 changed files with 10831 additions and 4931 deletions

10
tools/Makefile Normal file
View File

@@ -0,0 +1,10 @@
# Pipeline driver: parse TEI input to CoNLL-2009, then run the SRL taggers.
# None of these targets produce a file named after themselves, so mark them phony.
.PHONY: all parse tag_srl tag_full

# Original depended on a nonexistent `tag` target (renamed to tag_srl/tag_full);
# run the whole pipeline by default.
all: parse tag_srl tag_full

# Convert the corpus to CoNLL-2009 tables.
parse:
	python3 parse.py

# SRL-only tagging pass.
tag_srl:
	cd srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -

# Full pipeline (tokenization + tagging).
# Fixed: missing `;` after `cd srl-20131216` made the script path an argument to cd.
tag_full:
	cd srl-20131216; ./scripts/parser_full_mod.sh; cd -

View File

@@ -24,19 +24,42 @@ def main():
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = Path("../data/kres_example/").resolve()
kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
kres_out_dir.mkdir(exist_ok=True)
kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
kres_full_out_dir.mkdir(exist_ok=True)
for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
print("Processing file: " + str(kres_file))
out_str = ""
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
kres_full_out_str = ""
for _, sentence in res_dict.items():
out_str += par.to_conll_2009_SRL(sentence)
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(out_str.encode("utf-8"))
fp.close()
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
kres_full_out_str += par.to_conll_2009_full(sentence)
# for SRL tagging
try:
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
except:
pass
try:
# for full tokenization
with (kres_full_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_full_out_str.encode("utf-8"))
fp.close()
except:
pass
print("end parsing kres")

View File

@@ -109,7 +109,7 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry):
def to_conll_2009_SRL(self, sentence_entry, napreds=100):
def fillpred(pos, feat):
if pos == "V" and "main" in feat.split("|"):
@@ -135,10 +135,10 @@ class Parser:
print(sentence_entry["links"])
"""
more_underscores = "".join(["\t_" for x in range(9)])
apreds = "".join(["\t_" for x in range(napreds)])
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15 n
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
# format: 14 + apreds
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\n".format(
t_id, # id
token[2], # form
token[3], # lemma
@@ -153,7 +153,7 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
more_underscores,
apreds,
)
out_str += "\n"
# print(out_str)

View File

@@ -0,0 +1,60 @@
#!/bin/sh
# Wrapper around the mate-tools/LTH SRL CompletePipeline: assembles a java
# command line from the model paths below and runs it on a pre-tokenized
# CoNLL-format input file.
## There are three sets of options that need, may need to, and could be changed.
## (1) deals with input and output. You have to set these (in particular, you need to provide models)
## (2) deals with the jvm parameters and may need to be changed
## (3) deals with the behaviour of the system
## For further information on switches, see the source code, or run
## java -cp srl.jar se.lth.cs.srl.Parse --help
##################################################
## (1) The following needs to be set appropriately
##################################################
#INPUT="/home/anders/corpora/conll09/eng/CoNLL2009-evaluation-English-SRLonly.txt" #evaluation corpus
# INPUT=/home/anders/corpora/conll09/chi/CoNLL2009-ST-evaluation-Chinese-SRLonly.txt
# NOTE(review): INPUT is hard-coded to one example file from the kres full-output
# directory produced by parse.py — presumably a placeholder; confirm before batch use.
INPUT=./../../data/kres_example_full_out/F0006347.xml.parsed.tsv
# NOTE(review): LANG is "ger" (German) but the tagger/parser/SRL models below are
# all Chinese ("chi") — confirm this mismatch is intentional.
LANG="ger"
##TOKENIZER_MODEL="models/eng/EnglishTok.bin.gz" #This is not used here anyway. The input is assumed to be segmented/tokenized already.
##LEMMATIZER_MODEL="models/chi/lemma-eng.model"
POS_MODEL="models/chi/tag-chn.model"
#MORPH_MODEL="models/ger/morph-ger.model" #Morphological tagger is not applicable to English. Fix the path and uncomment if you are running german.
PARSER_MODEL="models/chi/prs-chn.model"
SRL_MODEL="models/chi/srl-chn.model"
# Output file is named after the language code, e.g. "ger.out".
OUTPUT="$LANG.out"
##################################################
## (2) These ones may need to be changed
##################################################
JAVA="java" #Edit this if you want to use a specific JRE.
MEM="4g" #Memory for the JVM, might need to be increased for large corpora.
CP="srl.jar:lib/anna.jar:lib/liblinear-1.51-with-deps.jar:lib/opennlp-tools-1.4.3.jar:lib/maxent-2.5.2.jar:lib/trove.jar:lib/seg.jar"
JVM_ARGS="-cp $CP -Xmx$MEM"
##################################################
## (3) The following changes the behaviour of the system
##################################################
#RERANKER="-reranker" #Uncomment this if you want to use a reranker too. The model is assumed to contain a reranker. While training, the corresponding parameter has to be provided.
#NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step.
##################################################
# $NOPI and $RERANKER expand to nothing unless uncommented above.
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.CompletePipeline $LANG $NOPI $RERANKER -tagger $POS_MODEL -parser $PARSER_MODEL -srl $SRL_MODEL -test $INPUT -out $OUTPUT"
# Optional stages are appended only when their model variables are set
# (all three are commented out above by default).
if [ "$TOKENIZER_MODEL" != "" ]; then
CMD="$CMD -token $TOKENIZER_MODEL"
fi
if [ "$LEMMATIZER_MODEL" != "" ]; then
CMD="$CMD -lemma $LEMMATIZER_MODEL"
fi
if [ "$MORPH_MODEL" != "" ]; then
CMD="$CMD -morph $MORPH_MODEL"
fi
echo "Executing: $CMD"
# Unquoted on purpose: relies on word splitting to turn $CMD into argv.
# NOTE(review): this breaks if any model path contains whitespace.
$CMD

Binary file not shown.