This commit is contained in:
voje
2019-02-19 08:07:03 +01:00
parent f251c912e3
commit 142ad22ba3
18 changed files with 10831 additions and 4931 deletions

10
tools/Makefile Normal file
View File

@@ -0,0 +1,10 @@
# Pipeline driver: parse TEI input to CoNLL-2009, then run the SRL taggers.
# None of these targets produce a file named after themselves, so mark them phony.
.PHONY: all parse tag_srl tag_full

# Original depended on a nonexistent `tag` target (renamed to tag_srl/tag_full);
# run the whole pipeline by default.
all: parse tag_srl tag_full

# Convert the corpus to CoNLL-2009 tables.
parse:
	python3 parse.py

# SRL-only tagging pass.
tag_srl:
	cd srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -

# Full pipeline (tokenization + tagging).
# Fixed: missing `;` after `cd srl-20131216` made the script path an argument to cd.
tag_full:
	cd srl-20131216; ./scripts/parser_full_mod.sh; cd -

View File

@@ -24,19 +24,42 @@ def main():
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = Path("../data/kres_example/").resolve()
kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
kres_out_dir.mkdir(exist_ok=True)
kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
kres_full_out_dir.mkdir(exist_ok=True)
for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
print("Processing file: " + str(kres_file))
out_str = ""
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
kres_full_out_str = ""
for _, sentence in res_dict.items():
out_str += par.to_conll_2009_SRL(sentence)
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(out_str.encode("utf-8"))
fp.close()
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
kres_full_out_str += par.to_conll_2009_full(sentence)
# for SRL tagging
try:
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
except:
pass
try:
# for full tokenization
with (kres_full_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_full_out_str.encode("utf-8"))
fp.close()
except:
pass
print("end parsing kres")

View File

@@ -109,7 +109,7 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry):
def to_conll_2009_SRL(self, sentence_entry, napreds=100):
def fillpred(pos, feat):
if pos == "V" and "main" in feat.split("|"):
@@ -135,10 +135,10 @@ class Parser:
print(sentence_entry["links"])
"""
more_underscores = "".join(["\t_" for x in range(9)])
apreds = "".join(["\t_" for x in range(napreds)])
# 1 3 4 5 6 7 8 9 10 11 12 13 14 15 n
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\r\n".format(
# format: 14 + apreds
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}{}\n".format(
t_id, # id
token[2], # form
token[3], # lemma
@@ -153,7 +153,7 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
more_underscores,
apreds,
)
out_str += "\n"
# print(out_str)

View File

@@ -0,0 +1,60 @@
#!/bin/sh
# Wrapper around the mate-tools/LTH SRL CompletePipeline: assembles a java
# command line from the model paths below and runs it on a pre-tokenized
# CoNLL-format input file.
## There are three sets of options that need, may need to, and could be changed.
## (1) deals with input and output. You have to set these (in particular, you need to provide models)
## (2) deals with the jvm parameters and may need to be changed
## (3) deals with the behaviour of the system
## For further information on switches, see the source code, or run
## java -cp srl.jar se.lth.cs.srl.Parse --help
##################################################
## (1) The following needs to be set appropriately
##################################################
#INPUT="/home/anders/corpora/conll09/eng/CoNLL2009-evaluation-English-SRLonly.txt" #evaluation corpus
# INPUT=/home/anders/corpora/conll09/chi/CoNLL2009-ST-evaluation-Chinese-SRLonly.txt
# NOTE(review): INPUT is hard-coded to one example file from the kres full-output
# directory produced by parse.py — presumably a placeholder; confirm before batch use.
INPUT=./../../data/kres_example_full_out/F0006347.xml.parsed.tsv
# NOTE(review): LANG is "ger" (German) but the tagger/parser/SRL models below are
# all Chinese ("chi") — confirm this mismatch is intentional.
LANG="ger"
##TOKENIZER_MODEL="models/eng/EnglishTok.bin.gz" #This is not used here anyway. The input is assumed to be segmented/tokenized already.
##LEMMATIZER_MODEL="models/chi/lemma-eng.model"
POS_MODEL="models/chi/tag-chn.model"
#MORPH_MODEL="models/ger/morph-ger.model" #Morphological tagger is not applicable to English. Fix the path and uncomment if you are running german.
PARSER_MODEL="models/chi/prs-chn.model"
SRL_MODEL="models/chi/srl-chn.model"
# Output file is named after the language code, e.g. "ger.out".
OUTPUT="$LANG.out"
##################################################
## (2) These ones may need to be changed
##################################################
JAVA="java" #Edit this if you want to use a specific JRE.
MEM="4g" #Memory for the JVM, might need to be increased for large corpora.
CP="srl.jar:lib/anna.jar:lib/liblinear-1.51-with-deps.jar:lib/opennlp-tools-1.4.3.jar:lib/maxent-2.5.2.jar:lib/trove.jar:lib/seg.jar"
JVM_ARGS="-cp $CP -Xmx$MEM"
##################################################
## (3) The following changes the behaviour of the system
##################################################
#RERANKER="-reranker" #Uncomment this if you want to use a reranker too. The model is assumed to contain a reranker. While training, the corresponding parameter has to be provided.
#NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step.
##################################################
# $NOPI and $RERANKER expand to nothing unless uncommented above.
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.CompletePipeline $LANG $NOPI $RERANKER -tagger $POS_MODEL -parser $PARSER_MODEL -srl $SRL_MODEL -test $INPUT -out $OUTPUT"
# Optional stages are appended only when their model variables are set
# (all three are commented out above by default).
if [ "$TOKENIZER_MODEL" != "" ]; then
CMD="$CMD -token $TOKENIZER_MODEL"
fi
if [ "$LEMMATIZER_MODEL" != "" ]; then
CMD="$CMD -lemma $LEMMATIZER_MODEL"
fi
if [ "$MORPH_MODEL" != "" ]; then
CMD="$CMD -morph $MORPH_MODEL"
fi
echo "Executing: $CMD"
# Unquoted on purpose: relies on word splitting to turn $CMD into argv.
# NOTE(review): this breaks if any model path contains whitespace.
$CMD

Binary file not shown.