srl tagging pipeline (output in .tsv)

This commit is contained in:
2019-02-24 22:23:32 +01:00
parent 9939bf0f55
commit b79721f6a7
25 changed files with 10104 additions and 4255 deletions

View File

@@ -1,13 +0,0 @@
all: parse tag_srl
parse:
python3 main.py
tag_srl:
cd srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
tag_full:
cd srl-20131216 ./scripts/parser_full_mod.sh; cd -
fillpred_model/model:
cd fillpred_model; python3 fpmodel.py

View File

@@ -4,4 +4,7 @@ data1.pickle:
python3 step1.py
model.pickle: data1.pickle
python3 step2.py
python3 step2.py
clean:
rm ./*.pickle

View File

View File

@@ -2,39 +2,53 @@ from pathlib import Path
import numpy as np
import pandas as pd
INFILE = "../../data/mate_train/sl.test.mate"
# INFILE = "../../data/mate_train/sl.all.mate"
OUTFILE = "data1.pickle"
ssj_mate = Path(INFILE)
df = pd.read_csv(ssj_mate, sep='~', header=None)
df = df.iloc[:,0].str.split('\t', n=14, expand=True)
print(df.head())
"""
msd_set = set()
for i, r in df.iterrows():
msd_set.update(r[6].split("|"))
def gen_msdlabels(df):
msd_set = set()
for i, r in df.iterrows():
msd_set.update(r[6].split("|"))
return sorted(list(msd_set))
msdlabels = sorted(list(msd_set))
"""
msdlabels = ['!', '"', '#', '%', "'", '(', ')', '+Animate', '+Clitic', '+Definiteness', '+Negative', ',', '-', '-Animate', '-Definiteness', '-Negative', '.', '/', ':', ';', '?', 'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction', 'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual', 'Verb', 'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal', 'common', 'comparative', 'conditional', 'coordinating', 'dative', 'demonstrative', 'digit', 'dual', 'feminine', 'first', 'foreign', 'future', 'general', 'genitive', 'imperative', 'indefinite', 'infinitive', 'instrumental', 'interrogative', 'letter', 'locative', 'main', 'masculine', 'negative', 'neuter', 'nominative', 'ordinal', 'participle', 'perfective', 'personal', 'plural', 'positive', 'possessive', 'present', 'progressive', 'pronominal', 'proper', 'reflexive', 'relative', 'roman', 'second', 'singular', 'special', 'subordinating', 'superlative', 'supine', 'third', '«', '°', '»', '', '', '']
print("labels: \n", msdlabels)
labels = ["biti"] + msdlabels + ["fillpred"]
ndf = pd.DataFrame(columns=labels, dtype=bool)
for i, r in df.iterrows():
lemma = r[2]
msd = r[6]
fillpred = r[12] # y
def build_model_row(tsv_row):
# input: tsv_row in conll_2009 format
# preprocessed -- all possible msd values, alphabetically sorted
msdlabels = ['!', '"', '#', '%', "'", '(', ')', '+Animate', '+Clitic', '+Definiteness', '+Negative', ',', '-', '-Animate', '-Definiteness', '-Negative', '.', '/', ':', ';', '?', 'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction', 'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual', 'Verb', 'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal', 'common', 'comparative', 'conditional', 'coordinating', 'dative', 'demonstrative', 'digit', 'dual', 'feminine', 'first', 'foreign', 'future', 'general', 'genitive', 'imperative', 'indefinite', 'infinitive', 'instrumental', 'interrogative', 'letter', 'locative', 'main', 'masculine', 'negative', 'neuter', 'nominative', 'ordinal', 'participle', 'perfective', 'personal', 'plural', 'positive', 'possessive', 'present', 'progressive', 'pronominal', 'proper', 'reflexive', 'relative', 'roman', 'second', 'singular', 'special', 'subordinating', 'superlative', 'supine', 'third', '«', '°', '»', '', '', '']
lemma = tsv_row[2]
msd = tsv_row[6]
fillpred = tsv_row[12] # Y
row = []
row.append(lemma == "biti")
row.extend([lb in msd.split("|") for lb in msdlabels])
row.append(fillpred == "Y")
if i % 1000 == 0:
print(i, df.shape)
ndf.loc[i] = row
return row
print(ndf.head())
ndf.to_pickle(OUTFILE)
if __name__ == "__main__":
ssj_mate = Path(INFILE)
df = pd.read_csv(ssj_mate, sep='~', header=None)
df = df.iloc[:,0].str.split('\t', n=14, expand=True)
print(df.head())
# msdlabels = gen_msdlabels()
msdlabels = ['!', '"', '#', '%', "'", '(', ')', '+Animate', '+Clitic', '+Definiteness', '+Negative', ',', '-', '-Animate', '-Definiteness', '-Negative', '.', '/', ':', ';', '?', 'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction', 'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual', 'Verb', 'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal', 'common', 'comparative', 'conditional', 'coordinating', 'dative', 'demonstrative', 'digit', 'dual', 'feminine', 'first', 'foreign', 'future', 'general', 'genitive', 'imperative', 'indefinite', 'infinitive', 'instrumental', 'interrogative', 'letter', 'locative', 'main', 'masculine', 'negative', 'neuter', 'nominative', 'ordinal', 'participle', 'perfective', 'personal', 'plural', 'positive', 'possessive', 'present', 'progressive', 'pronominal', 'proper', 'reflexive', 'relative', 'roman', 'second', 'singular', 'special', 'subordinating', 'superlative', 'supine', 'third', '«', '°', '»', '', '', '']
print("labels: \n", msdlabels)
labels = ["biti"] + msdlabels + ["fillpred"]
ndf = pd.DataFrame(columns=labels, dtype=bool)
for i, r in df.iterrows():
ndf.loc[i] = build_model_row(r)
if i % 1000 == 0:
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(OUTFILE)

View File

@@ -8,22 +8,23 @@ from sklearn.metrics import accuracy_score
INFILE = "data1.pickle"
OUTFILE = "model.pickle"
df = pd.read_pickle(INFILE)
if __name__ == "__main__":
df = pd.read_pickle(INFILE)
X = df.values[:,0:-1]
y = df.values[:,-1]
X = df.values[:,0:-1]
y = df.values[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred)*100))
print("Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred)*100))
# above was a test, now fit the actual model using the entire data
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
# above was a test, now fit the actual model using the entire data
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
pickle.dump(clf_full, open(OUTFILE, "wb"))
pickle.dump(clf_full, open(OUTFILE, "wb"))

View File

@@ -25,12 +25,9 @@ def main():
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = Path("../data/kres_example/").resolve()
kres_out_dir = kres_dir.parent / (kres_dir.name + "_out")
kres_out_dir = kres_dir.parent / (kres_dir.name + "_tsv")
kres_out_dir.mkdir(exist_ok=True)
kres_full_out_dir = kres_dir.parent / (kres_dir.name + "_full_out")
kres_full_out_dir.mkdir(exist_ok=True)
for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
print("Processing file: " + str(kres_file))
@@ -38,27 +35,13 @@ def main():
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
kres_full_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
kres_full_out_str += par.to_conll_2009_full(sentence)
# for SRL tagging
try:
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
except:
pass
try:
# for full tokenization
with (kres_full_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_full_out_str.encode("utf-8"))
fp.close()
except:
pass
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")

View File

@@ -1,6 +1,9 @@
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
class Parser:
# reads a TEI xml file and returns a dictionary:
@@ -15,6 +18,8 @@ class Parser:
self.W_TAGS = ['w']
self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc']
with Path("./fillpred_model/model.pickle").open("rb") as fp:
self.fillpred_model = pickle.load(fp)
def parse_tei(self, filepath):
@@ -111,11 +116,11 @@ class Parser:
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def fillpred(pos, feat):
# TODO (decision tree or bayes on mate training data)
if pos == "V" and "main" in feat.split("|"):
return True
return False
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
x = mrow[:-1]
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
@@ -137,7 +142,6 @@ class Parser:
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
feat = "|".join(self.msdmap.slo_msd_to_eng_long(token[4]).split(" "))
fprd = fillpred(pos, feat)
"""
print(t_id)
@@ -146,10 +150,7 @@ class Parser:
print(token)
print(sentence_entry["links"])
"""
# format: 14 + apreds
out_str += '\t'.join(map(str,
[
row_list = [
t_id,
form,
token[3], # lemma
@@ -162,11 +163,18 @@ class Parser:
sentence_entry["links"][t_id][2], # phead
sentence_entry["links"][t_id][0], # deprel
sentence_entry["links"][t_id][0], # pdeprel
"Y" if fprd else "_", # fillpred
token[3] if fprd else "_", # pred
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
]
fprd = fillpred(row_list)
row_list[12] = "Y" if fprd else "_"
row_list[13] = token[3] if fprd else "_"
# format: 14 + apreds
out_str += '\t'.join(map(str,
row_list
))
out_str += "\n" # newline at the end of sentence
return out_str

View File

@@ -81,43 +81,6 @@
81 akcijo akcija akcija N N _ Noun|common|feminine|singular|accusative 78 78 dol dol _ _ _ _ _ _ _ _
82 . . . . . _ . 0 0 modra modra _ _ _ _ _ _ _ _
1 Ker ker ker C C _ Conjunction|subordinating 14 14 vez vez _ _ _ _ _ _
2 se se se P P _ Pronoun|reflexive|+Clitic 14 14 del del _ _ _ _ _ _
3 nas jaz jaz P P _ Pronoun|personal|first|plural|accusative 5 5 dve dve _ _ _ _ _ _
4 v v v S S _ Adposition|locative 5 5 dol dol _ _ _ _ _ _
5 preteklosti preteklost preteklost N N _ Noun|common|feminine|singular|locative 14 14 štiri štiri _ _ TIME _ _ _
6 zaradi zaradi zaradi S S _ Adposition|genitive 8 8 dol dol _ _ _ _ _ _
7 genialnega genialen genialen A A _ Adjective|general|positive|masculine|singular|genitive 8 8 dol dol _ _ _ _ _ _
8 sistema sistem sistem N N _ Noun|common|masculine|singular|genitive 5 5 dol dol _ _ _ _ _ _
9 splošne splošen splošen A A _ Adjective|general|positive|feminine|singular|genitive 11 11 dol dol _ _ _ _ _ _
10 ljudske ljudski ljudski A A _ Adjective|general|positive|feminine|singular|genitive 11 11 dol dol _ _ _ _ _ _
11 obrambe obramba obramba N N _ Noun|common|feminine|singular|genitive 8 8 dol dol _ _ _ _ _ _
12 nihče nihče nihče P P _ Pronoun|negative|masculine|singular|nominative 14 14 ena ena _ _ ACT _ _ _
13 ni biti biti V V _ Verb|auxiliary|present|third|singular|+Negative 14 14 del del _ _ _ _ _ _
14 upal upati upati V V _ Verb|main|progressive|participle|singular|masculine 0 0 modra modra Y upati _ _ _ _
15 lotiti lotiti lotiti V V _ Verb|main|perfective|infinitive 14 14 dol dol Y lotiti MWPRED _ _ _
16 , , , , , _ , 0 0 modra modra _ _ _ _ _ _
17 se se se P P _ Pronoun|reflexive|+Clitic 20 20 del del _ _ _ _ _ _
18 sovrag sovrag sovrag N N _ Noun|common|masculine|singular|nominative 20 20 ena ena _ _ _ _ ACT _
19 sedaj sedaj sedaj R R _ Adverb|general|positive 20 20 štiri štiri _ _ _ _ TIME _
20 poslužuje posluževati posluževati V V _ Verb|main|progressive|present|third|singular 0 0 modra modra Y posluževati _ _ _ _
21 bolj bolj bolj R R _ Adverb|general|comparative 22 22 dol dol _ _ _ _ _ _
22 perfidnih perfiden perfiden A A _ Adjective|general|positive|feminine|plural|genitive 23 23 dol dol _ _ _ _ _ _
23 strategij strategija strategija N N _ Noun|common|feminine|plural|genitive 20 20 dve dve _ _ _ _ PAT _
24 , , , , , _ , 0 0 modra modra _ _ _ _ _ _
25 saj saj saj C C _ Conjunction|coordinating 30 30 vez vez _ _ _ _ _ _
26 nam jaz jaz P P _ Pronoun|personal|first|plural|dative 30 30 dve dve _ _ _ REC _ REC
27 lahko lahko lahko R R _ Adverb|general|positive 30 30 del del _ _ _ _ _ _
28 praktično praktično praktično R R _ Adverb|general|positive 29 29 dol dol _ _ _ _ _ _
29 kadarkoli kadarkoli kadarkoli R R _ Adverb|general|positive 30 30 tri tri _ _ _ _ _ MANN
30 odreže odrezati odrezati V V _ Verb|main|perfective|present|third|singular 0 0 modra modra Y odrezati _ _ _ _
31 dostop dostop dostop N N _ Noun|common|masculine|singular|accusative|-Animate 30 30 dve dve _ _ _ _ _ PAT
32 do do do S S _ Adposition|genitive 33 33 dol dol _ _ _ _ _ _
33 hrane hrana hrana N N _ Noun|common|feminine|singular|genitive 31 31 dol dol _ _ _ _ _ _
34 in in in C C _ Conjunction|coordinating 35 35 vez vez _ _ _ _ _ _
35 pijače pijača pijača N N _ Noun|common|feminine|singular|genitive 33 33 prir prir _ _ _ _ _ _
36 . . . . . _ . 0 0 modra modra _ _ _ _ _ _
1 Zadeva zadeva zadeva N N _ Noun|common|feminine|singular|nominative 2 2 ena ena _ _
2 je biti biti V V _ Verb|auxiliary|present|third|singular|-Negative 0 0 modra modra _ _
3 kristalno kristalno kristalno R R _ Adverb|general|positive 4 4 dol dol _ _
@@ -160,3 +123,40 @@
34 supersile supersila supersila N N _ Noun|common|feminine|singular|genitive 0 0 modra modra _ _ _ _
35 . . . . . _ . 0 0 modra modra _ _ _ _
1 Ker ker ker C C _ Conjunction|subordinating 14 14 vez vez _ _ _ _ _ _ _
2 se se se P P _ Pronoun|reflexive|+Clitic 14 14 del del _ _ _ _ _ _ _
3 nas jaz jaz P P _ Pronoun|personal|first|plural|accusative 5 5 dve dve _ _ _ _ _ _ _
4 v v v S S _ Adposition|locative 5 5 dol dol _ _ _ _ _ _ _
5 preteklosti preteklost preteklost N N _ Noun|common|feminine|singular|locative 14 14 štiri štiri _ _ _ TIME _ _ _
6 zaradi zaradi zaradi S S _ Adposition|genitive 8 8 dol dol _ _ _ _ _ _ _
7 genialnega genialen genialen A A _ Adjective|general|positive|masculine|singular|genitive 8 8 dol dol _ _ _ _ _ _ _
8 sistema sistem sistem N N _ Noun|common|masculine|singular|genitive 5 5 dol dol _ _ _ _ _ _ _
9 splošne splošen splošen A A _ Adjective|general|positive|feminine|singular|genitive 11 11 dol dol _ _ _ _ _ _ _
10 ljudske ljudski ljudski A A _ Adjective|general|positive|feminine|singular|genitive 11 11 dol dol _ _ _ _ _ _ _
11 obrambe obramba obramba N N _ Noun|common|feminine|singular|genitive 8 8 dol dol _ _ _ _ _ _ _
12 nihče nihče nihče P P _ Pronoun|negative|masculine|singular|nominative 14 14 ena ena _ _ _ ACT _ _ _
13 ni biti biti V V _ Verb|auxiliary|present|third|singular|+Negative 14 14 del del Y biti _ _ _ _ _
14 upal upati upati V V _ Verb|main|progressive|participle|singular|masculine 0 0 modra modra Y upati _ _ _ _ _
15 lotiti lotiti lotiti V V _ Verb|main|perfective|infinitive 14 14 dol dol Y lotiti _ MWPRED _ _ _
16 , , , , , _ , 0 0 modra modra _ _ _ _ _ _ _
17 se se se P P _ Pronoun|reflexive|+Clitic 20 20 del del _ _ _ _ _ _ _
18 sovrag sovrag sovrag N N _ Noun|common|masculine|singular|nominative 20 20 ena ena _ _ _ _ _ ACT _
19 sedaj sedaj sedaj R R _ Adverb|general|positive 20 20 štiri štiri _ _ _ _ _ TIME _
20 poslužuje posluževati posluževati V V _ Verb|main|progressive|present|third|singular 0 0 modra modra Y posluževati _ _ _ _ _
21 bolj bolj bolj R R _ Adverb|general|comparative 22 22 dol dol _ _ _ _ _ _ _
22 perfidnih perfiden perfiden A A _ Adjective|general|positive|feminine|plural|genitive 23 23 dol dol _ _ _ _ _ _ _
23 strategij strategija strategija N N _ Noun|common|feminine|plural|genitive 20 20 dve dve _ _ _ _ _ PAT _
24 , , , , , _ , 0 0 modra modra _ _ _ _ _ _ _
25 saj saj saj C C _ Conjunction|coordinating 30 30 vez vez _ _ _ _ _ _ _
26 nam jaz jaz P P _ Pronoun|personal|first|plural|dative 30 30 dve dve _ _ REC _ REC _ REC
27 lahko lahko lahko R R _ Adverb|general|positive 30 30 del del _ _ _ _ _ _ _
28 praktično praktično praktično R R _ Adverb|general|positive 29 29 dol dol _ _ _ _ _ _ _
29 kadarkoli kadarkoli kadarkoli R R _ Adverb|general|positive 30 30 tri tri _ _ _ _ _ _ MANN
30 odreže odrezati odrezati V V _ Verb|main|perfective|present|third|singular 0 0 modra modra Y odrezati _ _ _ _ _
31 dostop dostop dostop N N _ Noun|common|masculine|singular|accusative|-Animate 30 30 dve dve _ _ _ _ _ _ PAT
32 do do do S S _ Adposition|genitive 33 33 dol dol _ _ _ _ _ _ _
33 hrane hrana hrana N N _ Noun|common|feminine|singular|genitive 31 31 dol dol _ _ _ _ _ _ _
34 in in in C C _ Conjunction|coordinating 35 35 vez vez _ _ _ _ _ _ _
35 pijače pijača pijača N N _ Noun|common|feminine|singular|genitive 33 33 prir prir _ _ _ _ _ _ _
36 . . . . . _ . 0 0 modra modra _ _ _ _ _ _ _

View File

@@ -13,10 +13,11 @@
##################################################
# INPUT=./../../data/mate_train/sl.test.mate
INPUT=./../../data/kres_example_out/F0006347.xml.parsed.tsv
INPUT="$1"
Lang="ger"
MODEL="./srl-ger.model"
OUTPUT="${Lang}-eval.out"
# OUTPUT="${Lang}-eval.out"
OUTPUT="$2"
##################################################
## (2) These ones may need to be changed

25
tools/srl-20131216/tag_all.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Run the mate-tools tagger over every entry in IN_FOLDER and store the result
# of each one as OUT_FOLDER/<base>.srl.tsv, where <base> is the input file name
# up to its first '.'.
#
# Usage: tag_all.sh IN_FOLDER OUT_FOLDER
#
# The tagger is called through the relative path ./scripts/..., so this script
# must be started from the directory that contains scripts/.
# Inputs that share the same <base> overwrite each other's output.
#
# Exit status: 0 when every input was tagged, 1 as soon as the tagger fails on
# an input (remaining inputs are not processed), 2 on missing arguments.

# Refuse to run without both folders: with an empty OUT_FOLDER the cleanup
# below would otherwise expand to "rm /*srl.tsv".
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
    echo "Usage: $0 IN_FOLDER OUT_FOLDER" >&2
    exit 2
fi

IN_FOLDER="$1"
OUT_FOLDER="$2"
SUFFIX="srl.tsv"

mkdir -p -- "$OUT_FOLDER" || exit 1

# Remove results of a previous run. -f keeps the first run (nothing to delete
# yet) from printing an error; the glob stays outside the quotes so it expands.
rm -f -- "$OUT_FOLDER"/*"$SUFFIX"

for infile in "$IN_FOLDER"/*; do
    # When IN_FOLDER is empty the pattern is left unexpanded; skip that
    # literal "IN_FOLDER/*" instead of handing it to the tagger.
    [ -e "$infile" ] || continue

    echo "Tagging: ${infile}"

    # Same result as: basename "$infile" | cut -d'.' -f1
    base=${infile##*/}
    base=${base%%.*}
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
        echo "Saved as ${outfile}"
    else
        echo "ERR" >&2
        exit 1
    fi
done