forked from kristjan/cjvt-srl-tagging
model for fillpred
This commit is contained in:
7
tools/fillpred_model/Makefile
Normal file
7
tools/fillpred_model/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
# Build pipeline for the FILLPRED model:
#   step1.py writes data1.pickle, step2.py reads it and writes model.pickle.

# `all` is a command name, not a file; mark it phony so a stray file called
# "all" can never make the build look up to date.
.PHONY: all
all: model.pickle

# Depend on the generating script so the output is rebuilt when it changes.
data1.pickle: step1.py
	python3 step1.py

model.pickle: data1.pickle step2.py
	python3 step2.py
|
||||
2
tools/fillpred_model/README.md
Normal file
2
tools/fillpred_model/README.md
Normal file
@@ -0,0 +1,2 @@
|
||||
We need to figure out how to pick the FILLPRED attribute; simply using tokens with "modra" is not enough.
|
||||
We'll use a decision tree classifier to build a model based on the ssj500k mate data.
|
||||
@@ -1,28 +0,0 @@
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def main():
    """Build and preview the boolean feature table for the FILLPRED model.

    Reads the tab-separated mate file, collects every MSD feature found in
    field 6, and builds one boolean row per token:

    * ``biti``     -- whether the lemma (field 2) equals ``"biti"``
    * one column per MSD feature -- whether the token carries that feature
    * ``fillpred`` -- whether field 12 equals ``"Y"``
      (NOTE(review): field 12 is assumed to hold the FILLPRED flag -- confirm
      against the mate column layout)

    The function prints previews only; it returns nothing.
    """
    ssj_mate = Path("../../data/mate_train/sl.all.mate")

    # Read each line into a single column ('~' is presumably a character that
    # never occurs in the data -- verify), then split on tabs into at most
    # 15 fields.
    df = pd.read_csv(ssj_mate, sep='~', header=None)
    df = df.iloc[:, 0].str.split('\t', n=14, expand=True)
    print(df.head())

    # Split every MSD string exactly once and reuse the result both for
    # collecting the label set and for the per-row membership tests.
    msd_features = [set(msd.split("|")) for msd in df[6]]
    msd_labels = sorted(set().union(*msd_features))

    labels = ["biti"] + msd_labels + ["fillpred"]
    print("labels: \n", labels)

    # The original loop called row.extend() without an argument, which raises
    # TypeError; build complete rows and construct the frame in one go.
    rows = [
        [lemma == "biti"]
        + [label in features for label in msd_labels]
        + [fillpred == "Y"]
        for lemma, features, fillpred in zip(df[2], msd_features, df[12])
    ]
    ndf = pd.DataFrame(rows, columns=labels, dtype=bool)
    print(ndf.head())


if __name__ == "__main__":
    main()
|
||||
40
tools/fillpred_model/step1.py
Normal file
40
tools/fillpred_model/step1.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
INFILE = "../../data/mate_train/sl.test.mate"
# INFILE = "../../data/mate_train/sl.all.mate"
OUTFILE = "data1.pickle"

# Read each line of the mate file into a single column ('~' is presumably a
# character that never occurs in the data -- verify), then split that column
# on tabs into at most 15 fields.
ssj_mate = Path(INFILE)
df = pd.read_csv(ssj_mate, sep='~', header=None)
df = df.iloc[:, 0].str.split('\t', n=14, expand=True)
print(df.head())

# The MSD label list is hard-coded so the feature columns do not depend on
# which input file is used. It can be regenerated from the data with:
#
#     msd_set = set()
#     for i, r in df.iterrows():
#         msd_set.update(r[6].split("|"))
#     msdlabels = sorted(msd_set)
msdlabels = ['!', '"', '#', '%', "'", '(', ')', '+Animate', '+Clitic', '+Definiteness', '+Negative', ',', '-', '-Animate', '-Definiteness', '-Negative', '.', '/', ':', ';', '?', 'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction', 'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual', 'Verb', 'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal', 'common', 'comparative', 'conditional', 'coordinating', 'dative', 'demonstrative', 'digit', 'dual', 'feminine', 'first', 'foreign', 'future', 'general', 'genitive', 'imperative', 'indefinite', 'infinitive', 'instrumental', 'interrogative', 'letter', 'locative', 'main', 'masculine', 'negative', 'neuter', 'nominative', 'ordinal', 'participle', 'perfective', 'personal', 'plural', 'positive', 'possessive', 'present', 'progressive', 'pronominal', 'proper', 'reflexive', 'relative', 'roman', 'second', 'singular', 'special', 'subordinating', 'superlative', 'supine', 'third', '«', '°', '»', '‘', '’', '…']
print("labels: \n", msdlabels)

# Column layout of the output table: lemma flag, MSD features, target.
labels = ["biti"] + msdlabels + ["fillpred"]

# Collect plain Python rows and build the DataFrame once at the end. Growing
# a DataFrame with `.loc[i] = row` reallocates on every insertion and can
# lose the declared bool dtype.
rows = []
for i, (lemma, msd, fillpred) in enumerate(zip(df[2], df[6], df[12])):
    if i % 1000 == 0:
        print(i, df.shape)  # progress indicator

    # Split the MSD string once per token instead of once per label.
    msd_features = set(msd.split("|"))

    row = [lemma == "biti"]
    row.extend(lb in msd_features for lb in msdlabels)
    row.append(fillpred == "Y")  # y (classification target)
    rows.append(row)

ndf = pd.DataFrame(rows, columns=labels, dtype=bool)

print(ndf.head())
ndf.to_pickle(OUTFILE)
|
||||
29
tools/fillpred_model/step2.py
Normal file
29
tools/fillpred_model/step2.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
INFILE = "data1.pickle"
OUTFILE = "model.pickle"

df = pd.read_pickle(INFILE)

# Every column except the last is a feature; the last column is the target.
X = df.values[:, 0:-1]
y = df.values[:, -1]

# Hold out 30% of the rows to get an accuracy estimate; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# The score is scaled by 100, so the printed number is a percentage.
print("Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred)*100))

# above was a test, now fit the actual model using the entire data
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)

# Use a context manager so the output file is flushed and closed
# deterministically, even if pickling fails part-way through.
with open(OUTFILE, "wb") as model_file:
    pickle.dump(clf_full, model_file)
|
||||
Reference in New Issue
Block a user