|
|
from pathlib import Path
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
# Path of the input corpus; the script entry point reads it line by line and
# splits each line on tabs.
INFILE = "../../data/mate_train/sl.test.mate"
|
|
|
# INFILE = "../../data/mate_train/sl.all.mate"
|
|
|
# Path the script entry point pickles the resulting boolean feature frame to.
OUTFILE = "data1.pickle"
|
|
|
|
|
|
|
|
|
def gen_msdlabels(df):
    """Collect every distinct MSD label found in column 6 of *df*.

    Each cell of column 6 is split on "|" and the resulting pieces are
    pooled into one set.

    Args:
        df: DataFrame whose column labelled 6 holds "|"-separated strings.

    Returns:
        A sorted list of the unique labels; an empty list for an empty frame.
    """
    # An empty frame has nothing to contribute (and may lack column 6).
    if df.empty:
        return []

    msd_set = set()
    # Walk the one column that is needed; iterrows() would materialise a
    # full Series per row just to read a single cell.
    for msd in df[6]:
        msd_set.update(msd.split("|"))
    return sorted(msd_set)
|
|
|
|
|
|
|
|
|
# Precomputed inventory of all possible MSD values, alphabetically sorted.
# The order of this tuple fixes the order of the feature flags produced by
# build_model_row.
_MSD_LABELS = (
    '!', '"', '#', '%', "'", '(', ')',
    '+Animate', '+Clitic', '+Definiteness', '+Negative',
    ',', '-', '-Animate', '-Definiteness', '-Negative',
    '.', '/', ':', ';', '?',
    'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction',
    'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual',
    'Verb',
    'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal', 'common',
    'comparative', 'conditional', 'coordinating', 'dative', 'demonstrative',
    'digit', 'dual', 'feminine', 'first', 'foreign', 'future', 'general',
    'genitive', 'imperative', 'indefinite', 'infinitive', 'instrumental',
    'interrogative', 'letter', 'locative', 'main', 'masculine', 'negative',
    'neuter', 'nominative', 'ordinal', 'participle', 'perfective',
    'personal', 'plural', 'positive', 'possessive', 'present',
    'progressive', 'pronominal', 'proper', 'reflexive', 'relative', 'roman',
    'second', 'singular', 'special', 'subordinating', 'superlative',
    'supine', 'third',
    '«', '°', '»', '‘', '’', '…',
)


def build_model_row(tsv_row):
    """Turn one CoNLL-2009 formatted row into a list of boolean features.

    Args:
        tsv_row: An indexable row in CoNLL-2009 format. Index 2 is read as
            the lemma, index 6 as the "|"-separated MSD string and index 12
            as the fillpred value.

    Returns:
        A list of booleans laid out as:
        [lemma == "biti", <one flag per known MSD label>, fillpred == "Y"].
    """
    lemma = tsv_row[2]
    msd = tsv_row[6]
    fillpred = tsv_row[12]  # Y

    # Split once and use a set, instead of re-splitting for every label.
    present = set(msd.split("|"))

    row = [lemma == "biti"]
    row.extend(label in present for label in _MSD_LABELS)
    row.append(fillpred == "Y")
    return row
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse INFILE, encode every row as boolean features
    # with build_model_row, and pickle the resulting frame to OUTFILE.
    ssj_mate = Path(INFILE)

    # Read each line into a single column first ('~' is presumably chosen
    # because it does not occur in the data -- confirm), then split that
    # column on tabs into at most 15 fields.
    df = pd.read_csv(ssj_mate, sep='~', header=None)
    df = df.iloc[:, 0].str.split('\t', n=14, expand=True)
    print(df.head())

    # Hard-coded label inventory; gen_msdlabels(df) derives such a list from
    # a parsed frame.
    msdlabels = [
        '!', '"', '#', '%', "'", '(', ')',
        '+Animate', '+Clitic', '+Definiteness', '+Negative',
        ',', '-', '-Animate', '-Definiteness', '-Negative',
        '.', '/', ':', ';', '?',
        'Abbreviation', 'Adjective', 'Adposition', 'Adverb', 'Conjunction',
        'Interjection', 'Noun', 'Numeral', 'Particle', 'Pronoun', 'Residual',
        'Verb',
        'accusative', 'auxiliary', 'biaspectual', 'bound', 'cardinal',
        'common', 'comparative', 'conditional', 'coordinating', 'dative',
        'demonstrative', 'digit', 'dual', 'feminine', 'first', 'foreign',
        'future', 'general', 'genitive', 'imperative', 'indefinite',
        'infinitive', 'instrumental', 'interrogative', 'letter', 'locative',
        'main', 'masculine', 'negative', 'neuter', 'nominative', 'ordinal',
        'participle', 'perfective', 'personal', 'plural', 'positive',
        'possessive', 'present', 'progressive', 'pronominal', 'proper',
        'reflexive', 'relative', 'roman', 'second', 'singular', 'special',
        'subordinating', 'superlative', 'supine', 'third',
        '«', '°', '»', '‘', '’', '…',
    ]
    print("labels: \n", msdlabels)

    # Column layout matches the list returned by build_model_row.
    labels = ["biti"] + msdlabels + ["fillpred"]

    # Collect the encoded rows in a plain list and build the frame once;
    # enlarging a DataFrame row by row via .loc reallocates it on every
    # insert. The split frame has columns 0..k, so positional tuples index
    # the same fields build_model_row reads (2, 6 and 12).
    rows = []
    for i, r in zip(df.index, df.itertuples(index=False, name=None)):
        rows.append(build_model_row(r))
        if i % 1000 == 0:
            # Progress report.
            print(i, df.shape)

    ndf = pd.DataFrame(rows, columns=labels, index=df.index, dtype=bool)
    print(ndf.head())
    ndf.to_pickle(Path(OUTFILE))
|