Added num of letters to x_other_features

This commit is contained in:
lkrsnik 2017-08-18 19:08:42 +02:00
parent 18348b78fc
commit a2fce7c1ae
7 changed files with 1430 additions and 126 deletions

2
.gitignore vendored
View File

@ -91,3 +91,5 @@ ENV/
# Custom
data/
cnn/internal_representations/inputs/
joblist.xml
new_sloleks.xml

View File

@ -3,9 +3,11 @@
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/letters/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/workbench.py" afterPath="$PROJECT_DIR$/workbench.py" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
@ -32,28 +34,25 @@
</provider>
</entry>
</file>
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="244">
<caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<state relative-caret-position="198">
<caret line="14" column="121" lean-forward="false" selection-start-line="14" selection-start-column="102" selection-end-line="14" selection-end-column="121" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#7428#8835#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13669#13971#0" expanded="false" />
<element signature="e#14028#14857#0" expanded="false" />
<element signature="e#14921#15267#0" expanded="false" />
<element signature="e#18834#19030#0" expanded="false" />
<element signature="e#19091#19282#0" expanded="false" />
<element signature="e#19349#19996#0" expanded="false" />
<element signature="e#20095#22393#0" expanded="false" />
<element signature="e#30153#30294#0" expanded="false" />
<element signature="e#30444#32589#0" expanded="false" />
</folding>
</state>
</provider>
@ -69,13 +68,13 @@
</provider>
</entry>
</file>
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="37" column="68" lean-forward="false" selection-start-line="37" selection-start-column="68" selection-end-line="37" selection-end-column="68" />
<state relative-caret-position="410">
<caret line="42" column="17" lean-forward="true" selection-start-line="42" selection-start-column="17" selection-end-line="42" selection-end-column="17" />
<folding>
<element signature="e#53#92#0" expanded="true" />
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
@ -150,21 +149,6 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>count_vowels</find>
<find>shuffle_full_vowel_inputs</find>
<find>generate_presentable_y</find>
<find>is_accetuated_vowel</find>
<find>is_vowel</find>
<find>load_shuffle_vector</find>
<find>create_and_save_shuffle_vector</find>
<find>load_extended_inputs</find>
<find>create_and_save_inputs</find>
<find>shuffle_inputs</find>
<find>complete_feature_dict</find>
<find>create_syllable_letters_translator</find>
<find>syllable_letters_translator</find>
<find>get_max_syllable</find>
<find>check_feature_letter_usage</find>
<find>orig_X</find>
<find>vowels</find>
<find>_create_syllable_letters_translator</find>
@ -180,6 +164,21 @@
<find>size</find>
<find>decode_x</find>
<find>self._input_type ==</find>
<find>../</find>
<find>math</find>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_accent_classification</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -196,10 +195,10 @@
<option value="$PROJECT_DIR$/theano_tutorial/logistic_regression_loop.py" />
<option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn_test_on_other_attributes.ipynb" />
<option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
<option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
<option value="$PROJECT_DIR$/tex_hyphenation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
<option value="$PROJECT_DIR$/workbench.py" />
</list>
</option>
</component>
@ -224,6 +223,8 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="Scratches" />
<pane id="ProjectPane">
<subPane>
<PATH>
@ -238,8 +239,6 @@
</PATH>
</subPane>
</pane>
<pane id="Scope" />
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
@ -830,31 +829,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="244">
<caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tex_hyphenation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1206">
@ -863,16 +837,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="37" column="68" lean-forward="false" selection-start-line="37" selection-start-column="68" selection-end-line="37" selection-end-column="68" />
<folding>
<element signature="e#53#92#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="162">
@ -921,5 +885,37 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="198">
<caret line="14" column="121" lean-forward="false" selection-start-line="14" selection-start-column="102" selection-end-line="14" selection-end-column="121" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#7428#8835#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13669#13971#0" expanded="false" />
<element signature="e#14028#14857#0" expanded="false" />
<element signature="e#14921#15267#0" expanded="false" />
<element signature="e#18834#19030#0" expanded="false" />
<element signature="e#19091#19282#0" expanded="false" />
<element signature="e#19349#19996#0" expanded="false" />
<element signature="e#20095#22393#0" expanded="false" />
<element signature="e#30153#30294#0" expanded="false" />
<element signature="e#30444#32589#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="410">
<caret line="42" column="17" lean-forward="true" selection-start-line="42" selection-start-column="17" selection-end-line="42" selection-end-column="17" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>

View File

@ -7,11 +7,12 @@ import h5py
import math
import keras.backend as K
import os.path
import codecs
class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False):
self._input_type = input_type
self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@ -19,6 +20,7 @@ class Data:
self._additional_letter_attributes = additional_letter_attributes
self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables
self.x_train = None
self.x_other_features_train = None
@ -88,7 +90,8 @@ class Data:
# functions for creating X and y from content
@staticmethod
def _read_content(content_path):
with open(content_path) as f:
# with open(content_path) as f:
with codecs.open(content_path, encoding='utf8') as f:
content = f.readlines()
return [x.split('\t') for x in content]
@ -261,9 +264,9 @@ class Data:
raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
print('CREATING OTHER FEATURES...')
x_other_features = self._create_x_features(content, feature_dictionary)
print('OTHER FEATURES CREATED!')
# print('CREATING OTHER FEATURES...')
x_other_features = self._create_x_features(content, feature_dictionary, vowels)
# print('OTHER FEATURES CREATED!')
if self._shuffle_all_inputs:
print('SHUFFELING INPUTS...')
@ -347,7 +350,7 @@ class Data:
split = min(split_options, key=lambda x: x[1])
return consonants[:split[0] + 1], consonants[split[0] + 1:]
def _create_x_features(self, content, feature_dictionary):
def _create_x_features(self, content, feature_dictionary, vowels):
content = content
x_other_features = []
for el in content:
@ -364,6 +367,14 @@ class Data:
x_el_other_features.append(0)
else:
x_el_other_features.extend([0] * feature[0])
if self._number_of_syllables:
list_of_letters = list(el[0])
num_of_vowels = 0
for i in range(len(list_of_letters)):
if self._is_vowel(list(el[0]), i, vowels):
num_of_vowels += 1
x_el_other_features.append(num_of_vowels)
x_other_features.append(x_el_other_features)
return np.array(x_other_features)
@ -652,6 +663,60 @@ class Data:
def _get_nonresonant_silent_consonants():
return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c']
@staticmethod
def _create_slovene_feature_dictionary():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/
# new: http://nl.ijs.si/ME/V4/msd/html/
# changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
return [[21,
'P',
['p', 's'],
['n', 'p', 's'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[3, 'V', ['p', 'd']],
[1, 'M'],
[21,
'K',
['b'],
['-', 'g', 'v', 'd'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[17,
'S',
['o'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[40,
'Z',
['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'],
['-', 'p', 'd', 't'],
['-', 'm', 'z', 's'],
['-', 'e', 'd', 'm'],
['-', 'i', 'r', 'd', 't', 'm', 'o'],
['-', 'e', 'd', 'm'],
['-', 'm', 'z', 's'],
['-', 'k', 'z']],
[1, 'L'],
[5, 'R', ['s'], ['n', 'r', 's']],
[7, 'D', ['-', 'r', 'd', 't', 'm', 'o']],
[24,
'G',
['g'],
['-'],
['n', 'm', 'd', 's', 'p', 'g'],
['-', 'p', 'd', 't'],
['-', 'e', 'm', 'd'],
['-', 'm', 'z', 's'],
['-', 'n', 'd']]
]
@staticmethod
def _create_feature_dictionary():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/

1204
sloleks_accetuation.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,26 +1,17 @@
# coding: utf-8
# In[1]:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# text in Western (Windows 1252)
import pickle
import numpy as np
# import StringIO
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras import regularizers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.constraints import maxnorm
from keras.layers import Flatten
from keras.optimizers import SGD
from keras.models import load_model
# from keras import backend as Input
np.random.seed(7)
# get_ipython().magic('run ../../../prepare_data.py')
@ -34,39 +25,68 @@ from prepare_data import *
# X_train, X_other_features_train, y_train, X_validate, X_other_features_validate, y_validate = generate_full_matrix_inputs()
# save_inputs('../../internal_representations/inputs/shuffeled_matrix_train_inputs_other_features_output_11.h5', X_train, y_train, other_features = X_other_features_train)
# save_inputs('../../internal_representations/inputs/shuffeled_matrix_validate_inputs_other_features_output_11.h5', X_validate, y_validate, other_features = X_other_features_validate)
X_train, X_other_features_train, y_train = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_train_inputs_other_features_output_11.h5', other_features=True)
X_validate, X_other_features_validate, y_validate = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_validate_inputs_other_features_output_11.h5', other_features=True)
# X_train, X_other_features_train, y_train = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_train_inputs_other_features_output_11.h5', other_features=True)
# X_validate, X_other_features_validate, y_validate = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_validate_inputs_other_features_output_11.h5', other_features=True)
data = Data('l', save_generated_data=False, number_of_syllables=True)
data.generate_data('letters_word_accetuation_train',
'letters_word_accetuation_test',
'letters_word_accetuation_validate', content_name='SlovarIJS_BESEDE_utf8.lex',
content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
inputs_location='', content_location='')
num_examples = len(X_train) # training set size
nn_output_dim = 11
num_examples = len(data.x_train) # training set size
nn_output_dim = 10
nn_hdim = 516
word_processor = Sequential()
word_processor.add(Conv1D(43, (3), input_shape=(23, 43), padding='same', activation='relu'))
word_processor.add(Conv1D(43, (3), padding='same', activation='relu'))
word_processor.add(MaxPooling1D(pool_size=2))
word_processor.add(Flatten())
word_processor.add(Dense(516, activation='relu', kernel_constraint=maxnorm(3)))
metadata_processor = Sequential()
metadata_processor.add(Dense(256, input_dim=167, activation='relu'))
model = Sequential()
model.add(Merge([word_processor, metadata_processor], mode='concat')) # Merge is your sensor fusion buddy
model.add(Dense(1024, input_dim=(516 + 256), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1024, input_dim=(516 + 256), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(nn_output_dim, activation='sigmoid'))
batch_size = 16
# actual_epoch = 1
actual_epoch = 60
# num_fake_epoch = 2
num_fake_epoch = 20
# In[10]:
# epochs = 5
# lrate = 0.1
# decay = lrate/epochs
# sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.fit([X_train, X_other_features_train], y_train, validation_data=([X_validate, X_other_features_validate], y_validate), epochs=10, batch_size=10)
model.save('v1_1.h5')
# ---- network definition (Keras functional API) ----
# The model has two inputs: one is run through the convolutional branch, the
# other is a flat vector concatenated with the flattened convolution output.
# NOTE(review): (23, 36) and (141,) presumably have to match the shapes of
# data.x_train and data.x_other_features_train for the Data options used
# above -- confirm whenever those options change.
conv_input_shape = (23, 36)
# Kept under its own name so that `othr_input` only ever refers to the tensor.
othr_input_shape = (141, )

conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(133, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)

othr_input = Input(shape=othr_input_shape, name='othr_input')

x = concatenate([x_conv, othr_input])
# x = Dense(1024, input_dim=(516 + 256), activation='relu')(x)
# Three identical 256-unit dense layers, each followed by dropout; only the
# dropout rate differs between them.
for dropout_rate in (0.3, 0.3, 0.2):
    x = Dense(256, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)

model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# `actual_accuracy` is not defined in this script; it is expected to come in
# through `from prepare_data import *`.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy,])
# model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
# ---- training ----
# The number of steps per Keras epoch is divided by num_fake_epoch and the
# number of epochs is multiplied by it, so progress/validation is reported
# num_fake_epoch times as often while the total number of steps stays the same.
# NOTE(review): under Python 3 the two `/` expressions below produce floats;
# confirm the installed Keras version accepts non-integer step counts.
history = model.fit_generator(
    data.generator('train', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex', content_location=''),
    data.x_train.shape[0]/(batch_size * num_fake_epoch),
    epochs=actual_epoch*num_fake_epoch,
    validation_data=data.generator('test', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex', content_location=''),
    validation_steps=data.x_test.shape[0]/(batch_size * num_fake_epoch),
    verbose=2
)

# ---- persist results ----
# Both file names are derived from `name`; workbench.xrsl lists
# "60_epoch.h5" and "60_epoch_history.pkl" as job outputs, so keep them in sync.
name = '60_epoch'
model.save(name + '.h5')
# Context manager guarantees the file is closed even if pickling fails.
with open(name + '_history.pkl', 'wb') as output:
    pickle.dump(history.history, output)

3
workbench.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Runs workbench.py with python3. The Theano settings are passed through the
# THEANO_FLAGS environment variable for this single command only.
#export KERAS_BACKEND=theano
THEANO_FLAGS='mode=FAST_RUN,device=gpu,floatX=float32,nvcc.flags=-D_FORCE_INLINES' \
    python3 workbench.py

14
workbench.xrsl Normal file
View File

@ -0,0 +1,14 @@
&
(jobName="accetuation_nn")
(executable="workbench.sh")
(inputfiles=
("workbench.py" "")("prepare_data.py" "")("content_shuffle_vector.h5" "cnn/internal_representations/inputs/content_shuffle_vector.h5")("shuffle_vector_test.h5" "cnn/internal_representations/inputs/shuffle_vector_test.h5")("shuffle_vector_train.h5" "cnn/internal_representations/inputs/shuffle_vector_train.h5")("shuffle_vector_validate.h5" "cnn/internal_representations/inputs/shuffle_vector_validate.h5")("SlovarIJS_BESEDE_utf8.lex" "data/SlovarIJS_BESEDE_utf8.lex")
)
(outputfiles=("60_epoch.h5" "")("workbench.py" "")("workbench.sh" "")("60_epoch_history.pkl" "")
)
(stdout="out.txt")
(stderr="err.txt")
(gmlog="gmlog")
(runtimeenvironment="APPS/BASE/THEANO-GPU-0.9")
(gridTime=1000)
(memory=12000)