diff --git a/prepare_data.py b/prepare_data.py
index 0973a84..4c12ed8 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -22,7 +22,7 @@ from keras.models import load_model
 class Data:
     def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
                  additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
-                 convert_multext=True, bidirectional_basic_input=False):
+                 convert_multext=True, bidirectional_basic_input=False, bidirectional_architectural_input=False):
         self._input_type = input_type
         self._save_generated_data = save_generated_data
         self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@@ -33,14 +33,18 @@ class Data:
         self._number_of_syllables = number_of_syllables
         self._convert_multext = convert_multext
         self._bidirectional_basic_input = bidirectional_basic_input
+        self._bidirectional_architectural_input = bidirectional_architectural_input

         self.x_train = None
+        # self.x2_train = None
         self.x_other_features_train = None
         self.y_train = None
         self.x_test = None
+        # self.x2_test = None
         self.x_other_features_test = None
         self.y_test = None
         self.x_validate = None
+        # self.x2_validate = None
         self.x_other_features_validate = None
         self.y_validate = None
@@ -63,15 +67,11 @@ class Data:
         shuffle_vector_path = '{}{}'.format(inputs_location, shuffle_vector)

         # actual generation of inputs
-        self._generate_inputs(content_path, content_shuffle_vector_path, shuffle_vector_path, test_and_validation_size)
+        self._generate_inputs(content_path, content_shuffle_vector_path, shuffle_vector_path, test_and_validation_size, train_path, test_path,
+                              validate_path)

-        # save inputs
-        if self._save_generated_data:
-            self._save_inputs(train_path, self.x_train, self.x_other_features_train, self.y_train)
-            self._save_inputs(test_path, self.x_test, self.x_other_features_test, self.y_test)
-            self._save_inputs(validate_path, self.x_validate, self.x_other_features_validate, self.y_validate)
-
-    def _generate_inputs(self, content_location, content_shuffle_vector_location, shuffle_vector_location, test_and_validation_size):
+    def _generate_inputs(self, content_location, content_shuffle_vector_location, shuffle_vector_location, test_and_validation_size, train_path,
+                         test_path, validate_path):
         print('READING CONTENT...')
         content = self._read_content(content_location)
         print('CONTENT READ SUCCESSFULLY')
@@ -97,6 +97,13 @@ class Data:
                                                  accented_vowels, feature_dictionary, shuffle_vector_location + '_validate.h5')
         print('GENERATION SUCCESSFUL!')
+
+        # save inputs
+        if self._save_generated_data:
+            self._save_inputs(train_path, self.x_train, self.x_other_features_train, self.y_train)
+            self._save_inputs(test_path, self.x_test, self.x_other_features_test, self.y_test)
+            self._save_inputs(validate_path, self.x_validate, self.x_other_features_validate, self.y_validate)
+
         # return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate

     # functions for creating X and y from content
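
Note on the hunks above: the new `bidirectional_architectural_input` flag is only stored by the constructor; it takes effect in the batch generator further down, where the input matrix is split in two. A minimal sketch of a constructor call exercising both flags, mirroring the workbench.py change at the end of this patch:

    # Sketch only: bidirectional_basic_input doubles the letter matrix to
    # 2 * max_word rows; bidirectional_architectural_input makes the generator
    # hand that matrix to the model as two halves (see the np.hsplit hunk below).
    data = Data('l', bidirectional_basic_input=True, bidirectional_architectural_input=True)
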
@@ -179,7 +186,7 @@ class Data:
         h5f.close()
         return shuffle_vector

-    def _x_letter_input(self, content, dictionary, max_word, vowels):
+    def _x_letter_input(self, content, dictionary, max_word, vowels, shuffle_vector_location):
         if self._additional_letter_attributes:
             if not self._bidirectional_basic_input:
                 x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
@@ -196,9 +203,18 @@ class Data:
             else:
                 x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)

-        i = 0
-        for el in content:
-            word = el[0]
+        if self._shuffle_all_inputs:
+            s = self._load_shuffle_vector(shuffle_vector_location, len(content))
+        else:
+            s = None
+
+        # i = 0
+        for i in range(len(content)):
+            if self._shuffle_all_inputs:
+                mod_i = s[i]
+            else:
+                mod_i = i
+            word = content[mod_i][0]
             if self._reverse_inputs:
                 word = word[::-1]
             j = 0
@@ -242,7 +258,7 @@ class Data:
                     if self._bidirectional_basic_input:
                         x[i][j2][len(dictionary) + 5] = 1
                 j += 1
-            i += 1
+            #i += 1
         return x

     def _x_syllable_input(self, content, dictionary, max_num_vowels, vowels):
@@ -266,11 +282,19 @@ class Data:
             i += 1
         return x

-    def _y_output(self, content, max_num_vowels, vowels, accentuated_vowels):
+    def _y_output(self, content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location):
         y = np.zeros((len(content), max_num_vowels))
         i = 0
-
-        for el in content:
+        if self._shuffle_all_inputs:
+            s = self._load_shuffle_vector(shuffle_vector_location, len(content))
+        else:
+            s = None
+        for i in range(len(content)):
+            if self._shuffle_all_inputs:
+                mod_i = s[i]
+            else:
+                mod_i = i
+            el = content[mod_i]
             word = el[3]
             if self._reverse_inputs:
                 word = word[::-1]
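
The two hunks above, like the `_create_x_features` hunk below, replace after-the-fact shuffling with permutation indexing: each builder loads the same stored shuffle vector and reads `content[s[i]]`, so x, y and the extra features come out row-aligned without a separate `_shuffle_inputs` pass. A minimal sketch of the pattern, with `np.random.permutation` standing in for the h5-backed `_load_shuffle_vector`:

    import numpy as np

    content = ['beseda', 'naglas', 'zlog', 'samoglasnik']  # stand-in corpus
    s = np.random.permutation(len(content))  # plays the role of the stored shuffle vector

    x = []
    for i in range(len(content)):
        mod_i = s[i]              # visit rows in shuffled order
        x.append(content[mod_i])  # any builder reusing the same s emits rows in the
                                  # same order, keeping x, y and features aligned
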
@@ -292,27 +316,26 @@ class Data:
                 if self._is_vowel(word, j, vowels):
                     num_vowels += 1
                 j += 1
-            i += 1
         return y

     # Generate each y as an array of 11 numbers (with possible values between 0 and 1)
     def _generate_x_and_y(self, dictionary, max_word, max_num_vowels, content, vowels, accentuated_vowels, feature_dictionary, shuffle_vector_location):
         if self._input_type == 'l':
-            x = self._x_letter_input(content, dictionary, max_word, vowels)
+            x = self._x_letter_input(content, dictionary, max_word, vowels, shuffle_vector_location)
         elif self._input_type == 's' or self._input_type == 'sl':
             x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
         else:
             raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')

-        y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
+        y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location)

         # print('CREATING OTHER FEATURES...')
-        x_other_features = self._create_x_features(content, feature_dictionary, vowels)
+        x_other_features = self._create_x_features(content, feature_dictionary, vowels, shuffle_vector_location)
         # print('OTHER FEATURES CREATED!')

         if self._shuffle_all_inputs:
             print('SHUFFELING INPUTS...')
-            x, x_other_features, y = self._shuffle_inputs(x, x_other_features, y, shuffle_vector_location)
+            #x, x_other_features, y = self._shuffle_inputs(x, x_other_features, y, shuffle_vector_location)
             print('INPUTS SHUFFELED!')
         return x, x_other_features, y
@@ -390,10 +413,19 @@ class Data:
             split = min(split_options, key=lambda x: x[1])
         return consonants[:split[0] + 1], consonants[split[0] + 1:]

-    def _create_x_features(self, content, feature_dictionary, vowels):
+    def _create_x_features(self, content, feature_dictionary, vowels, shuffle_vector_location):
         content = content
         x_other_features = []
-        for el in content:
+        if self._shuffle_all_inputs:
+            s = self._load_shuffle_vector(shuffle_vector_location, len(content))
+        else:
+            s = None
+        for index in range(len(content)):
+            if self._shuffle_all_inputs:
+                mod_i = s[index]
+            else:
+                mod_i = index
+            el = content[mod_i]
             x_el_other_features = []
             if self._convert_multext:
                 converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
@@ -587,9 +619,17 @@ class Data:
             else:
                 while loc < size:
                     if loc + batch_size >= size:
-                        yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
+                        if self._bidirectional_architectural_input:
+                            split_orig_x = np.hsplit(orig_x[loc:size], 2)
+                            yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:size]], orig_y[loc:size])
+                        else:
+                            yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
                     else:
-                        yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                        if self._bidirectional_architectural_input:
+                            split_orig_x = np.hsplit(orig_x[loc:loc + batch_size], 2)
+                            yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                        else:
+                            yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
                     loc += batch_size

     # generator for inputs for tracking of data fitting
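
With `bidirectional_architectural_input` set, the generator hunk above yields the basic input as two halves cut by `np.hsplit`, which for 2-D and higher arrays splits along axis 1. Each bidirectional letter sample has shape (2 * max_word, features), so the two halves of the doubled letter matrix feed the two convolutional branches defined in workbench.py below. A quick shape check, assuming max_word = 23 and 36 letter features as in workbench.py:

    import numpy as np

    batch = np.zeros((16, 2 * 23, 36))  # (batch_size, 2 * max_word, features)
    left, right = np.hsplit(batch, 2)   # np.hsplit cuts along axis 1
    assert left.shape == (16, 23, 36) and right.shape == (16, 23, 36)
    # the generator yields [left, right, other_features], matching the model's
    # inputs [conv_input, conv_input2, othr_input]
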
diff --git a/workbench.py b/workbench.py
index 0eb7081..5de3e69 100644
--- a/workbench.py
+++ b/workbench.py
@@ -27,29 +27,16 @@ from prepare_data import *
 # save_inputs('../../internal_representations/inputs/shuffeled_matrix_validate_inputs_other_features_output_11.h5', X_validate, y_validate, other_features = X_other_features_validate)
 # X_train, X_other_features_train, y_train = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_train_inputs_other_features_output_11.h5', other_features=True)
 # X_validate, X_other_features_validate, y_validate = load_inputs('cnn/internal_representations/inputs/shuffeled_matrix_validate_inputs_other_features_output_11.h5', other_features=True)
-# letters
-# data = Data('l', save_generated_data=False, number_of_syllables=True)
-
-# syllabled letters
-data = Data('s', save_generated_data=False, accent_classification=True)
-data.generate_data('letters_word_accetuation_train',
-                   'letters_word_accetuation_test',
-                   'letters_word_accetuation_validate', content_name='SlovarIJS_BESEDE_utf8.lex',
+data = Data('l', bidirectional_basic_input=True, bidirectional_architectural_input=True)
+data.generate_data('letters_word_accetuation_bidirectional_train',
+                   'letters_word_accetuation_bidirectional_test',
+                   'letters_word_accetuation_bidirectional_validate', content_name='SlovarIJS_BESEDE_utf8.lex',
                    content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
                    inputs_location='', content_location='')

-# concatenate test and train data
-# data.x_train = np.concatenate((data.x_train, data.x_test), axis=0)
-# data.x_other_features_train = np.concatenate((data.x_other_features_train, data.x_other_features_test), axis=0)
-# data.y_train = np.concatenate((data.y_train, data.y_test), axis=0)
-
-# concatenate all data
-data.x_train = np.concatenate((data.x_train, data.x_test, data.x_validate), axis=0)
-data.x_other_features_train = np.concatenate((data.x_other_features_train, data.x_other_features_test, data.x_other_features_validate), axis=0)
-data.y_train = np.concatenate((data.y_train, data.y_test, data.y_validate), axis=0)

 num_examples = len(data.x_train)  # training set size
-nn_output_dim = 13
+nn_output_dim = 10
 nn_hdim = 516
 batch_size = 16
 # actual_epoch = 1
@@ -57,32 +44,28 @@ actual_epoch = 20
 # num_fake_epoch = 2
 num_fake_epoch = 20

-# letters
-# conv_input_shape=(23, 36)
-
-# syllabled letters
-# conv_input_shape=(10, 252)
-# syllables
-conv_input_shape=(10, 5168)
-# othr_input = (140, )
-othr_input = (150, )
+conv_input_shape=(23, 36)
+othr_input = (140, )

 conv_input = Input(shape=conv_input_shape, name='conv_input')
-# letters
-# x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
-# x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
-
-# syllabled letters
-x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
+x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
+x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
 x_conv = MaxPooling1D(pool_size=2)(x_conv)
 x_conv = Flatten()(x_conv)

+conv_input2 = Input(shape=conv_input_shape, name='conv_input2')
+x_conv2 = Conv1D(115, (3), padding='same', activation='relu')(conv_input2)
+x_conv2 = Conv1D(46, (3), padding='same', activation='relu')(x_conv2)
+x_conv2 = MaxPooling1D(pool_size=2)(x_conv2)
+x_conv2 = Flatten()(x_conv2)
+
 # x_conv = Dense(516, activation='relu', kernel_constraint=maxnorm(3))(x_conv)

 othr_input = Input(shape=othr_input, name='othr_input')

-x = concatenate([x_conv, othr_input])
+x = concatenate([x_conv, x_conv2, othr_input])
 # x = Dense(1024, input_dim=(516 + 256), activation='relu')(x)
 x = Dense(256, activation='relu')(x)
 x = Dropout(0.3)(x)
@@ -95,7 +78,7 @@ x = Dense(nn_output_dim, activation='sigmoid')(x)

-model = Model(inputs=[conv_input, othr_input], outputs=x)
+model = Model(inputs=[conv_input, conv_input2, othr_input], outputs=x)
 opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
 model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy,])
 # model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
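
For the restored letters configuration, a shape walk-through of one conv branch (Keras defaults assumed: 'same' padding keeps the length at 23, MaxPooling1D truncates to floor(23 / 2)):

    # Input                             (23, 36)
    # Conv1D(115, 3, padding='same') -> (23, 115)
    # Conv1D(46, 3, padding='same')  -> (23, 46)
    # MaxPooling1D(pool_size=2)      -> (11, 46)
    # Flatten()                      -> 506
    # concatenate of both branches plus othr_input: 506 + 506 + 140 = 1152
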
@@ -104,10 +87,12 @@ model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accurac

 history = model.fit_generator(data.generator('train', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex', content_location=''),
                               data.x_train.shape[0]/(batch_size * num_fake_epoch),
                               epochs=actual_epoch*num_fake_epoch,
+                              validation_data=data.generator('test', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex', content_location=''),
+                              validation_steps=data.x_test.shape[0]/(batch_size * num_fake_epoch),
                               verbose=2
                               )

-name = '40_epoch'
+name = '20_epoch'
 model.save(name + '.h5')
 output = open(name + '_history.pkl', 'wb')
 pickle.dump(history.history, output)
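
The added validation_data/validation_steps follow the same "fake epoch" scheme as training: each Keras epoch covers only 1/num_fake_epoch of the data, so validation now runs after every such slice. A sanity check of the scheduling arithmetic, with a hypothetical corpus size standing in for data.x_train.shape[0]:

    batch_size, actual_epoch, num_fake_epoch = 16, 20, 20
    n_train = 320000  # hypothetical; the real value is data.x_train.shape[0]
    steps_per_epoch = n_train / (batch_size * num_fake_epoch)
    epochs = actual_epoch * num_fake_epoch
    # total samples seen = steps * batch * epochs = n_train * actual_epoch,
    # i.e. the full training set is still traversed actual_epoch (= 20) times
    assert steps_per_epoch * batch_size * epochs == n_train * actual_epoch
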