Files renamed and ordered, plus ordered prepare_data file

This commit is contained in:
lkrsnik 2017-07-01 12:19:09 +02:00
parent f033638ee9
commit ac8f0057c5
4 changed files with 88 additions and 155 deletions

2
.gitignore vendored
View File

@ -90,4 +90,4 @@ ENV/
# Custom # Custom
data/ data/
character_based_ffnn/internal_representations/inputs/ cnn/internal_representations/inputs/

View File

@ -0,0 +1,7 @@
<component name="ProjectDictionaryState">
<dictionary name="luka">
<words>
<w>overfitting</w>
</words>
</dictionary>
</component>

View File

@ -2,8 +2,20 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" /> <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn.ipynb" afterPath="" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" /> <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/cnn_per_vowel_3epoch.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/create_and_save_inputs.py" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_5epoch.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_121sigmoid_mse_adam_a65.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/test.txt" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="" />
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch_no_overfitting.h5" afterPath="" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@ -34,8 +46,8 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true"> <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="630"> <state relative-caret-position="410">
<caret line="80" column="13" lean-forward="false" selection-start-line="80" selection-start-column="13" selection-end-line="80" selection-end-column="13" /> <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@ -174,8 +186,6 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane"> <pane id="ProjectPane">
<subPane> <subPane>
<PATH> <PATH>
@ -188,40 +198,10 @@
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT> </PATH_ELEMENT>
</PATH> </PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="character_based_ffnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="character_based_ffnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="internal_representations" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane> </subPane>
</pane> </pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
@ -1010,8 +990,8 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="630"> <state relative-caret-position="410">
<caret line="80" column="13" lean-forward="false" selection-start-line="80" selection-start-column="13" selection-end-line="80" selection-end-column="13" /> <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>

View File

@ -6,38 +6,15 @@ import numpy as np
import h5py import h5py
import gc import gc
import math import math
import copy
# functions for saving, loading and shuffling whole arrays to ram
def save_inputs(file_name, X, y):
    """Write ``X`` and ``y`` to an HDF5 file as datasets named 'X' and 'y'.

    Args:
        file_name: Path of the HDF5 file to create; it is opened in 'w'
            mode, so an existing file is overwritten.
        X: Array-like stored under the key 'X'.
        y: Array-like stored under the key 'y'.
    """
    # The context manager closes the file even when create_dataset raises,
    # which the explicit h5f.close() at the end of the function did not.
    with h5py.File(file_name, 'w') as h5f:
        for key, data in (('X', X), ('y', y)):
            h5f.create_dataset(key, data=data)
def create_and_save_inputs(file_name, part, X, y, X_pure):
    """Write ``X``, ``y`` and ``X_pure`` to the HDF5 file ``file_name + part + '.h5'``.

    Args:
        file_name: Path prefix of the output file (string).
        part: String appended to ``file_name`` before the '.h5' suffix.
        X: Array-like stored under the key 'X'.
        y: Array-like stored under the key 'y'.
        X_pure: Array-like stored under the key 'X_pure'.
    """
    # The context manager closes the file even when create_dataset raises.
    with h5py.File(file_name + part + '.h5', 'w') as h5f:
        for key, data in (('X', X), ('y', y), ('X_pure', X_pure)):
            h5f.create_dataset(key, data=data)
def create_and_save_shuffle_vector(file_name, shuffle_vector):
    """Write ``shuffle_vector`` to ``file_name + '_shuffle_vector.h5'``.

    Args:
        file_name: Path prefix of the output file (string).
        shuffle_vector: Array-like stored under the key 'shuffle_vector'.
    """
    # The context manager closes the file even when create_dataset raises.
    with h5py.File(file_name + '_shuffle_vector.h5', 'w') as h5f:
        h5f.create_dataset('shuffle_vector', data=shuffle_vector)
def load_shuffle_vector(file_name):
    """Read entries of the 'shuffle_vector' dataset from an HDF5 file.

    Args:
        file_name: Path of the HDF5 file to open read-only.

    Returns:
        The values read from the 'shuffle_vector' dataset.
    """
    # The context manager closes the file even when the read raises.
    with h5py.File(file_name, 'r') as h5f:
        # NOTE(review): only the three hard-coded positions below are read,
        # not the whole vector. This looks like a debugging leftover;
        # confirm whether `[:]` was intended before changing it.
        shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
    return shuffle_vector
def load_inputs(file_name): def load_inputs(file_name):
h5f = h5py.File(file_name,'r') h5f = h5py.File(file_name,'r')
X = h5f['X'][:] X = h5f['X'][:]
@ -46,6 +23,27 @@ def load_inputs(file_name):
h5f.close() h5f.close()
return X, y return X, y
def shuffle_inputs(X, y, X_pure=False):
    """Shuffle ``X`` and ``y`` (and optionally ``X_pure``) with one shared permutation.

    A single random permutation of ``range(X.shape[0])`` is drawn with
    ``np.random.shuffle`` and applied to every array, so rows that belonged
    together before the call still belong together afterwards.

    Args:
        X: Array indexed along its first axis.
        y: Array with the same first-axis length as ``X``.
        X_pure: Optional third array to shuffle in lockstep. Leave it at
            the default ``False`` (or pass ``None``) to shuffle only
            ``X`` and ``y``.

    Returns:
        ``(X, y)`` when ``X_pure`` is not supplied, otherwise
        ``(X, y, X_pure)``, all reordered by the same permutation.
    """
    s = np.arange(X.shape[0])
    np.random.shuffle(s)
    X = X[s]
    y = y[s]
    # Identity checks instead of `if X_pure:`: the truth value of a
    # multi-element NumPy array is ambiguous and raises ValueError.
    if X_pure is None or X_pure is False:
        return X, y
    return X, y, X_pure[s]
# functions for saving and loading partial arrays to ram
def create_and_save_inputs(file_name, part, X, y, X_pure):
    """Write ``X``, ``y`` and ``X_pure`` to the HDF5 file ``file_name + part + '.h5'``.

    Args:
        file_name: Path prefix of the output file (string).
        part: String appended to ``file_name`` before the '.h5' suffix.
        X: Array-like stored under the key 'X'.
        y: Array-like stored under the key 'y'.
        X_pure: Array-like stored under the key 'X_pure'.
    """
    # The context manager closes the file even when create_dataset raises.
    with h5py.File(file_name + part + '.h5', 'w') as h5f:
        for key, data in (('X', X), ('y', y), ('X_pure', X_pure)):
            h5f.create_dataset(key, data=data)
def load_extended_inputs(file_name, obtain_range): def load_extended_inputs(file_name, obtain_range):
h5f = h5py.File(file_name,'r') h5f = h5py.File(file_name,'r')
X = h5f['X'][obtain_range[0]:obtain_range[1]] X = h5f['X'][obtain_range[0]:obtain_range[1]]
@ -55,14 +53,34 @@ def load_extended_inputs(file_name, obtain_range):
h5f.close() h5f.close()
return X, y, X_pure return X, y, X_pure
# functions for creating and loading shuffle vector
def create_and_save_shuffle_vector(file_name, shuffle_vector):
    """Write ``shuffle_vector`` to ``file_name + '_shuffle_vector.h5'``.

    Args:
        file_name: Path prefix of the output file (string).
        shuffle_vector: Array-like stored under the key 'shuffle_vector'.
    """
    # The context manager closes the file even when create_dataset raises.
    with h5py.File(file_name + '_shuffle_vector.h5', 'w') as h5f:
        h5f.create_dataset('shuffle_vector', data=shuffle_vector)
def load_shuffle_vector(file_name):
    """Read entries of the 'shuffle_vector' dataset from an HDF5 file.

    Args:
        file_name: Path of the HDF5 file to open read-only.

    Returns:
        The values read from the 'shuffle_vector' dataset.
    """
    # The context manager closes the file even when the read raises.
    with h5py.File(file_name, 'r') as h5f:
        # NOTE(review): only the three hard-coded positions below are read,
        # not the whole vector. This looks like a debugging leftover;
        # confirm whether `[:]` was intended before changing it.
        shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
    return shuffle_vector
# functions for saving and loading model - ONLY WHERE KERAS IS NOT NEEDED
def save_model(model, file_name):
    """Write the 'W1', 'b1', 'W2' and 'b2' entries of ``model`` to an HDF5 file.

    Args:
        model: Mapping that provides the keys 'W1', 'b1', 'W2' and 'b2'.
        file_name: Path of the HDF5 file to create; it is opened in 'w'
            mode, so an existing file is overwritten.

    Raises:
        KeyError: If ``model`` lacks one of the four keys.
    """
    # The context manager closes the file even when a key is missing or
    # create_dataset raises.
    with h5py.File(file_name, 'w') as h5f:
        # Look up all four entries before writing so a missing key fails
        # before any dataset has been created.
        params = {key: model[key] for key in ('W1', 'b1', 'W2', 'b2')}
        for key, data in params.items():
            h5f.create_dataset(key, data=data)
def load_model(file_name): def load_model(file_name):
h5f = h5py.File(file_name,'r') h5f = h5py.File(file_name,'r')
model = {} model = {}
@ -73,6 +91,7 @@ def load_model(file_name):
h5f.close() h5f.close()
return model return model
# functions for creating X and y from content
def read_content(): def read_content():
print('READING CONTENT...') print('READING CONTENT...')
with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f: with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
@ -88,15 +107,15 @@ def is_vowel(word_list, position, vowels):
return True return True
return False return False
def is_accetuated_vowel(word_list, position, accetuated_vowels):
    """Return True when ``word_list[position]`` is one of ``accetuated_vowels``.

    Args:
        word_list: Indexable sequence of characters.
        position: Index into ``word_list`` of the character to test.
        accetuated_vowels: Container of characters to test membership in.

    Returns:
        bool: Result of the membership test.
    """
    return word_list[position] in accetuated_vowels
def create_dict(): def create_dict():
content = read_content() content = read_content()
print('CREATING DICTIONARY...') print('CREATING DICTIONARY...')
# CREATE dictionary AND max_word # CREATE dictionary AND max_word
@ -151,16 +170,6 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
final_position = accetuations_list[0] + max_num_vowels * accetuations_list[1] final_position = accetuations_list[0] + max_num_vowels * accetuations_list[1]
return final_position return final_position
def shuffle_inputs(X, y, X_pure=False):
    """Shuffle ``X`` and ``y`` (and optionally ``X_pure``) with one shared permutation.

    A single random permutation of ``range(X.shape[0])`` is drawn with
    ``np.random.shuffle`` and applied to every array, so rows that belonged
    together before the call still belong together afterwards.

    Args:
        X: Array indexed along its first axis.
        y: Array with the same first-axis length as ``X``.
        X_pure: Optional third array to shuffle in lockstep. Leave it at
            the default ``False`` (or pass ``None``) to shuffle only
            ``X`` and ``y``.

    Returns:
        ``(X, y)`` when ``X_pure`` is not supplied, otherwise
        ``(X, y, X_pure)``, all reordered by the same permutation.
    """
    s = np.arange(X.shape[0])
    np.random.shuffle(s)
    X = X[s]
    y = y[s]
    # Identity checks instead of `if X_pure:`: the truth value of a
    # multi-element NumPy array is ambiguous and raises ValueError.
    if X_pure is None or X_pure is False:
        return X, y
    return X, y, X_pure[s]
# def generate_inputs(): # def generate_inputs():
# dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() # dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
@ -262,29 +271,21 @@ def generate_full_matrix_inputs():
return X_train, y_train, X_validate, y_validate return X_train, y_train, X_validate, y_validate
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels): def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels):
# X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels )) y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
X = np.zeros((len(content), max_word, len(dictionary))) X = np.zeros((len(content), max_word, len(dictionary)))
X_aditional_data = []
i = 0 i = 0
for el in content: for el in content:
j = 0 j = 0
# word = []
for c in list(el[0]): for c in list(el[0]):
index = 0 index = 0
# character = np.zeros(len(dictionary))
for d in dictionary: for d in dictionary:
if c == d: if c == d:
X[i][j][index] = 1 X[i][j][index] = 1
# character[index] = 1
break break
index += 1 index += 1
# word.append(character)
j += 1 j += 1
j = 0 j = 0
# X.append(word)
word_accetuations = [] word_accetuations = []
num_vowels = 0 num_vowels = 0
for c in list(el[3]): for c in list(el[3]):
@ -299,13 +300,13 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
j += 1 j += 1
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1 y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
i += 1 i += 1
# X = np.array(X)
print('SHUFFELING INPUTS...') print('SHUFFELING INPUTS...')
X, y = shuffle_inputs(X, y) X, y = shuffle_inputs(X, y)
print('INPUTS SHUFFELED!') print('INPUTS SHUFFELED!')
return X, y return X, y
def count_vowels(content, vowels): def count_vowels(content, vowels):
num_all_vowels = 0 num_all_vowels = 0
for el in content: for el in content:
@ -314,10 +315,8 @@ def count_vowels(content, vowels):
num_all_vowels += 1 num_all_vowels += 1
return num_all_vowels return num_all_vowels
# def generate_full_vowel_matrix_inputs(name, split_number):
# Data generation for generator inputs
def generate_X_and_y_RAM_efficient(name, split_number): def generate_X_and_y_RAM_efficient(name, split_number):
h5f = h5py.File(name + '.h5', 'w') h5f = h5py.File(name + '.h5', 'w')
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
@ -332,14 +331,8 @@ def generate_X_and_y_RAM_efficient(name, split_number):
maxshape=(num_all_vowels,), maxshape=(num_all_vowels,),
dtype=np.uint8) dtype=np.uint8)
gc.collect() gc.collect()
# print (2018553 * max_word * len(dictionary) / (2**30.0))
print('GENERATING X AND y...') print('GENERATING X AND y...')
# X = np.zeros((len(content), max_word*len(dictionary)))
# y = np.zeros((len(content), max_num_vowels * max_num_vowels))
# X = np.zeros((2018553, max_word, len(dictionary)))
X_pure = [] X_pure = []
X = [] X = []
y = [] y = []
@ -373,12 +366,6 @@ def generate_X_and_y_RAM_efficient(name, split_number):
if current_part_generation * part_len <= i: if current_part_generation * part_len <= i:
print('Saving part '+ str(current_part_generation)) print('Saving part '+ str(current_part_generation))
# create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
# adict = dict(X=np.array(X), y=np.zeros(len(X)), X_pure=np.array(X_pure))
# for k, v in adict.items():
# h5f.create_dataset(k, data=v)
# print (len(np.array(X)))
data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X) data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X)
data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y) data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y)
data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure) data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure)
@ -394,39 +381,18 @@ def generate_X_and_y_RAM_efficient(name, split_number):
num_all_vowels += 1 num_all_vowels += 1
if i%10000 == 0: if i%10000 == 0:
print(i) print(i)
# text_file.write("Purchase Amount: %s" % TotalAmount)
j = 0
# X.append(word)
# word_accetuations = []
# num_vowels = 0
# for c in list(el[3]):
# index = 0
# if is_vowel(el[3], j, vowels):
# num_vowels += 1
# for d in accetuated_vowels:
# if c == d:
# word_accetuations.append(num_vowels)
# break
# index += 1
# j += 1
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
i += 1 i += 1
print('Saving part ' + str(current_part_generation)) print('Saving part ' + str(current_part_generation))
# create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
data_X[old_num_all_vowels:num_all_vowels] = np.array(X) data_X[old_num_all_vowels:num_all_vowels] = np.array(X)
data_y[old_num_all_vowels:num_all_vowels] = np.array(y) data_y[old_num_all_vowels:num_all_vowels] = np.array(y)
data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure) data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure)
# adict = dict(X=X, y=y, X_pure=X_pure)
# for k, v in adict.items():
# h5f.create_dataset(k, data=v)
h5f.close() h5f.close()
# generator for inputs
def generate_arrays_from_file(path, batch_size): def generate_arrays_from_file(path, batch_size):
h5f = h5py.File(path, 'r') h5f = h5py.File(path, 'r')
@ -446,25 +412,15 @@ def generate_arrays_from_file(path, batch_size):
h5f.close() h5f.close()
# shuffle inputs for generator
def shuffle_full_vowel_inputs(name, orderd_name, parts): def shuffle_full_vowel_inputs(name, orderd_name, parts):
# internal_representations/inputs/X_ordered_part
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
num_all_vowels = count_vowels(content, vowels) num_all_vowels = count_vowels(content, vowels)
num_all_vowels = 12 # num_all_vowels = 12
s = np.arange(num_all_vowels) s = np.arange(num_all_vowels)
np.random.shuffle(s) np.random.shuffle(s)
# create_and_save_shuffle_vector(name, s)
# s = load_shuffle_vector('internal_representations/inputs/X_shuffled_part_shuffle_vector.h5')
# try:
# h5f.close()
# except Exception, e:
# pass
h5f = h5py.File(name, 'w') h5f = h5py.File(name, 'w')
data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)), data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)),
@ -491,9 +447,6 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
for i in range(1, parts+1): for i in range(1, parts+1):
X, y, X_pure = load_extended_inputs(orderd_name, targeted_range) X, y, X_pure = load_extended_inputs(orderd_name, targeted_range)
for j in range(X.shape[0]): for j in range(X.shape[0]):
# print targeted_range[0]
# print targeted_range[1]
# print s[j]
if s[j + targeted_range[0]] >= section_range[0] and s[j + targeted_range[0]] < section_range[1]: if s[j + targeted_range[0]] >= section_range[0] and s[j + targeted_range[0]] < section_range[1]:
# print 's[j] ' + str(s[j + targeted_range[0]]) + ' section_range[0] ' + str(section_range[0]) + ' section_range[1] ' + str(section_range[1]) # print 's[j] ' + str(s[j + targeted_range[0]]) + ' section_range[0] ' + str(section_range[0]) + ' section_range[1] ' + str(section_range[1])
new_X[s[j + targeted_range[0]] - section_range[0]] = X[j] new_X[s[j + targeted_range[0]] - section_range[0]] = X[j]
@ -506,15 +459,6 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
targeted_range[1] = num_all_vowels targeted_range[1] = num_all_vowels
del X, y, X_pure del X, y, X_pure
print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX') print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX')
# create_and_save_inputs(name, str(h), new_X, new_y, new_X_pure)
# a =
# print (a.shape)
# print s
# for el in np.array(new_X):
# print el
# print 'new_X ' + str(new_X) + ' section_range[0] ' + str(section_range[0]) + ' section_range[1] ' + str(section_range[1])
# print new_X.shape
# print type(new_X)
data_X[section_range[0]:section_range[1]] = new_X data_X[section_range[0]:section_range[1]] = new_X
data_y[section_range[0]:section_range[1]] = new_y data_y[section_range[0]:section_range[1]] = new_y
data_X_pure[section_range[0]:section_range[1]] = new_X_pure data_X_pure[section_range[0]:section_range[1]] = new_X_pure
@ -528,8 +472,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
h5f.close() h5f.close()
# Decoders for inputs and outputs
def decode_position(y, max_num_vowels): def decode_position(y, max_num_vowels):
max_el = 0 max_el = 0
i = 0 i = 0
@ -541,6 +484,7 @@ def decode_position(y, max_num_vowels):
i += 1 i += 1
return [pos % max_num_vowels, pos / max_num_vowels] return [pos % max_num_vowels, pos / max_num_vowels]
def decode_input(word_encoded, dictionary): def decode_input(word_encoded, dictionary):
word = '' word = ''
for el in word_encoded: for el in word_encoded:
@ -570,6 +514,7 @@ def generate_input_from_word(word, max_word, dictionary):
j += 1 j += 1
return x return x
def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels): def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
X_el = np.zeros((max_word, len(dictionary))) X_el = np.zeros((max_word, len(dictionary)))
j = 0 j = 0
@ -592,6 +537,7 @@ def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
vowel_i += 1 vowel_i += 1
return np.array(X), np.array(X_pure) return np.array(X), np.array(X_pure)
def decode_position_from_vowel_to_final_number(y): def decode_position_from_vowel_to_final_number(y):
res = [] res = []
for i in range(len(y)): for i in range(len(y)):
@ -600,6 +546,7 @@ def decode_position_from_vowel_to_final_number(y):
return res return res
# split content so that there is no overfitting
def split_content(content, ratio): def split_content(content, ratio):
expanded_content = [el[1] if el[1] != '=' else el[0] for el in content] expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
# print(len(content)) # print(len(content))
@ -609,7 +556,6 @@ def split_content(content, ratio):
np.random.shuffle(s) np.random.shuffle(s)
split_num = math.floor(len(unique_content) * ratio) split_num = math.floor(len(unique_content) * ratio)
validate_content = []
shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num] shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num]
shuffled_unique_train_content_set = set(shuffled_unique_train_content) shuffled_unique_train_content_set = set(shuffled_unique_train_content)