Managing validation data to counter overfitting

This commit is contained in:
lkrsnik 2017-06-27 11:40:56 +02:00
parent 0e3dd4f88c
commit 3c9edd5d1c
2 changed files with 148 additions and 93 deletions

View File

@ -4,6 +4,7 @@
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@ -34,8 +35,8 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true"> <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54"> <state relative-caret-position="266">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" /> <caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@ -132,6 +133,7 @@
<find>load_extended_inputs</find> <find>load_extended_inputs</find>
<find>create_and_save_shuffle_vector</find> <find>create_and_save_shuffle_vector</find>
<find>h5f</find> <find>h5f</find>
<find>generate_inputs</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -171,6 +173,7 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="Scratches" />
<pane id="Scope" /> <pane id="Scope" />
<pane id="ProjectPane"> <pane id="ProjectPane">
<subPane> <subPane>
@ -218,7 +221,6 @@
</PATH> </PATH>
</subPane> </subPane>
</pane> </pane>
<pane id="Scratches" />
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
@ -951,8 +953,8 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54"> <state relative-caret-position="266">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" /> <caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>

View File

@ -5,6 +5,7 @@ from __future__ import unicode_literals
import numpy as np import numpy as np
import h5py import h5py
import gc import gc
import math
import StringIO import StringIO
import copy import copy
@ -75,7 +76,7 @@ def load_model(file_name):
def read_content(): def read_content():
print('READING CONTENT...') print('READING CONTENT...')
with open('../data/SlovarIJS_BESEDE_utf8.lex') as f: with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
content = f.readlines() content = f.readlines()
print('CONTENT READ SUCCESSFULY') print('CONTENT READ SUCCESSFULY')
return [x.decode('utf8').split('\t') for x in content] return [x.decode('utf8').split('\t') for x in content]
@ -159,97 +160,98 @@ def shuffle_inputs(X, y, X_pure):
X_pure = X_pure[s] X_pure = X_pure[s]
return X, y, X_pure return X, y, X_pure
def generate_inputs(): # def generate_inputs():
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() # dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
print('GENERATING X AND y...') # print('GENERATING X AND y...')
X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
i = 0
for el in content:
j = 0
for c in list(el[0]):
index = 0
for d in dictionary:
if c == d:
X[i][index + j * max_word] = 1
break
index += 1
j += 1
j = 0
word_accetuations = []
num_vowels = 0
for c in list(el[3]):
index = 0
if is_vowel(el[3], j, vowels):
num_vowels += 1
for d in accetuated_vowels:
if c == d:
word_accetuations.append(num_vowels)
break
index += 1
j += 1
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
i += 1
print('GENERATION SUCCESSFUL!')
print('SHUFFELING INPUTS...')
X, y = shuffle_inputs(X, y)
print('INPUTS SHUFFELED!')
return X, y
def generate_matrix_inputs():
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
print('GENERATING X AND y...')
# X = np.zeros((len(content), max_word*len(dictionary))) # X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels )) # y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
X = [] # i = 0
# for el in content:
i = 0
for el in content:
# j = 0 # j = 0
word = [] # for c in list(el[0]):
for c in list(el[0]): # index = 0
index = 0 # for d in dictionary:
character = np.zeros(len(dictionary)) # if c == d:
for d in dictionary:
if c == d:
# X[i][index + j * max_word] = 1 # X[i][index + j * max_word] = 1
character[index] = 1 # break
break # index += 1
index += 1
word.append(character)
# j += 1 # j += 1
j = 0 # j = 0
X.append(word) # word_accetuations = []
word_accetuations = [] # num_vowels = 0
num_vowels = 0 # for c in list(el[3]):
for c in list(el[3]): # index = 0
index = 0 # if is_vowel(el[3], j, vowels):
if is_vowel(el[3], j, vowels): # num_vowels += 1
num_vowels += 1 # for d in accetuated_vowels:
for d in accetuated_vowels: # if c == d:
if c == d: # word_accetuations.append(num_vowels)
word_accetuations.append(num_vowels) # break
break # index += 1
index += 1 # j += 1
j += 1 # y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1 # i += 1
i += 1 # print('GENERATION SUCCESSFUL!')
X = np.array(X) # print('SHUFFELING INPUTS...')
print('GENERATION SUCCESSFUL!') # X, y = shuffle_inputs(X, y)
print('SHUFFELING INPUTS...') # print('INPUTS SHUFFELED!')
X, y = shuffle_inputs(X, y) # return X, y
print('INPUTS SHUFFELED!') #
return X, y #
# def generate_matrix_inputs():
# dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
# print('GENERATING X AND y...')
# # X = np.zeros((len(content), max_word*len(dictionary)))
# y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
# X = []
#
# i = 0
# for el in content:
# # j = 0
# word = []
# for c in list(el[0]):
# index = 0
# character = np.zeros(len(dictionary))
# for d in dictionary:
# if c == d:
# # X[i][index + j * max_word] = 1
# character[index] = 1
# break
# index += 1
# word.append(character)
# # j += 1
# j = 0
# X.append(word)
# word_accetuations = []
# num_vowels = 0
# for c in list(el[3]):
# index = 0
# if is_vowel(el[3], j, vowels):
# num_vowels += 1
# for d in accetuated_vowels:
# if c == d:
# word_accetuations.append(num_vowels)
# break
# index += 1
# j += 1
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
# i += 1
# X = np.array(X)
# print('GENERATION SUCCESSFUL!')
# print('SHUFFELING INPUTS...')
# X, y = shuffle_inputs(X, y)
# print('INPUTS SHUFFELED!')
# return X, y
def generate_full_matrix_inputs(): def generate_full_matrix_inputs():
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
print('GENERATING X AND y...') print('GENERATING X AND y...')
# X = np.zeros((len(content), max_word*len(dictionary))) # X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels )) y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
@ -553,3 +555,54 @@ def generate_input_from_word(word, max_word, dictionary):
index += 1 index += 1
j += 1 j += 1
return x return x
def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
    """Build one network input per vowel of ``word``.

    The word is one-hot encoded into a ``(max_word, len(dictionary))``
    matrix: row ``r`` has a 1 in the column of the first ``dictionary``
    entry equal to ``word[r]``; characters that are not in ``dictionary``
    leave their row all zeros.  That same matrix is then emitted once for
    every position that ``is_vowel`` reports as a vowel.

    :param word: the word to encode; must not be longer than ``max_word``
        (a longer word raises ``IndexError`` when its row is written).
    :param max_word: number of rows of the encoding matrix.
    :param dictionary: ordered collection of known characters.
    :param vowels: passed through unchanged to ``is_vowel``.
    :return: ``(X, X_pure)`` where ``X`` stacks one copy of the encoded
        word per vowel and ``X_pure`` holds the 0-based ordinal of each
        vowel (0 for the first vowel, 1 for the second, ...).
    """
    X_el = np.zeros((max_word, len(dictionary)))
    for row, char in enumerate(word):
        for col, symbol in enumerate(dictionary):
            if char == symbol:
                X_el[row][col] = 1
                break

    # Built once instead of on every iteration.
    # NOTE(review): assumes is_vowel does not mutate the list it is given.
    characters = list(word)

    X = []
    X_pure = []
    vowel_i = 0
    for position in range(len(characters)):
        if is_vowel(characters, position, vowels):
            X.append(X_el)
            X_pure.append(vowel_i)
            # vowel_i counts vowels only, so it advances inside this branch.
            vowel_i += 1
    return np.array(X), np.array(X_pure)
def decode_position_from_vowel_to_final_number(y):
    """Return the 1-based indices of the rows of ``y`` that are switched on.

    A row counts as switched on when its first element is strictly greater
    than 0.5.

    :param y: sequence of rows (lists or a 2-D array); only element 0 of
        each row is inspected.
    :return: list of 1-based row numbers, in ascending order.
    """
    return [number for number, row in enumerate(y, 1) if row[0] > 0.5]
def split_content(content, ratio):
    """Randomly split ``content`` into training and validation entries.

    Entries are grouped by a key: field 1 of the entry, or field 0 when
    field 1 is ``'='`` (presumably the lemma, with ``'='`` meaning "same as
    the word form" -- confirm against the lexicon format).  Whole groups go
    to one side or the other, so entries sharing a key never end up on both
    sides of the split.

    :param content: list of entries, each an indexable with at least two
        fields.
    :param ratio: fraction of the *unique keys* (not of the entries) that
        is assigned to validation.
    :return: ``(train_content, validate_content)``; both keep the relative
        order of ``content``.

    The assignment draws from NumPy's global random state, so seed
    ``np.random`` beforehand for a reproducible split.
    """
    keys = [el[1] if el[1] != '=' else el[0] for el in content]
    unique_keys = sorted(set(keys))

    # Give every unique key a random rank; ranks below split_num go to
    # validation, the rest to training.
    ranks = np.arange(len(unique_keys))
    np.random.shuffle(ranks)
    split_num = math.floor(len(unique_keys) * ratio)
    validate_keys = set(
        key for key, rank in zip(unique_keys, ranks) if rank < split_num
    )

    train_content = []
    validate_content = []
    for el, key in zip(content, keys):
        if key in validate_keys:
            validate_content.append(el)
        else:
            train_content.append(el)
    return train_content, validate_content