Managing validation data to counter overfitting

parent 0e3dd4f88c
commit 3c9edd5d1c
.idea/workspace.xml

@@ -4,6 +4,7 @@
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" />
+      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -34,8 +35,8 @@
     <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
       <entry file="file://$PROJECT_DIR$/prepare_data.py">
         <provider selected="true" editor-type-id="text-editor">
-          <state relative-caret-position="54">
-            <caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
+          <state relative-caret-position="266">
+            <caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
             <folding>
               <element signature="e#24#63#0" expanded="true" />
             </folding>
@@ -132,6 +133,7 @@
       <find>load_extended_inputs</find>
       <find>create_and_save_shuffle_vector</find>
       <find>h5f</find>
+      <find>generate_inputs</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -171,6 +173,7 @@
       <foldersAlwaysOnTop value="true" />
     </navigator>
     <panes>
+      <pane id="Scratches" />
       <pane id="Scope" />
       <pane id="ProjectPane">
         <subPane>
@@ -218,7 +221,6 @@
           </PATH>
         </subPane>
       </pane>
-      <pane id="Scratches" />
     </panes>
   </component>
   <component name="PropertiesComponent">
@@ -951,8 +953,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/prepare_data.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="54">
-          <caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
+        <state relative-caret-position="266">
+          <caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
           <folding>
             <element signature="e#24#63#0" expanded="true" />
           </folding>
prepare_data.py (229 changed lines)
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
 import numpy as np
 import h5py
 import gc
+import math
 import StringIO
 import copy
 
@@ -75,7 +76,7 @@ def load_model(file_name):
 
 def read_content():
     print('READING CONTENT...')
-    with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
+    with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
         content = f.readlines()
     print('CONTENT READ SUCCESSFULY')
     return [x.decode('utf8').split('\t') for x in content]
@@ -159,97 +160,98 @@ def shuffle_inputs(X, y, X_pure):
     X_pure = X_pure[s]
     return X, y, X_pure
 
-def generate_inputs():
-    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
-
-    print('GENERATING X AND y...')
-    X = np.zeros((len(content), max_word*len(dictionary)))
-    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
-
-    i = 0
-    for el in content:
-        j = 0
-        for c in list(el[0]):
-            index = 0
-            for d in dictionary:
-                if c == d:
-                    X[i][index + j * max_word] = 1
-                    break
-                index += 1
-            j += 1
-        j = 0
-        word_accetuations = []
-        num_vowels = 0
-        for c in list(el[3]):
-            index = 0
-            if is_vowel(el[3], j, vowels):
-                num_vowels += 1
-            for d in accetuated_vowels:
-                if c == d:
-                    word_accetuations.append(num_vowels)
-                    break
-                index += 1
-            j += 1
-        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
-        i += 1
-    print('GENERATION SUCCESSFUL!')
-    print('SHUFFELING INPUTS...')
-    X, y = shuffle_inputs(X, y)
-    print('INPUTS SHUFFELED!')
-    return X, y
-
-
-def generate_matrix_inputs():
-    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
-
-    print('GENERATING X AND y...')
-    # X = np.zeros((len(content), max_word*len(dictionary)))
-    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
-
-    X = []
-
-    i = 0
-    for el in content:
-        # j = 0
-        word = []
-        for c in list(el[0]):
-            index = 0
-            character = np.zeros(len(dictionary))
-            for d in dictionary:
-                if c == d:
-                    # X[i][index + j * max_word] = 1
-                    character[index] = 1
-                    break
-                index += 1
-            word.append(character)
-            # j += 1
-        j = 0
-        X.append(word)
-        word_accetuations = []
-        num_vowels = 0
-        for c in list(el[3]):
-            index = 0
-            if is_vowel(el[3], j, vowels):
-                num_vowels += 1
-            for d in accetuated_vowels:
-                if c == d:
-                    word_accetuations.append(num_vowels)
-                    break
-                index += 1
-            j += 1
-        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
-        i += 1
-    X = np.array(X)
-    print('GENERATION SUCCESSFUL!')
-    print('SHUFFELING INPUTS...')
-    X, y = shuffle_inputs(X, y)
-    print('INPUTS SHUFFELED!')
-    return X, y
+# def generate_inputs():
+#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+#
+#     print('GENERATING X AND y...')
+#     X = np.zeros((len(content), max_word*len(dictionary)))
+#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
+#
+#     i = 0
+#     for el in content:
+#         j = 0
+#         for c in list(el[0]):
+#             index = 0
+#             for d in dictionary:
+#                 if c == d:
+#                     X[i][index + j * max_word] = 1
+#                     break
+#                 index += 1
+#             j += 1
+#         j = 0
+#         word_accetuations = []
+#         num_vowels = 0
+#         for c in list(el[3]):
+#             index = 0
+#             if is_vowel(el[3], j, vowels):
+#                 num_vowels += 1
+#             for d in accetuated_vowels:
+#                 if c == d:
+#                     word_accetuations.append(num_vowels)
+#                     break
+#                 index += 1
+#             j += 1
+#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
+#         i += 1
+#     print('GENERATION SUCCESSFUL!')
+#     print('SHUFFELING INPUTS...')
+#     X, y = shuffle_inputs(X, y)
+#     print('INPUTS SHUFFELED!')
+#     return X, y
+#
+#
+# def generate_matrix_inputs():
+#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+#
+#     print('GENERATING X AND y...')
+#     # X = np.zeros((len(content), max_word*len(dictionary)))
+#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
+#
+#     X = []
+#
+#     i = 0
+#     for el in content:
+#         # j = 0
+#         word = []
+#         for c in list(el[0]):
+#             index = 0
+#             character = np.zeros(len(dictionary))
+#             for d in dictionary:
+#                 if c == d:
+#                     # X[i][index + j * max_word] = 1
+#                     character[index] = 1
+#                     break
+#                 index += 1
+#             word.append(character)
+#             # j += 1
+#         j = 0
+#         X.append(word)
+#         word_accetuations = []
+#         num_vowels = 0
+#         for c in list(el[3]):
+#             index = 0
+#             if is_vowel(el[3], j, vowels):
+#                 num_vowels += 1
+#             for d in accetuated_vowels:
+#                 if c == d:
+#                     word_accetuations.append(num_vowels)
+#                     break
+#                 index += 1
+#             j += 1
+#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
+#         i += 1
+#     X = np.array(X)
+#     print('GENERATION SUCCESSFUL!')
+#     print('SHUFFELING INPUTS...')
+#     X, y = shuffle_inputs(X, y)
+#     print('INPUTS SHUFFELED!')
+#     return X, y
 
 
 def generate_full_matrix_inputs():
     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
 
 
     print('GENERATING X AND y...')
     # X = np.zeros((len(content), max_word*len(dictionary)))
     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
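For orientation, and not part of the commit itself: the per-character one-hot layout that the now commented-out generate_matrix_inputs built, and that the new generate_input_per_vowel_from_word in the next hunk keeps using, can be sketched in a few self-contained lines. The dictionary and word below are illustrative stand-ins; the real helpers allocate max_word rows up front and leave unused rows at zero.

    import numpy as np

    dictionary = ['a', 'b', 'e', 'r', 'z']      # stand-in character inventory
    word = 'breza'                              # stand-in input word

    word_matrix = []
    for c in word:
        character = np.zeros(len(dictionary))   # one row per character of the word
        if c in dictionary:
            character[dictionary.index(c)] = 1  # one-hot flag for this character
        word_matrix.append(character)

    X = np.array(word_matrix)                   # shape: (len(word), len(dictionary))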
@@ -553,3 +555,54 @@ def generate_input_from_word(word, max_word, dictionary):
             index += 1
         j += 1
     return x
+
+def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
+    X_el = np.zeros((max_word, len(dictionary)))
+    j = 0
+    for c in list(word):
+        index = 0
+        for d in dictionary:
+            if c == d:
+                X_el[j][index] = 1
+                break
+            index += 1
+        j += 1
+
+    X = []
+    X_pure = []
+    vowel_i = 0
+    for i in range(len(word)):
+        if is_vowel(list(word), i, vowels):
+            X.append(X_el)
+            X_pure.append(vowel_i)
+            vowel_i += 1
+    return np.array(X), np.array(X_pure)
+
+def decode_position_from_vowel_to_final_number(y):
+    res = []
+    for i in range(len(y)):
+        if y[i][0] > 0.5:
+            res.append(i + 1)
+    return res
+
+
+def split_content(content, ratio):
+    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
+    # print(len(content))
+    unique_content = sorted(set(expanded_content))
+
+    s = np.arange(len(unique_content))
+    np.random.shuffle(s)
+
+    split_num = math.floor(len(unique_content) * ratio)
+    validate_content = []
+    shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num]
+
+    shuffled_unique_train_content_set = set(shuffled_unique_train_content)
+    shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < split_num]
+
+    shuffled_unique_validate_content_set = set(shuffled_unique_validate_content)
+
+    train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
+    validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
+
+    return train_content, validate_content
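A hypothetical usage sketch of the new helpers, not part of the commit: split_content holds out a share of the unique base forms (el[1], or el[0] when el[1] is '='), so the network is validated on entries whose base form never occurs in training, which is the overfitting counter-measure the commit title refers to. The 0.2 ratio, the base_form helper, and the example probabilities below are illustrative assumptions.

    from prepare_data import read_content, split_content, decode_position_from_vowel_to_final_number

    content = read_content()                                        # rows of SlovarIJS_BESEDE_utf8.lex
    train_content, validate_content = split_content(content, 0.2)   # hold out ~20% of unique base forms

    def base_form(el):
        # assumed column layout: el[0] = word form, el[1] = base form ('=' means same as el[0])
        return el[1] if el[1] != '=' else el[0]

    # sanity check: no base form appears in both sets
    assert not ({base_form(el) for el in train_content} &
                {base_form(el) for el in validate_content})

    # per-vowel network outputs -> 1-based positions of accented vowels
    print(decode_position_from_vowel_to_final_number([[0.1], [0.9], [0.2]]))   # [2]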