Managing validation data to counter overfitting
This commit is contained in:
parent 0e3dd4f88c
commit 3c9edd5d1c
.idea/workspace.xml
@@ -4,6 +4,7 @@
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -34,8 +35,8 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
<state relative-caret-position="266">
<caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -132,6 +133,7 @@
<find>load_extended_inputs</find>
<find>create_and_save_shuffle_vector</find>
<find>h5f</find>
<find>generate_inputs</find>
</findStrings>
</component>
<component name="Git.Settings">
@@ -171,6 +173,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
@@ -218,7 +221,6 @@
</PATH>
</subPane>
</pane>
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
@@ -951,8 +953,8 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
<state relative-caret-position="266">
<caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
prepare_data.py (229 changed lines)
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
import numpy as np
import h5py
import gc
import math
import StringIO
import copy

@@ -75,7 +76,7 @@ def load_model(file_name):

def read_content():
    print('READING CONTENT...')
    with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
    with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
        content = f.readlines()
    print('CONTENT READ SUCCESSFULY')
    return [x.decode('utf8').split('\t') for x in content]
@@ -159,97 +160,98 @@ def shuffle_inputs(X, y, X_pure):
    X_pure = X_pure[s]
    return X, y, X_pure

def generate_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

    print('GENERATING X AND y...')
    X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))

    i = 0
    for el in content:
        j = 0
        for c in list(el[0]):
            index = 0
            for d in dictionary:
                if c == d:
                    X[i][index + j * max_word] = 1
                    break
                index += 1
            j += 1
        j = 0
        word_accetuations = []
        num_vowels = 0
        for c in list(el[3]):
            index = 0
            if is_vowel(el[3], j, vowels):
                num_vowels += 1
            for d in accetuated_vowels:
                if c == d:
                    word_accetuations.append(num_vowels)
                    break
                index += 1
            j += 1
        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
        i += 1
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y


def generate_matrix_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

    print('GENERATING X AND y...')
    # X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))

    X = []

    i = 0
    for el in content:
        # j = 0
        word = []
        for c in list(el[0]):
            index = 0
            character = np.zeros(len(dictionary))
            for d in dictionary:
                if c == d:
                    # X[i][index + j * max_word] = 1
                    character[index] = 1
                    break
                index += 1
            word.append(character)
            # j += 1
        j = 0
        X.append(word)
        word_accetuations = []
        num_vowels = 0
        for c in list(el[3]):
            index = 0
            if is_vowel(el[3], j, vowels):
                num_vowels += 1
            for d in accetuated_vowels:
                if c == d:
                    word_accetuations.append(num_vowels)
                    break
                index += 1
            j += 1
        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
        i += 1
    X = np.array(X)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y
# def generate_inputs():
#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
#     print('GENERATING X AND y...')
#     X = np.zeros((len(content), max_word*len(dictionary)))
#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
#     i = 0
#     for el in content:
#         j = 0
#         for c in list(el[0]):
#             index = 0
#             for d in dictionary:
#                 if c == d:
#                     X[i][index + j * max_word] = 1
#                     break
#                 index += 1
#             j += 1
#         j = 0
#         word_accetuations = []
#         num_vowels = 0
#         for c in list(el[3]):
#             index = 0
#             if is_vowel(el[3], j, vowels):
#                 num_vowels += 1
#             for d in accetuated_vowels:
#                 if c == d:
#                     word_accetuations.append(num_vowels)
#                     break
#                 index += 1
#             j += 1
#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
#         i += 1
#     print('GENERATION SUCCESSFUL!')
#     print('SHUFFELING INPUTS...')
#     X, y = shuffle_inputs(X, y)
#     print('INPUTS SHUFFELED!')
#     return X, y
#
#
# def generate_matrix_inputs():
#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
#     print('GENERATING X AND y...')
#     # X = np.zeros((len(content), max_word*len(dictionary)))
#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
#     X = []
#
#     i = 0
#     for el in content:
#         # j = 0
#         word = []
#         for c in list(el[0]):
#             index = 0
#             character = np.zeros(len(dictionary))
#             for d in dictionary:
#                 if c == d:
#                     # X[i][index + j * max_word] = 1
#                     character[index] = 1
#                     break
#                 index += 1
#             word.append(character)
#             # j += 1
#         j = 0
#         X.append(word)
#         word_accetuations = []
#         num_vowels = 0
#         for c in list(el[3]):
#             index = 0
#             if is_vowel(el[3], j, vowels):
#                 num_vowels += 1
#             for d in accetuated_vowels:
#                 if c == d:
#                     word_accetuations.append(num_vowels)
#                     break
#                 index += 1
#             j += 1
#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
#         i += 1
#     X = np.array(X)
#     print('GENERATION SUCCESSFUL!')
#     print('SHUFFELING INPUTS...')
#     X, y = shuffle_inputs(X, y)
#     print('INPUTS SHUFFELED!')
#     return X, y


def generate_full_matrix_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()



    print('GENERATING X AND y...')
    # X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
@@ -553,3 +555,54 @@ def generate_input_from_word(word, max_word, dictionary):
            index += 1
        j += 1
    return x

def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
    X_el = np.zeros((max_word, len(dictionary)))
    j = 0
    for c in list(word):
        index = 0
        for d in dictionary:
            if c == d:
                X_el[j][index] = 1
                break
            index += 1
        j += 1

    X = []
    X_pure = []
    vowel_i = 0
    for i in range(len(word)):
        if is_vowel(list(word), i, vowels):
            X.append(X_el)
            X_pure.append(vowel_i)
            vowel_i += 1
    return np.array(X), np.array(X_pure)

def decode_position_from_vowel_to_final_number(y):
    res = []
    for i in range(len(y)):
        if y[i][0] > 0.5:
            res.append(i + 1)
    return res


def split_content(content, ratio):
    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
    # print(len(content))
    unique_content = sorted(set(expanded_content))

    s = np.arange(len(unique_content))
    np.random.shuffle(s)

    split_num = math.floor(len(unique_content) * ratio)
    validate_content = []
    shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num]

    shuffled_unique_train_content_set = set(shuffled_unique_train_content)
    shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < split_num]

    shuffled_unique_validate_content_set = set(shuffled_unique_validate_content)

    train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
    validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
    return train_content, validate_content
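For context, a minimal usage sketch of the new split_content() helper, illustrating the commit message's goal of keeping a held-out validation set to counter overfitting. This is not part of the commit: it assumes prepare_data.py is importable as a module, that create_dict() returns the six values shown in the diff above, and that it runs under the Python 2 interpreter the module targets (it imports StringIO); the 0.1 ratio is an illustrative choice.

# Hypothetical usage sketch (not from this commit).
from prepare_data import create_dict, split_content

dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

# Hold out ~10% of the distinct word forms for validation; the ratio is illustrative.
train_content, validate_content = split_content(content, 0.1)

# The split is made over unique word forms, so no word form appears in both sets
# and validation accuracy is not inflated by words memorised during training.
train_words = set(el[1] if el[1] != '=' else el[0] for el in train_content)
validate_words = set(el[1] if el[1] != '=' else el[0] for el in validate_content)
assert not (train_words & validate_words)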