Created CNN which looks at additional features as well
This commit is contained in:
parent
ac8f0057c5
commit
d314a9ee4f
|
@ -2,20 +2,7 @@
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
|
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn.ipynb" afterPath="" />
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/cnn/character_based_ffnn_keras.ipynb" />
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/cnn_per_vowel_3epoch.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/create_and_save_inputs.py" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_5epoch.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_121sigmoid_mse_adam_a65.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/test.txt" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="" />
|
|
||||||
<change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch_no_overfitting.h5" afterPath="" />
|
|
||||||
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
|
|
||||||
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
|
||||||
</list>
|
</list>
|
||||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||||
|
@ -46,8 +33,8 @@
|
||||||
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
|
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
|
||||||
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="410">
|
<state relative-caret-position="442">
|
||||||
<caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
|
<caret line="494" column="36" lean-forward="true" selection-start-line="494" selection-start-column="36" selection-end-line="494" selection-end-column="36" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#24#63#0" expanded="true" />
|
<element signature="e#24#63#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
@ -137,7 +124,6 @@
|
||||||
<find>nearly_zeros</find>
|
<find>nearly_zeros</find>
|
||||||
<find>rand</find>
|
<find>rand</find>
|
||||||
<find>u</find>
|
<find>u</find>
|
||||||
<find>shuffle_inputs</find>
|
|
||||||
<find>num_all_vowels</find>
|
<find>num_all_vowels</find>
|
||||||
<find>load_shuffle_vector</find>
|
<find>load_shuffle_vector</find>
|
||||||
<find>create_and_save_inputs</find>
|
<find>create_and_save_inputs</find>
|
||||||
|
@ -147,6 +133,7 @@
|
||||||
<find>generate_inputs</find>
|
<find>generate_inputs</find>
|
||||||
<find>split_number</find>
|
<find>split_number</find>
|
||||||
<find>StringIO</find>
|
<find>StringIO</find>
|
||||||
|
<find>shuffle_inputs</find>
|
||||||
</findStrings>
|
</findStrings>
|
||||||
</component>
|
</component>
|
||||||
<component name="Git.Settings">
|
<component name="Git.Settings">
|
||||||
|
@ -990,8 +977,8 @@
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="410">
|
<state relative-caret-position="442">
|
||||||
<caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
|
<caret line="494" column="36" lean-forward="true" selection-start-line="494" selection-start-column="36" selection-end-line="494" selection-end-column="36" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#24#63#0" expanded="true" />
|
<element signature="e#24#63#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
|
100
prepare_data.py
100
prepare_data.py
|
@ -8,28 +8,35 @@ import gc
|
||||||
import math
|
import math
|
||||||
|
|
||||||
# functions for saving, loading and shuffling whole arrays to ram
|
# functions for saving, loading and shuffling whole arrays to ram
|
||||||
def save_inputs(file_name, X, y):
|
def save_inputs(file_name, X, y, other_features=[]):
|
||||||
h5f = h5py.File(file_name, 'w')
|
h5f = h5py.File(file_name, 'w')
|
||||||
adict = dict(X=X, y=y)
|
if other_features == []:
|
||||||
|
adict = dict(X=X, y=y)
|
||||||
|
else:
|
||||||
|
adict = dict(X=X, X_other_features=other_features, y=y)
|
||||||
for k, v in adict.items():
|
for k, v in adict.items():
|
||||||
h5f.create_dataset(k,data=v)
|
h5f.create_dataset(k, data=v)
|
||||||
h5f.close()
|
h5f.close()
|
||||||
|
|
||||||
def load_inputs(file_name):
|
def load_inputs(file_name, other_features=False):
|
||||||
h5f = h5py.File(file_name,'r')
|
h5f = h5py.File(file_name,'r')
|
||||||
X = h5f['X'][:]
|
X = h5f['X'][:]
|
||||||
y = h5f['y'][:]
|
y = h5f['y'][:]
|
||||||
|
if other_features:
|
||||||
|
X_other_features = h5f['X_other_features'][:]
|
||||||
|
h5f.close()
|
||||||
|
return X, X_other_features, y
|
||||||
|
|
||||||
h5f.close()
|
h5f.close()
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
|
||||||
def shuffle_inputs(X, y, X_pure=False):
|
def shuffle_inputs(X, y, X_pure=[]):
|
||||||
s = np.arange(X.shape[0])
|
s = np.arange(X.shape[0])
|
||||||
np.random.shuffle(s)
|
np.random.shuffle(s)
|
||||||
X = X[s]
|
X = X[s]
|
||||||
y = y[s]
|
y = y[s]
|
||||||
if X_pure:
|
if X_pure != []:
|
||||||
X_pure = X_pure[s]
|
X_pure = X_pure[s]
|
||||||
return X, y, X_pure
|
return X, y, X_pure
|
||||||
else:
|
else:
|
||||||
|
@ -40,7 +47,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
|
||||||
# X, y, X_pure = generate_full_vowel_matrix_inputs()
|
# X, y, X_pure = generate_full_vowel_matrix_inputs()
|
||||||
h5f = h5py.File(file_name + part + '.h5', 'w')
|
h5f = h5py.File(file_name + part + '.h5', 'w')
|
||||||
adict=dict(X=X, y=y, X_pure=X_pure)
|
adict=dict(X=X, y=y, X_pure=X_pure)
|
||||||
for k,v in adict.items():
|
for k, v in adict.items():
|
||||||
h5f.create_dataset(k,data=v)
|
h5f.create_dataset(k,data=v)
|
||||||
h5f.close()
|
h5f.close()
|
||||||
|
|
||||||
|
@ -94,7 +101,7 @@ def load_model(file_name):
|
||||||
# functions for creating X and y from content
|
# functions for creating X and y from content
|
||||||
def read_content():
|
def read_content():
|
||||||
print('READING CONTENT...')
|
print('READING CONTENT...')
|
||||||
with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
|
with open('../../../data/SlovarIJS_BESEDE_utf8.lex') as f:
|
||||||
content = f.readlines()
|
content = f.readlines()
|
||||||
print('CONTENT READ SUCCESSFULY')
|
print('CONTENT READ SUCCESSFULY')
|
||||||
return [x.split('\t') for x in content]
|
return [x.split('\t') for x in content]
|
||||||
|
@ -262,17 +269,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
|
||||||
def generate_full_matrix_inputs():
|
def generate_full_matrix_inputs():
|
||||||
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||||||
train_content, validate_content = split_content(content, 0.2)
|
train_content, validate_content = split_content(content, 0.2)
|
||||||
|
feature_dictionary = create_feature_dictionary(content)
|
||||||
|
|
||||||
# Generate X and y
|
# Generate X and y
|
||||||
print('GENERATING X AND y...')
|
print('GENERATING X AND y...')
|
||||||
X_train, y_train = generate_X_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels)
|
X_train, X_other_features_train, y_train = generate_X_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels, feature_dictionary)
|
||||||
X_validate, y_validate = generate_X_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels)
|
X_validate, X_other_features_validate, y_validate = generate_X_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels, feature_dictionary)
|
||||||
print('GENERATION SUCCESSFUL!')
|
print('GENERATION SUCCESSFUL!')
|
||||||
return X_train, y_train, X_validate, y_validate
|
return X_train, X_other_features_train, y_train, X_validate, X_other_features_validate, y_validate
|
||||||
|
|
||||||
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels):
|
|
||||||
|
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary):
|
||||||
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
||||||
X = np.zeros((len(content), max_word, len(dictionary)))
|
X = np.zeros((len(content), max_word, len(dictionary)))
|
||||||
|
print('CREATING OTHER FEATURES...')
|
||||||
|
X_other_features = create_X_features(content, feature_dictionary)
|
||||||
|
print('OTHER FEATURES CREATED!')
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
for el in content:
|
for el in content:
|
||||||
|
@ -302,9 +314,9 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
print('SHUFFELING INPUTS...')
|
print('SHUFFELING INPUTS...')
|
||||||
X, y = shuffle_inputs(X, y)
|
X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features)
|
||||||
print('INPUTS SHUFFELED!')
|
print('INPUTS SHUFFELED!')
|
||||||
return X, y
|
return X, X_other_features, y
|
||||||
|
|
||||||
|
|
||||||
def count_vowels(content, vowels):
|
def count_vowels(content, vowels):
|
||||||
|
@ -473,6 +485,27 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
|
||||||
|
|
||||||
|
|
||||||
# Decoders for inputs and outputs
|
# Decoders for inputs and outputs
|
||||||
|
def decode_X_features(feature_dictionary, X_other_features):
|
||||||
|
for word in X_other_features:
|
||||||
|
final_word = []
|
||||||
|
i = 0
|
||||||
|
for z in range(len(feature_dictionary)):
|
||||||
|
for j in range(1, len(feature_dictionary[z])):
|
||||||
|
if j == 1:
|
||||||
|
if word[i] == 1:
|
||||||
|
# print feature_dictionary[z][1]
|
||||||
|
final_word.append(feature_dictionary[z][1])
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
for k in range(len(feature_dictionary[z][j])):
|
||||||
|
# print (i)
|
||||||
|
if word[i] == 1:
|
||||||
|
# print feature_dictionary[z][j][k]
|
||||||
|
final_word.append(feature_dictionary[z][j][k])
|
||||||
|
i += 1
|
||||||
|
print(u''.join(final_word))
|
||||||
|
|
||||||
|
|
||||||
def decode_position(y, max_num_vowels):
|
def decode_position(y, max_num_vowels):
|
||||||
max_el = 0
|
max_el = 0
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -566,3 +599,42 @@ def split_content(content, ratio):
|
||||||
train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
|
train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
|
||||||
validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
|
validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
|
||||||
return train_content, validate_content
|
return train_content, validate_content
|
||||||
|
|
||||||
|
|
||||||
|
# create feature dictionary
|
||||||
|
def create_feature_dictionary(content):
|
||||||
|
additional_data = [el[2] for el in content]
|
||||||
|
possible_variants = sorted(set(additional_data))
|
||||||
|
categories = sorted(set([el[0] for el in possible_variants]))
|
||||||
|
|
||||||
|
feature_dictionary = []
|
||||||
|
for category in categories:
|
||||||
|
category_features = [1, category]
|
||||||
|
examples_per_category = [el for el in possible_variants if el[0] == category]
|
||||||
|
longest_element = max(examples_per_category, key=len)
|
||||||
|
for i in range(1, len(longest_element)):
|
||||||
|
possibilities_per_el = sorted(set([el[i] for el in examples_per_category if i < len(el)]))
|
||||||
|
category_features[0] += len(possibilities_per_el)
|
||||||
|
category_features.append(possibilities_per_el)
|
||||||
|
feature_dictionary.append(category_features)
|
||||||
|
return feature_dictionary
|
||||||
|
|
||||||
|
|
||||||
|
def create_X_features(content, feature_dictionary):
|
||||||
|
content = content
|
||||||
|
X_other_features = []
|
||||||
|
for el in content:
|
||||||
|
X_el_other_features = []
|
||||||
|
for feature in feature_dictionary:
|
||||||
|
if el[2][0] == feature[1]:
|
||||||
|
X_el_other_features.append(1)
|
||||||
|
for i in range(2, len(feature)):
|
||||||
|
for j in range(len(feature[i])):
|
||||||
|
if i-1 < len(el[2]) and feature[i][j] == el[2][i-1]:
|
||||||
|
X_el_other_features.append(1)
|
||||||
|
else:
|
||||||
|
X_el_other_features.append(0)
|
||||||
|
else:
|
||||||
|
X_el_other_features.extend([0] * feature[0])
|
||||||
|
X_other_features.append(X_el_other_features)
|
||||||
|
return np.array(X_other_features)
|
Loading…
Reference in New Issue
Block a user