|
|
|
@ -5,6 +5,7 @@ from __future__ import unicode_literals
|
|
|
|
|
import numpy as np
|
|
|
|
|
import h5py
|
|
|
|
|
import gc
|
|
|
|
|
import StringIO
|
|
|
|
|
|
|
|
|
|
def save_inputs(file_name, X, y):
|
|
|
|
|
h5f = h5py.File(file_name, 'w')
|
|
|
|
@ -13,14 +14,29 @@ def save_inputs(file_name, X, y):
|
|
|
|
|
h5f.create_dataset(k,data=v)
|
|
|
|
|
h5f.close()
|
|
|
|
|
|
|
|
|
|
def create_and_save_inputs(file_name):
|
|
|
|
|
X, y, X_pure = generate_full_vowel_matrix_inputs()
|
|
|
|
|
h5f = h5py.File(file_name, 'w')
|
|
|
|
|
def create_and_save_inputs(file_name, part, X, y, X_pure):
|
|
|
|
|
# X, y, X_pure = generate_full_vowel_matrix_inputs()
|
|
|
|
|
h5f = h5py.File(file_name + part + '.h5', 'w')
|
|
|
|
|
adict=dict(X=X, y=y, X_pure=X_pure)
|
|
|
|
|
for k,v in adict.items():
|
|
|
|
|
h5f.create_dataset(k,data=v)
|
|
|
|
|
h5f.close()
|
|
|
|
|
|
|
|
|
|
def create_and_save_shuffle_vector(file_name, shuffle_vector):
|
|
|
|
|
# X, y, X_pure = generate_full_vowel_matrix_inputs()
|
|
|
|
|
h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w')
|
|
|
|
|
adict=dict(shuffle_vector=shuffle_vector)
|
|
|
|
|
for k,v in adict.items():
|
|
|
|
|
h5f.create_dataset(k,data=v)
|
|
|
|
|
h5f.close()
|
|
|
|
|
|
|
|
|
|
def load_shuffle_vector(file_name):
|
|
|
|
|
h5f = h5py.File(file_name,'r')
|
|
|
|
|
shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
|
|
|
|
|
|
|
|
|
|
h5f.close()
|
|
|
|
|
return shuffle_vector
|
|
|
|
|
|
|
|
|
|
def load_inputs(file_name):
|
|
|
|
|
h5f = h5py.File(file_name,'r')
|
|
|
|
|
X = h5f['X'][:]
|
|
|
|
@ -29,6 +45,15 @@ def load_inputs(file_name):
|
|
|
|
|
h5f.close()
|
|
|
|
|
return X, y
|
|
|
|
|
|
|
|
|
|
def load_extended_inputs(file_name):
|
|
|
|
|
h5f = h5py.File(file_name,'r')
|
|
|
|
|
X = h5f['X'][:]
|
|
|
|
|
y = h5f['y'][:]
|
|
|
|
|
X_pure = h5f['X_pure'][:]
|
|
|
|
|
|
|
|
|
|
h5f.close()
|
|
|
|
|
return X, y, X_pure
|
|
|
|
|
|
|
|
|
|
def save_model(model, file_name):
|
|
|
|
|
h5f = h5py.File(file_name, 'w')
|
|
|
|
|
adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2'])
|
|
|
|
@ -49,7 +74,7 @@ def load_model(file_name):
|
|
|
|
|
|
|
|
|
|
def read_content():
|
|
|
|
|
print('READING CONTENT...')
|
|
|
|
|
with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
|
|
|
|
|
with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
|
|
|
|
|
content = f.readlines()
|
|
|
|
|
print('CONTENT READ SUCCESSFULY')
|
|
|
|
|
return [x.decode('utf8').split('\t') for x in content]
|
|
|
|
@ -62,6 +87,11 @@ def is_vowel(word_list, position, vowels):
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def is_accetuated_vowel(word_list, position, accetuated_vowels):
|
|
|
|
|
if word_list[position] in accetuated_vowels:
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def create_dict():
|
|
|
|
|
|
|
|
|
|
content = read_content()
|
|
|
|
@ -271,67 +301,146 @@ def count_vowels(content, vowels):
|
|
|
|
|
return num_all_vowels
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_full_vowel_matrix_inputs():
|
|
|
|
|
def generate_full_vowel_matrix_inputs(name, split_number):
|
|
|
|
|
h5f = h5py.File(name + '.h5', 'w')
|
|
|
|
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
|
|
|
|
num_all_vowels = count_vowels(content, vowels)
|
|
|
|
|
data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)),
|
|
|
|
|
maxshape=(num_all_vowels, max_word, len(dictionary)),
|
|
|
|
|
dtype=np.uint8)
|
|
|
|
|
data_y = h5f.create_dataset('y', (num_all_vowels,),
|
|
|
|
|
maxshape=(num_all_vowels,),
|
|
|
|
|
dtype=np.uint8)
|
|
|
|
|
data_X_pure = h5f.create_dataset('X_pure', (num_all_vowels,),
|
|
|
|
|
maxshape=(num_all_vowels,),
|
|
|
|
|
dtype=np.uint8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gc.collect()
|
|
|
|
|
# print (2018553 * max_word * len(dictionary) / (2**30.0))
|
|
|
|
|
print('GENERATING X AND y...')
|
|
|
|
|
# X = np.zeros((len(content), max_word*len(dictionary)))
|
|
|
|
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
|
|
|
|
# y = np.zeros((len(content), max_num_vowels * max_num_vowels))
|
|
|
|
|
# X = np.zeros((2018553, max_word, len(dictionary)))
|
|
|
|
|
X_pure = []
|
|
|
|
|
X = []
|
|
|
|
|
y = []
|
|
|
|
|
part_len = len(content)/float(split_number)
|
|
|
|
|
current_part_generation = 1
|
|
|
|
|
|
|
|
|
|
i = 0
|
|
|
|
|
num_all_vowels = 0
|
|
|
|
|
old_num_all_vowels = 0
|
|
|
|
|
for el in content:
|
|
|
|
|
j = 0
|
|
|
|
|
# word = []
|
|
|
|
|
X_el = np.zeros((max_word, len(dictionary)))
|
|
|
|
|
for c in list(el[0]):
|
|
|
|
|
index = 0
|
|
|
|
|
# character = np.zeros(len(dictionary))
|
|
|
|
|
for d in dictionary:
|
|
|
|
|
if c == d:
|
|
|
|
|
X_el[j][index] = 1
|
|
|
|
|
# character[index] = 1
|
|
|
|
|
break
|
|
|
|
|
index += 1
|
|
|
|
|
# word.append(character)
|
|
|
|
|
j += 1
|
|
|
|
|
# for c in list(el[0]):
|
|
|
|
|
vowel_i = 0
|
|
|
|
|
for m in range(len(el[0])):
|
|
|
|
|
if is_vowel(list(el[0]), m, vowels):
|
|
|
|
|
X.append(X_el)
|
|
|
|
|
X_pure.append(vowel_i)
|
|
|
|
|
vowel_i += 1
|
|
|
|
|
if is_accetuated_vowel(list(el[3]), m, accetuated_vowels):
|
|
|
|
|
y.append(1)
|
|
|
|
|
else:
|
|
|
|
|
y.append(0)
|
|
|
|
|
|
|
|
|
|
if current_part_generation * part_len <= i:
|
|
|
|
|
print('Saving part '+ str(current_part_generation))
|
|
|
|
|
# create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
|
|
|
|
|
|
|
|
|
|
# adict = dict(X=np.array(X), y=np.zeros(len(X)), X_pure=np.array(X_pure))
|
|
|
|
|
# for k, v in adict.items():
|
|
|
|
|
# h5f.create_dataset(k, data=v)
|
|
|
|
|
# print (len(np.array(X)))
|
|
|
|
|
data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X)
|
|
|
|
|
data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y)
|
|
|
|
|
data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
old_num_all_vowels = num_all_vowels + 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
X_pure = []
|
|
|
|
|
X = []
|
|
|
|
|
y = []
|
|
|
|
|
current_part_generation += 1
|
|
|
|
|
num_all_vowels += 1
|
|
|
|
|
if i%10000 == 0:
|
|
|
|
|
print i
|
|
|
|
|
# text_file.write("Purchase Amount: %s" % TotalAmount)
|
|
|
|
|
j = 0
|
|
|
|
|
# X.append(word)
|
|
|
|
|
word_accetuations = []
|
|
|
|
|
num_vowels = 0
|
|
|
|
|
for c in list(el[3]):
|
|
|
|
|
index = 0
|
|
|
|
|
if is_vowel(el[3], j, vowels):
|
|
|
|
|
num_vowels += 1
|
|
|
|
|
for d in accetuated_vowels:
|
|
|
|
|
if c == d:
|
|
|
|
|
word_accetuations.append(num_vowels)
|
|
|
|
|
break
|
|
|
|
|
index += 1
|
|
|
|
|
j += 1
|
|
|
|
|
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
|
|
|
|
# word_accetuations = []
|
|
|
|
|
# num_vowels = 0
|
|
|
|
|
# for c in list(el[3]):
|
|
|
|
|
# index = 0
|
|
|
|
|
# if is_vowel(el[3], j, vowels):
|
|
|
|
|
# num_vowels += 1
|
|
|
|
|
# for d in accetuated_vowels:
|
|
|
|
|
# if c == d:
|
|
|
|
|
# word_accetuations.append(num_vowels)
|
|
|
|
|
# break
|
|
|
|
|
# index += 1
|
|
|
|
|
# j += 1
|
|
|
|
|
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
|
|
|
|
i += 1
|
|
|
|
|
# print(len(X))
|
|
|
|
|
# del X_pure
|
|
|
|
|
# del dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
|
|
|
|
|
|
|
|
|
|
X = np.array(X)
|
|
|
|
|
X_pure = np.array(X_pure)
|
|
|
|
|
print('GENERATION SUCCESSFUL!')
|
|
|
|
|
print('SHUFFELING INPUTS...')
|
|
|
|
|
X, y, X_pure = shuffle_inputs(X, y, X_pure)
|
|
|
|
|
print('INPUTS SHUFFELED!')
|
|
|
|
|
return X, y, X_pure
|
|
|
|
|
|
|
|
|
|
print('Saving part ' + str(current_part_generation))
|
|
|
|
|
# create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
|
|
|
|
|
|
|
|
|
|
data_X[old_num_all_vowels:num_all_vowels] = np.array(X)
|
|
|
|
|
data_y[old_num_all_vowels:num_all_vowels] = np.array(y)
|
|
|
|
|
data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure)
|
|
|
|
|
|
|
|
|
|
# adict = dict(X=X, y=y, X_pure=X_pure)
|
|
|
|
|
# for k, v in adict.items():
|
|
|
|
|
# h5f.create_dataset(k, data=v)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
h5f.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shuffle_full_vowel_inputs(name, orderd_name, parts):
|
|
|
|
|
# internal_representations/inputs/X_ordered_part
|
|
|
|
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
|
|
|
|
num_all_vowels = count_vowels(content, vowels)
|
|
|
|
|
# s = np.arange(num_all_vowels)
|
|
|
|
|
# np.random.shuffle(s)
|
|
|
|
|
# create_and_save_shuffle_vector(name, s)
|
|
|
|
|
s = load_shuffle_vector('internal_representations/inputs/X_shuffled_part_shuffle_vector.h5')
|
|
|
|
|
print('Shuffled vector loaded!')
|
|
|
|
|
section_range = [0, (num_all_vowels + 1)/parts]
|
|
|
|
|
for h in range(3, parts+1):
|
|
|
|
|
gc.collect()
|
|
|
|
|
new_X = np.zeros((section_range[1], max_word, len(dictionary)))
|
|
|
|
|
new_X_pure = np.zeros(section_range[1])
|
|
|
|
|
new_y = np.zeros(section_range[1])
|
|
|
|
|
for i in range(1, parts+1):
|
|
|
|
|
X, y, X_pure = load_extended_inputs(orderd_name + str(parts) + '.h5')
|
|
|
|
|
for j in range(X.shape[0]):
|
|
|
|
|
if s[j] >= section_range[0] and s[j] < section_range[1]:
|
|
|
|
|
new_X[s[j]] = X[j]
|
|
|
|
|
new_y[s[j]] = y[j]
|
|
|
|
|
new_X_pure[s[j]] = X_pure[j]
|
|
|
|
|
print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX')
|
|
|
|
|
create_and_save_inputs(name, str(h), new_X, new_y, new_X_pure)
|
|
|
|
|
section_range[0] = section_range[1]
|
|
|
|
|
if section_range[1] + (num_all_vowels + 1)/parts < num_all_vowels:
|
|
|
|
|
section_range[1] += (num_all_vowels + 1)/parts
|
|
|
|
|
else:
|
|
|
|
|
section_range[1] = num_all_vowels
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def decode_position(y, max_num_vowels):
|
|
|
|
@ -345,6 +454,17 @@ def decode_position(y, max_num_vowels):
|
|
|
|
|
i += 1
|
|
|
|
|
return [pos % max_num_vowels, pos / max_num_vowels]
|
|
|
|
|
|
|
|
|
|
def decode_input(word_encoded, dictionary):
|
|
|
|
|
word = ''
|
|
|
|
|
for el in word_encoded:
|
|
|
|
|
i = 0
|
|
|
|
|
for num in el:
|
|
|
|
|
if num == 1:
|
|
|
|
|
word += dictionary[i]
|
|
|
|
|
break
|
|
|
|
|
i += 1
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def decode_position_from_number(y, max_num_vowels):
|
|
|
|
|
return [y % max_num_vowels, y / max_num_vowels]
|
|
|
|
|