From dfe4b9a362840e03b9b72f0a2e0b821f8b4d2cb3 Mon Sep 17 00:00:00 2001 From: lkrsnik Date: Fri, 23 Jun 2017 11:49:21 +0200 Subject: [PATCH] Added creation of inputs per vowels --- .idea/workspace.xml | 98 +++++++++++++++++++---- prepare_data.py | 190 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 236 insertions(+), 52 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 2f69d75..9779511 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,7 +2,9 @@ - + + + - + - + @@ -468,16 +532,6 @@ - - - - - - - - - - @@ -940,7 +994,7 @@ - + @@ -948,5 +1002,15 @@ + + + + + + + + + + \ No newline at end of file diff --git a/prepare_data.py b/prepare_data.py index aec0902..8238fb5 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals import numpy as np import h5py import gc +import StringIO def save_inputs(file_name, X, y): h5f = h5py.File(file_name, 'w') @@ -13,14 +14,29 @@ def save_inputs(file_name, X, y): h5f.create_dataset(k,data=v) h5f.close() -def create_and_save_inputs(file_name): - X, y, X_pure = generate_full_vowel_matrix_inputs() - h5f = h5py.File(file_name, 'w') +def create_and_save_inputs(file_name, part, X, y, X_pure): + # X, y, X_pure = generate_full_vowel_matrix_inputs() + h5f = h5py.File(file_name + part + '.h5', 'w') adict=dict(X=X, y=y, X_pure=X_pure) for k,v in adict.items(): h5f.create_dataset(k,data=v) h5f.close() +def create_and_save_shuffle_vector(file_name, shuffle_vector): + # X, y, X_pure = generate_full_vowel_matrix_inputs() + h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w') + adict=dict(shuffle_vector=shuffle_vector) + for k,v in adict.items(): + h5f.create_dataset(k,data=v) + h5f.close() + +def load_shuffle_vector(file_name): + h5f = h5py.File(file_name,'r') + shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]] + + h5f.close() + return shuffle_vector + def load_inputs(file_name): h5f = h5py.File(file_name,'r') X = h5f['X'][:] @@ -29,6 +45,15 @@ def load_inputs(file_name): h5f.close() return X, y +def load_extended_inputs(file_name): + h5f = h5py.File(file_name,'r') + X = h5f['X'][:] + y = h5f['y'][:] + X_pure = h5f['X_pure'][:] + + h5f.close() + return X, y, X_pure + def save_model(model, file_name): h5f = h5py.File(file_name, 'w') adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2']) @@ -49,7 +74,7 @@ def load_model(file_name): def read_content(): print('READING CONTENT...') - with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f: + with open('../data/SlovarIJS_BESEDE_utf8.lex') as f: content = f.readlines() print('CONTENT READ SUCCESSFULY') return [x.decode('utf8').split('\t') for x in content] @@ -62,6 +87,11 @@ def is_vowel(word_list, position, vowels): return True return False +def is_accetuated_vowel(word_list, position, accetuated_vowels): + if word_list[position] in accetuated_vowels: + return True + return False + def create_dict(): content = read_content() @@ -271,67 +301,146 @@ def count_vowels(content, vowels): return num_all_vowels -def generate_full_vowel_matrix_inputs(): +def generate_full_vowel_matrix_inputs(name, split_number): + h5f = h5py.File(name + '.h5', 'w') dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() + num_all_vowels = count_vowels(content, vowels) + data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)), + maxshape=(num_all_vowels, max_word, len(dictionary)), + dtype=np.uint8) + data_y = h5f.create_dataset('y', (num_all_vowels,), + maxshape=(num_all_vowels,), + dtype=np.uint8) + data_X_pure = h5f.create_dataset('X_pure', (num_all_vowels,), + maxshape=(num_all_vowels,), + dtype=np.uint8) + + + gc.collect() # print (2018553 * max_word * len(dictionary) / (2**30.0)) print('GENERATING X AND y...') # X = np.zeros((len(content), max_word*len(dictionary))) - y = np.zeros((len(content), max_num_vowels * max_num_vowels )) + # y = np.zeros((len(content), max_num_vowels * max_num_vowels)) # X = np.zeros((2018553, max_word, len(dictionary))) X_pure = [] X = [] + y = [] + part_len = len(content)/float(split_number) + current_part_generation = 1 i = 0 + num_all_vowels = 0 + old_num_all_vowels = 0 for el in content: j = 0 - # word = [] X_el = np.zeros((max_word, len(dictionary))) for c in list(el[0]): index = 0 - # character = np.zeros(len(dictionary)) for d in dictionary: if c == d: X_el[j][index] = 1 - # character[index] = 1 break index += 1 - # word.append(character) j += 1 - # for c in list(el[0]): vowel_i = 0 for m in range(len(el[0])): if is_vowel(list(el[0]), m, vowels): X.append(X_el) X_pure.append(vowel_i) vowel_i += 1 + if is_accetuated_vowel(list(el[3]), m, accetuated_vowels): + y.append(1) + else: + y.append(0) + + if current_part_generation * part_len <= i: + print('Saving part '+ str(current_part_generation)) + # create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure)) + + # adict = dict(X=np.array(X), y=np.zeros(len(X)), X_pure=np.array(X_pure)) + # for k, v in adict.items(): + # h5f.create_dataset(k, data=v) + # print (len(np.array(X))) + data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X) + data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y) + data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure) + + + old_num_all_vowels = num_all_vowels + 1 + + + X_pure = [] + X = [] + y = [] + current_part_generation += 1 + num_all_vowels += 1 + if i%10000 == 0: + print i + # text_file.write("Purchase Amount: %s" % TotalAmount) j = 0 # X.append(word) - word_accetuations = [] - num_vowels = 0 - for c in list(el[3]): - index = 0 - if is_vowel(el[3], j, vowels): - num_vowels += 1 - for d in accetuated_vowels: - if c == d: - word_accetuations.append(num_vowels) - break - index += 1 - j += 1 - y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1 + # word_accetuations = [] + # num_vowels = 0 + # for c in list(el[3]): + # index = 0 + # if is_vowel(el[3], j, vowels): + # num_vowels += 1 + # for d in accetuated_vowels: + # if c == d: + # word_accetuations.append(num_vowels) + # break + # index += 1 + # j += 1 + # y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1 i += 1 - # print(len(X)) - # del X_pure - # del dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels - - X = np.array(X) - X_pure = np.array(X_pure) - print('GENERATION SUCCESSFUL!') - print('SHUFFELING INPUTS...') - X, y, X_pure = shuffle_inputs(X, y, X_pure) - print('INPUTS SHUFFELED!') - return X, y, X_pure + + print('Saving part ' + str(current_part_generation)) + # create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure)) + + data_X[old_num_all_vowels:num_all_vowels] = np.array(X) + data_y[old_num_all_vowels:num_all_vowels] = np.array(y) + data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure) + + # adict = dict(X=X, y=y, X_pure=X_pure) + # for k, v in adict.items(): + # h5f.create_dataset(k, data=v) + + + h5f.close() + + +def shuffle_full_vowel_inputs(name, orderd_name, parts): +# internal_representations/inputs/X_ordered_part + dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() + num_all_vowels = count_vowels(content, vowels) + # s = np.arange(num_all_vowels) + # np.random.shuffle(s) + # create_and_save_shuffle_vector(name, s) + s = load_shuffle_vector('internal_representations/inputs/X_shuffled_part_shuffle_vector.h5') + print('Shuffled vector loaded!') + section_range = [0, (num_all_vowels + 1)/parts] + for h in range(3, parts+1): + gc.collect() + new_X = np.zeros((section_range[1], max_word, len(dictionary))) + new_X_pure = np.zeros(section_range[1]) + new_y = np.zeros(section_range[1]) + for i in range(1, parts+1): + X, y, X_pure = load_extended_inputs(orderd_name + str(parts) + '.h5') + for j in range(X.shape[0]): + if s[j] >= section_range[0] and s[j] < section_range[1]: + new_X[s[j]] = X[j] + new_y[s[j]] = y[j] + new_X_pure[s[j]] = X_pure[j] + print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX') + create_and_save_inputs(name, str(h), new_X, new_y, new_X_pure) + section_range[0] = section_range[1] + if section_range[1] + (num_all_vowels + 1)/parts < num_all_vowels: + section_range[1] += (num_all_vowels + 1)/parts + else: + section_range[1] = num_all_vowels + + def decode_position(y, max_num_vowels): @@ -345,6 +454,17 @@ def decode_position(y, max_num_vowels): i += 1 return [pos % max_num_vowels, pos / max_num_vowels] +def decode_input(word_encoded, dictionary): + word = '' + for el in word_encoded: + i = 0 + for num in el: + if num == 1: + word += dictionary[i] + break + i += 1 + return word + def decode_position_from_number(y, max_num_vowels): return [y % max_num_vowels, y / max_num_vowels]