From ac8f0057c5b59512563fbea3890c18eb90e08d65 Mon Sep 17 00:00:00 2001
From: lkrsnik <krsnik.luka92@gmail.com>
Date: Sat, 1 Jul 2017 12:19:09 +0200
Subject: [PATCH] Files renamed, orderd plus ordered prepare_data file

---
 .gitignore                  |   2 +-
 .idea/dictionaries/luka.xml |   7 ++
 .idea/workspace.xml         |  60 +++++--------
 prepare_data.py             | 162 ++++++++++++------------------------
 4 files changed, 82 insertions(+), 149 deletions(-)
 create mode 100644 .idea/dictionaries/luka.xml
diff --git a/.gitignore b/.gitignore
index 8fc9c79..4e8e85e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,4 +90,4 @@ ENV/
 
 # Custom
 data/
-character_based_ffnn/internal_representations/inputs/
+cnn/internal_representations/inputs/
diff --git a/.idea/dictionaries/luka.xml b/.idea/dictionaries/luka.xml
new file mode 100644
index 0000000..beed84f
--- /dev/null
+++ b/.idea/dictionaries/luka.xml
@@ -0,0 +1,7 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="luka">
+    <words>
+      <w>overfitting</w>
+    </words>
+  </dictionary>
+</component>
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index b6ea317..8a919a4 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,8 +2,20 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" />
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn.ipynb" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/cnn_per_vowel_3epoch.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/create_and_save_inputs.py" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_5epoch.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_121sigmoid_mse_adam_a65.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/test.txt" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="" />
+      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch_no_overfitting.h5" afterPath="" />
+      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -34,8 +46,8 @@
       <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/prepare_data.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="630">
-              <caret line="80" column="13" lean-forward="false" selection-start-line="80" selection-start-column="13" selection-end-line="80" selection-end-column="13" />
+            <state relative-caret-position="410">
+              <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
               <folding>
                 <element signature="e#24#63#0" expanded="true" />
               </folding>
@@ -174,8 +186,6 @@
       <foldersAlwaysOnTop value="true" />
     </navigator>
     <panes>
-      <pane id="Scratches" />
-      <pane id="Scope" />
       <pane id="ProjectPane">
         <subPane>
           <PATH>
@@ -188,40 +198,10 @@
               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
             </PATH_ELEMENT>
           </PATH>
-          <PATH>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="character_based_ffnn" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-          </PATH>
-          <PATH>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="character_based_ffnn" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="internal_representations" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-          </PATH>
         </subPane>
       </pane>
+      <pane id="Scratches" />
+      <pane id="Scope" />
     </panes>
   </component>
   <component name="PropertiesComponent">
@@ -1010,8 +990,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/prepare_data.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="630">
-          <caret line="80" column="13" lean-forward="false" selection-start-line="80" selection-start-column="13" selection-end-line="80" selection-end-column="13" />
+        <state relative-caret-position="410">
+          <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
           <folding>
             <element signature="e#24#63#0" expanded="true" />
           </folding>
diff --git a/prepare_data.py b/prepare_data.py
index 7d4927d..821edb4 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -6,15 +6,36 @@ import numpy as np
 import h5py
 import gc
 import math
-import copy
 
+# functions for saving, loading and shuffling whole arrays (held entirely in RAM)
 def save_inputs(file_name, X, y):
     h5f = h5py.File(file_name, 'w')
-    adict=dict(X=X, y=y)
-    for k,v in adict.items():
+    adict = dict(X=X, y=y)
+    for k, v in adict.items():
         h5f.create_dataset(k,data=v)
     h5f.close()
 
+def load_inputs(file_name):
+    """Read the complete 'X' and 'y' datasets of an HDF5 file and return them as (X, y)."""
+    # `with` closes the file even when a dataset is missing or a read fails.
+    with h5py.File(file_name, 'r') as h5f:
+        X = h5f['X'][:]
+        y = h5f['y'][:]
+    return X, y
+
+
+def shuffle_inputs(X, y, X_pure=False):
+    """Shuffle X and y (and X_pure, when given) along axis 0 with one shared permutation.
+
+    Returns (X, y), or (X, y, X_pure) when X_pure is supplied (i.e. not False/None).
+    """
+    s = np.arange(X.shape[0])
+    np.random.shuffle(s)
+    if X_pure is False or X_pure is None:  # identity test: `if X_pure:` raises ValueError on ndarrays
+        return X[s], y[s]
+    return X[s], y[s], X_pure[s]
+
+# functions for saving arrays part by part and loading only a row range of them into RAM
 def create_and_save_inputs(file_name, part, X, y, X_pure):
     # X, y, X_pure = generate_full_vowel_matrix_inputs()
     h5f = h5py.File(file_name + part + '.h5', 'w')
@@ -23,11 +44,22 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
         h5f.create_dataset(k,data=v)
     h5f.close()
 
+def load_extended_inputs(file_name, obtain_range):
+    """Read rows obtain_range[0]:obtain_range[1] of 'X', 'y' and 'X_pure' from an HDF5 file."""
+    rows = slice(obtain_range[0], obtain_range[1])
+    # `with` closes the file even when a dataset is missing or a read fails.
+    with h5py.File(file_name, 'r') as h5f:
+        X, y = h5f['X'][rows], h5f['y'][rows]
+        X_pure = h5f['X_pure'][rows]
+    return X, y, X_pure
+
+
+# functions for creating and loading shuffle vector
 def create_and_save_shuffle_vector(file_name, shuffle_vector):
     # X, y, X_pure = generate_full_vowel_matrix_inputs()
     h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w')
     adict=dict(shuffle_vector=shuffle_vector)
-    for k,v in adict.items():
+    for k, v in adict.items():
         h5f.create_dataset(k,data=v)
     h5f.close()
 
@@ -38,31 +70,17 @@ def load_shuffle_vector(file_name):
     h5f.close()
     return shuffle_vector
 
-def load_inputs(file_name):
-    h5f = h5py.File(file_name,'r')
-    X = h5f['X'][:]
-    y = h5f['y'][:]
-
-    h5f.close()
-    return X, y
-
-def load_extended_inputs(file_name, obtain_range):
-    h5f = h5py.File(file_name,'r')
-    X = h5f['X'][obtain_range[0]:obtain_range[1]]
-    y = h5f['y'][obtain_range[0]:obtain_range[1]]
-    X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]]
-
-    h5f.close()
-    return X, y, X_pure
 
+# functions for saving and loading model - ONLY WHERE KERAS IS NOT NEEDED
 def save_model(model, file_name):
     h5f = h5py.File(file_name, 'w')
-    adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2'])
+    adict = dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2'])
     for k,v in adict.items():
         h5f.create_dataset(k,data=v)
 
     h5f.close()
 
+
 def load_model(file_name):
     h5f = h5py.File(file_name,'r')
     model = {}
@@ -73,6 +91,7 @@ def load_model(file_name):
     h5f.close()
     return model
 
+# functions for creating X and y from content
 def read_content():
     print('READING CONTENT...')
     with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
@@ -88,15 +107,15 @@ def is_vowel(word_list, position, vowels):
         return True
     return False
 
+
 def is_accetuated_vowel(word_list, position, accetuated_vowels):
     if word_list[position] in accetuated_vowels:
         return True
     return False
 
+
 def create_dict():
-    
     content = read_content()
-    
     print('CREATING DICTIONARY...')
 
     # CREATE dictionary AND max_word
@@ -150,17 +169,7 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
     accetuations_list = np.array(accetuations_list)
     final_position = accetuations_list[0] + max_num_vowels * accetuations_list[1]
     return final_position
-    
-def shuffle_inputs(X, y, X_pure=False):
-    s = np.arange(X.shape[0])
-    np.random.shuffle(s)
-    X = X[s]
-    y = y[s]
-    if X_pure:
-        X_pure = X_pure[s]
-        return X, y, X_pure
-    else:
-        return X, y
+
 
 # def generate_inputs():
 #     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
@@ -262,29 +271,21 @@ def generate_full_matrix_inputs():
     return X_train, y_train, X_validate, y_validate
 
 def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels):
-
-    # X = np.zeros((len(content), max_word*len(dictionary)))
     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
     X = np.zeros((len(content), max_word, len(dictionary)))
-    X_aditional_data = []
 
     i = 0
     for el in content:
         j = 0
-        # word = []
         for c in list(el[0]):
             index = 0
-            # character = np.zeros(len(dictionary))
             for d in dictionary:
                 if c == d:
                     X[i][j][index] = 1
-                    # character[index] = 1
                     break
                 index += 1
-            # word.append(character)
             j += 1
         j = 0
-        # X.append(word)
         word_accetuations = []
         num_vowels = 0
         for c in list(el[3]):
@@ -299,13 +300,13 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
             j += 1
         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
         i += 1
-    # X = np.array(X)
 
     print('SHUFFELING INPUTS...')
     X, y = shuffle_inputs(X, y)
     print('INPUTS SHUFFELED!')
     return X, y
 
+
 def count_vowels(content, vowels):
     num_all_vowels = 0
     for el in content:
@@ -314,10 +315,8 @@ def count_vowels(content, vowels):
                 num_all_vowels += 1
     return num_all_vowels
 
-# def generate_full_vowel_matrix_inputs(name, split_number):
-
-
 
+# Data generation for generator inputs
 def generate_X_and_y_RAM_efficient(name, split_number):
     h5f = h5py.File(name + '.h5', 'w')
     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
@@ -332,14 +331,8 @@ def generate_X_and_y_RAM_efficient(name, split_number):
                                      maxshape=(num_all_vowels,),
                                      dtype=np.uint8)
 
-
-
     gc.collect()
-    # print (2018553 * max_word * len(dictionary) / (2**30.0))
     print('GENERATING X AND y...')
-    # X = np.zeros((len(content), max_word*len(dictionary)))
-    # y = np.zeros((len(content), max_num_vowels * max_num_vowels))
-    # X = np.zeros((2018553, max_word, len(dictionary)))
     X_pure = []
     X = []
     y = []
@@ -373,12 +366,6 @@ def generate_X_and_y_RAM_efficient(name, split_number):
 
                 if current_part_generation * part_len <= i:
                     print('Saving part '+ str(current_part_generation))
-                    # create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
-
-                    # adict = dict(X=np.array(X), y=np.zeros(len(X)), X_pure=np.array(X_pure))
-                    # for k, v in adict.items():
-                    #     h5f.create_dataset(k, data=v)
-                    # print (len(np.array(X)))
                     data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X)
                     data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y)
                     data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure)
@@ -394,39 +381,18 @@ def generate_X_and_y_RAM_efficient(name, split_number):
                 num_all_vowels += 1
         if i%10000 == 0:
             print(i)
-        # text_file.write("Purchase Amount: %s" % TotalAmount)
-        j = 0
-        # X.append(word)
-        # word_accetuations = []
-        # num_vowels = 0
-        # for c in list(el[3]):
-        #     index = 0
-        #     if is_vowel(el[3], j, vowels):
-        #         num_vowels += 1
-        #     for d in accetuated_vowels:
-        #         if c == d:
-        #             word_accetuations.append(num_vowels)
-        #             break
-        #         index += 1
-        #     j += 1
-        # y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
         i += 1
 
     print('Saving part ' + str(current_part_generation))
-    # create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
 
     data_X[old_num_all_vowels:num_all_vowels] = np.array(X)
     data_y[old_num_all_vowels:num_all_vowels] = np.array(y)
     data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure)
 
-    # adict = dict(X=X, y=y, X_pure=X_pure)
-    # for k, v in adict.items():
-    #     h5f.create_dataset(k, data=v)
-
-
     h5f.close()
 
 
+# generator for inputs
 def generate_arrays_from_file(path, batch_size):
     h5f = h5py.File(path, 'r')
 
@@ -446,25 +412,15 @@ def generate_arrays_from_file(path, batch_size):
     h5f.close()
 
 
-
-
+# shuffle inputs for generator
 def shuffle_full_vowel_inputs(name, orderd_name, parts):
-#     internal_representations/inputs/X_ordered_part
     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
     num_all_vowels = count_vowels(content, vowels)
-    num_all_vowels = 12
+    # num_all_vowels = 12
 
 
     s = np.arange(num_all_vowels)
     np.random.shuffle(s)
-    # create_and_save_shuffle_vector(name, s)
-
-    # s = load_shuffle_vector('internal_representations/inputs/X_shuffled_part_shuffle_vector.h5')
-
-# try:
-    #     h5f.close()
-    # except Exception, e:
-    #     pass
 
     h5f = h5py.File(name, 'w')
     data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)),
@@ -491,9 +447,6 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
         for i in range(1, parts+1):
             X, y, X_pure = load_extended_inputs(orderd_name, targeted_range)
             for j in range(X.shape[0]):
-                # print targeted_range[0]
-                # print targeted_range[1]
-                # print s[j]
                 if s[j + targeted_range[0]] >= section_range[0] and s[j + targeted_range[0]] < section_range[1]:
                     # print 's[j] ' + str(s[j + targeted_range[0]]) + ' section_range[0] ' + str(section_range[0]) + ' section_range[1] ' + str(section_range[1])
                     new_X[s[j + targeted_range[0]] - section_range[0]] = X[j]
@@ -506,15 +459,6 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
                 targeted_range[1] = num_all_vowels
             del X, y, X_pure
         print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX')
-        # create_and_save_inputs(name, str(h), new_X, new_y, new_X_pure)
-        # a =
-        # print (a.shape)
-        # print s
-        # for el in np.array(new_X):
-        #     print el
-        # print 'new_X ' + str(new_X) + ' section_range[0] ' + str(section_range[0]) + ' section_range[1] ' + str(section_range[1])
-        # print new_X.shape
-        # print type(new_X)
         data_X[section_range[0]:section_range[1]] = new_X
         data_y[section_range[0]:section_range[1]] = new_y
         data_X_pure[section_range[0]:section_range[1]] = new_X_pure
@@ -528,8 +472,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
     h5f.close()
 
 
-
-
+# Decoders for inputs and outputs
 def decode_position(y, max_num_vowels):
     max_el = 0
     i = 0
@@ -541,6 +484,7 @@ def decode_position(y, max_num_vowels):
         i += 1
     return [pos % max_num_vowels, pos / max_num_vowels]
 
+
 def decode_input(word_encoded, dictionary):
     word = ''
     for el in word_encoded:
@@ -570,6 +514,7 @@ def generate_input_from_word(word, max_word, dictionary):
         j += 1
     return x
 
+
 def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
     X_el = np.zeros((max_word, len(dictionary)))
     j = 0
@@ -592,6 +537,7 @@ def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
             vowel_i += 1
     return np.array(X), np.array(X_pure)
 
+
 def decode_position_from_vowel_to_final_number(y):
     res = []
     for i in range(len(y)):
@@ -600,6 +546,7 @@ def decode_position_from_vowel_to_final_number(y):
     return res
 
 
+# split content so that there is no overfitting
 def split_content(content, ratio):
     expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
     # print(len(content))
@@ -609,7 +556,6 @@ def split_content(content, ratio):
     np.random.shuffle(s)
 
     split_num = math.floor(len(unique_content) * ratio)
-    validate_content = []
     shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num]
 
     shuffled_unique_train_content_set = set(shuffled_unique_train_content)
@@ -619,4 +565,4 @@ def split_content(content, ratio):
 
     train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
     validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
-    return train_content, validate_content
\ No newline at end of file
+    return train_content, validate_content