From d314a9ee4fdff06bd045aeb06585ddbe61a6e998 Mon Sep 17 00:00:00 2001
From: lkrsnik <krsnik.luka92@gmail.com>
Date: Sat, 1 Jul 2017 15:45:46 +0200
Subject: [PATCH] Created CNN which looks at additional features as well

---
 .idea/workspace.xml |  25 +++--------
 prepare_data.py     | 100 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 92 insertions(+), 33 deletions(-)
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 8a919a4..6344e08 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,20 +2,7 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn.ipynb" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_theano.ipynb" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/cnn_per_vowel_3epoch.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/create_and_save_inputs.py" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_5epoch.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_121sigmoid_mse_adam_a65.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/internal_representations/models/ffnn_i1_s500relu_d20_121sigmoid_mse_adam_a65.h5" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/test.txt" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="" />
-      <change type="DELETED" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/cnn_i2_s_c43-3relu_d20_c43-3relu_mp2_f_516relu_d20_121sigmoid_mse_adam_a65_10epoch_no_overfitting.h5" afterPath="" />
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
+      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/cnn/character_based_ffnn_keras.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -46,8 +33,8 @@
       <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/prepare_data.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="410">
-              <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
+            <state relative-caret-position="442">
+              <caret line="494" column="36" lean-forward="true" selection-start-line="494" selection-start-column="36" selection-end-line="494" selection-end-column="36" />
               <folding>
                 <element signature="e#24#63#0" expanded="true" />
               </folding>
@@ -137,7 +124,6 @@
       <find>nearly_zeros</find>
       <find>rand</find>
       <find>u</find>
-      <find>shuffle_inputs</find>
       <find>num_all_vowels</find>
       <find>load_shuffle_vector</find>
       <find>create_and_save_inputs</find>
@@ -147,6 +133,7 @@
       <find>generate_inputs</find>
       <find>split_number</find>
       <find>StringIO</find>
+      <find>shuffle_inputs</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -990,8 +977,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/prepare_data.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="410">
-          <caret line="545" column="14" lean-forward="true" selection-start-line="545" selection-start-column="14" selection-end-line="545" selection-end-column="14" />
+        <state relative-caret-position="442">
+          <caret line="494" column="36" lean-forward="true" selection-start-line="494" selection-start-column="36" selection-end-line="494" selection-end-column="36" />
           <folding>
             <element signature="e#24#63#0" expanded="true" />
           </folding>
diff --git a/prepare_data.py b/prepare_data.py
index 821edb4..e9c714b 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -8,28 +8,35 @@ import gc
 import math
 
 # functions for saving, loading and shuffling whole arrays to ram
-def save_inputs(file_name, X, y):
+def save_inputs(file_name, X, y, other_features=[]):
     h5f = h5py.File(file_name, 'w')
-    adict = dict(X=X, y=y)
+    if other_features == []:
+        adict = dict(X=X, y=y)
+    else:
+        adict = dict(X=X, X_other_features=other_features, y=y)
     for k, v in adict.items():
-        h5f.create_dataset(k,data=v)
+        h5f.create_dataset(k, data=v)
     h5f.close()
 
-def load_inputs(file_name):
+def load_inputs(file_name, other_features=False):
     h5f = h5py.File(file_name,'r')
     X = h5f['X'][:]
     y = h5f['y'][:]
+    if other_features:
+        X_other_features = h5f['X_other_features'][:]
+        h5f.close()
+        return X, X_other_features, y
 
     h5f.close()
     return X, y
 
 
-def shuffle_inputs(X, y, X_pure=False):
+def shuffle_inputs(X, y, X_pure=[]):
     s = np.arange(X.shape[0])
     np.random.shuffle(s)
     X = X[s]
     y = y[s]
-    if X_pure:
+    if X_pure != []:
         X_pure = X_pure[s]
         return X, y, X_pure
     else:
@@ -40,7 +47,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
     # X, y, X_pure = generate_full_vowel_matrix_inputs()
     h5f = h5py.File(file_name + part + '.h5', 'w')
     adict=dict(X=X, y=y, X_pure=X_pure)
-    for k,v in adict.items():
+    for k, v in adict.items():
         h5f.create_dataset(k,data=v)
     h5f.close()
 
@@ -94,7 +101,7 @@ def load_model(file_name):
 # functions for creating X and y from content
 def read_content():
     print('READING CONTENT...')
-    with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
+    with open('../../../data/SlovarIJS_BESEDE_utf8.lex') as f:
         content = f.readlines()
     print('CONTENT READ SUCCESSFULY')
     return [x.split('\t') for x in content]
@@ -262,17 +269,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
 def generate_full_matrix_inputs():
     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
     train_content, validate_content = split_content(content, 0.2)
+    feature_dictionary = create_feature_dictionary(content)
 
     # Generate X and y
     print('GENERATING X AND y...')
-    X_train, y_train = generate_X_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels)
-    X_validate, y_validate = generate_X_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels)
+    X_train, X_other_features_train, y_train = generate_X_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels, feature_dictionary)
+    X_validate, X_other_features_validate, y_validate = generate_X_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels, feature_dictionary)
     print('GENERATION SUCCESSFUL!')
-    return X_train, y_train, X_validate, y_validate
+    return X_train, X_other_features_train, y_train, X_validate, X_other_features_validate, y_validate
 
-def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels):
+
+def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary):
     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
     X = np.zeros((len(content), max_word, len(dictionary)))
+    print('CREATING OTHER FEATURES...')
+    X_other_features = create_X_features(content, feature_dictionary)
+    print('OTHER FEATURES CREATED!')
 
     i = 0
     for el in content:
@@ -302,9 +314,9 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
         i += 1
 
     print('SHUFFELING INPUTS...')
-    X, y = shuffle_inputs(X, y)
+    X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features)
     print('INPUTS SHUFFELED!')
-    return X, y
+    return X, X_other_features, y
 
 
 def count_vowels(content, vowels):
@@ -473,6 +485,27 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
 
 
 # Decoders for inputs and outputs
+def decode_X_features(feature_dictionary, X_other_features):
+    for word in X_other_features:
+        final_word = []
+        i = 0
+        for z in range(len(feature_dictionary)):
+            for j in range(1, len(feature_dictionary[z])):
+                if j == 1:
+                    if word[i] == 1:
+#                         print feature_dictionary[z][1]
+                        final_word.append(feature_dictionary[z][1])
+                    i += 1
+                else:
+                    for k in range(len(feature_dictionary[z][j])):
+#                         print (i)
+                        if word[i] == 1:
+#                             print feature_dictionary[z][j][k]
+                            final_word.append(feature_dictionary[z][j][k])
+                        i += 1
+        print(u''.join(final_word))
+
+
 def decode_position(y, max_num_vowels):
     max_el = 0
     i = 0
@@ -566,3 +599,42 @@ def split_content(content, ratio):
     train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
     validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
     return train_content, validate_content
+
+
+#  create feature dictionary
+def create_feature_dictionary(content):
+    additional_data = [el[2] for el in content]
+    possible_variants = sorted(set(additional_data))
+    categories = sorted(set([el[0] for el in possible_variants]))
+
+    feature_dictionary = []
+    for category in categories:
+        category_features = [1, category]
+        examples_per_category = [el for el in possible_variants if el[0] == category]
+        longest_element = max(examples_per_category, key=len)
+        for i in range(1, len(longest_element)):
+            possibilities_per_el = sorted(set([el[i] for el in examples_per_category if i < len(el)]))
+            category_features[0] += len(possibilities_per_el)
+            category_features.append(possibilities_per_el)
+        feature_dictionary.append(category_features)
+    return feature_dictionary
+
+
+def create_X_features(content, feature_dictionary):
+    content = content
+    X_other_features = []
+    for el in content:
+        X_el_other_features = []
+        for feature in feature_dictionary:
+            if el[2][0] == feature[1]:
+                X_el_other_features.append(1)
+                for i in range(2, len(feature)):
+                    for j in range(len(feature[i])):
+                        if i-1 < len(el[2]) and feature[i][j] == el[2][i-1]:
+                            X_el_other_features.append(1)
+                        else:
+                            X_el_other_features.append(0)
+            else:
+                X_el_other_features.extend([0] * feature[0])
+        X_other_features.append(X_el_other_features)
+    return np.array(X_other_features)
\ No newline at end of file