Managing validation data to counter overfitting
This commit is contained in:
parent 0e3dd4f88c
commit 3c9edd5d1c
.idea/workspace.xml
@@ -4,6 +4,7 @@
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/word_accetuation/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -34,8 +35,8 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
<state relative-caret-position="266">
<caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -132,6 +133,7 @@
<find>load_extended_inputs</find>
<find>create_and_save_shuffle_vector</find>
<find>h5f</find>
<find>generate_inputs</find>
</findStrings>
</component>
<component name="Git.Settings">
@@ -171,6 +173,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
@@ -218,7 +221,6 @@
</PATH>
</subPane>
</pane>
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
@@ -951,8 +953,8 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="438" column="14" lean-forward="false" selection-start-line="438" selection-start-column="14" selection-end-line="438" selection-end-column="14" />
<state relative-caret-position="266">
<caret line="562" column="17" lean-forward="true" selection-start-line="562" selection-start-column="17" selection-end-line="562" selection-end-column="17" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
prepare_data.py (229 changed lines)
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
import numpy as np
import h5py
import gc
import math
import StringIO
import copy

@@ -75,7 +76,7 @@ def load_model(file_name):

def read_content():
    print('READING CONTENT...')
    with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
    with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
        content = f.readlines()
    print('CONTENT READ SUCCESSFULY')
    return [x.decode('utf8').split('\t') for x in content]
@@ -159,97 +160,98 @@ def shuffle_inputs(X, y, X_pure):
    X_pure = X_pure[s]
    return X, y, X_pure

def generate_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

    print('GENERATING X AND y...')
    X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))

    i = 0
    for el in content:
        j = 0
        for c in list(el[0]):
            index = 0
            for d in dictionary:
                if c == d:
                    X[i][index + j * max_word] = 1
                    break
                index += 1
            j += 1
        j = 0
        word_accetuations = []
        num_vowels = 0
        for c in list(el[3]):
            index = 0
            if is_vowel(el[3], j, vowels):
                num_vowels += 1
            for d in accetuated_vowels:
                if c == d:
                    word_accetuations.append(num_vowels)
                    break
                index += 1
            j += 1
        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
        i += 1
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y


def generate_matrix_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

    print('GENERATING X AND y...')
    # X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))

    X = []

    i = 0
    for el in content:
        # j = 0
        word = []
        for c in list(el[0]):
            index = 0
            character = np.zeros(len(dictionary))
            for d in dictionary:
                if c == d:
                    # X[i][index + j * max_word] = 1
                    character[index] = 1
                    break
                index += 1
            word.append(character)
            # j += 1
        j = 0
        X.append(word)
        word_accetuations = []
        num_vowels = 0
        for c in list(el[3]):
            index = 0
            if is_vowel(el[3], j, vowels):
                num_vowels += 1
            for d in accetuated_vowels:
                if c == d:
                    word_accetuations.append(num_vowels)
                    break
                index += 1
            j += 1
        y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
        i += 1
    X = np.array(X)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y
# def generate_inputs():
#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
#     print('GENERATING X AND y...')
#     X = np.zeros((len(content), max_word*len(dictionary)))
#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
#     i = 0
#     for el in content:
#         j = 0
#         for c in list(el[0]):
#             index = 0
#             for d in dictionary:
#                 if c == d:
#                     X[i][index + j * max_word] = 1
#                     break
#                 index += 1
#             j += 1
#         j = 0
#         word_accetuations = []
#         num_vowels = 0
#         for c in list(el[3]):
#             index = 0
#             if is_vowel(el[3], j, vowels):
#                 num_vowels += 1
#             for d in accetuated_vowels:
#                 if c == d:
#                     word_accetuations.append(num_vowels)
#                     break
#                 index += 1
#             j += 1
#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
#         i += 1
#     print('GENERATION SUCCESSFUL!')
#     print('SHUFFELING INPUTS...')
#     X, y = shuffle_inputs(X, y)
#     print('INPUTS SHUFFELED!')
#     return X, y
#
#
# def generate_matrix_inputs():
#     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
#
#     print('GENERATING X AND y...')
#     # X = np.zeros((len(content), max_word*len(dictionary)))
#     y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
#
#     X = []
#
#     i = 0
#     for el in content:
#         # j = 0
#         word = []
#         for c in list(el[0]):
#             index = 0
#             character = np.zeros(len(dictionary))
#             for d in dictionary:
#                 if c == d:
#                     # X[i][index + j * max_word] = 1
#                     character[index] = 1
#                     break
#                 index += 1
#             word.append(character)
#             # j += 1
#         j = 0
#         X.append(word)
#         word_accetuations = []
#         num_vowels = 0
#         for c in list(el[3]):
#             index = 0
#             if is_vowel(el[3], j, vowels):
#                 num_vowels += 1
#             for d in accetuated_vowels:
#                 if c == d:
#                     word_accetuations.append(num_vowels)
#                     break
#                 index += 1
#             j += 1
#         y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
#         i += 1
#     X = np.array(X)
#     print('GENERATION SUCCESSFUL!')
#     print('SHUFFELING INPUTS...')
#     X, y = shuffle_inputs(X, y)
#     print('INPUTS SHUFFELED!')
#     return X, y


def generate_full_matrix_inputs():
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()



    print('GENERATING X AND y...')
    # X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
@@ -553,3 +555,54 @@ def generate_input_from_word(word, max_word, dictionary):
            index += 1
        j += 1
    return x

def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
    X_el = np.zeros((max_word, len(dictionary)))
    j = 0
    for c in list(word):
        index = 0
        for d in dictionary:
            if c == d:
                X_el[j][index] = 1
                break
            index += 1
        j += 1

    X = []
    X_pure = []
    vowel_i = 0
    for i in range(len(word)):
        if is_vowel(list(word), i, vowels):
            X.append(X_el)
            X_pure.append(vowel_i)
            vowel_i += 1
    return np.array(X), np.array(X_pure)

def decode_position_from_vowel_to_final_number(y):
    res = []
    for i in range(len(y)):
        if y[i][0] > 0.5:
            res.append(i + 1)
    return res


def split_content(content, ratio):
    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
    # print(len(content))
    unique_content = sorted(set(expanded_content))

    s = np.arange(len(unique_content))
    np.random.shuffle(s)

    split_num = math.floor(len(unique_content) * ratio)
    validate_content = []
    shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num]

    shuffled_unique_train_content_set = set(shuffled_unique_train_content)
    shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < split_num]

    shuffled_unique_validate_content_set = set(shuffled_unique_validate_content)

    train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
    validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
    return train_content, validate_content
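For context, a minimal usage sketch of the new split_content() helper, illustrating the commit message's goal of keeping a held-out validation set to counter overfitting. This is not part of the commit: it assumes prepare_data.py is importable as a module, that create_dict() returns the six values shown in the diff above, and that it runs under the Python 2 interpreter the module targets (it imports StringIO); the 0.1 ratio is an illustrative choice.

# Hypothetical usage sketch (not from this commit).
from prepare_data import create_dict, split_content

dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()

# Hold out ~10% of the distinct word forms for validation; the ratio is illustrative.
train_content, validate_content = split_content(content, 0.1)

# The split is made over unique word forms, so no word form appears in both sets
# and validation accuracy is not inflated by words memorised during training.
train_words = set(el[1] if el[1] != '=' else el[0] for el in train_content)
validate_words = set(el[1] if el[1] != '=' else el[0] for el in validate_content)
assert not (train_words & validate_words)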