[MAJOR UPDATE] Added syllabled-letters functionality and added letter attributes to letter accentuation
This commit is contained in:
parent
38797b37d4
commit
76dc71ce55
.idea/workspace.xml
@@ -4,7 +4,7 @@
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -35,8 +35,8 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="662">
<caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
<state relative-caret-position="378">
<caret line="1024" column="33" lean-forward="false" selection-start-line="1024" selection-start-column="33" selection-end-line="1024" selection-end-column="33" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -156,13 +156,14 @@
<find>generate</find>
<find>shuffle</find>
<find>X_</find>
<find>dictionary</find>
<find>create_dict</find>
<find>split_content</find>
<find>decode_position</find>
<find>'r'</find>
<find>decode</find>
<find>complete_feature_dict</find>
<find>dictionary</find>
<find>voiced_consonants</find>
</findStrings>
</component>
<component name="Git.Settings">
@@ -207,6 +208,8 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="Scratches" />
<pane id="ProjectPane">
<subPane>
<PATH>
@@ -221,8 +224,6 @@
</PATH>
</subPane>
</pane>
<pane id="Scope" />
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
@@ -881,8 +882,8 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="662">
<caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
<state relative-caret-position="378">
<caret line="1024" column="33" lean-forward="false" selection-start-line="1024" selection-start-column="33" selection-end-line="1024" selection-end-column="33" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
prepare_data.py (131 changed lines)
@@ -310,9 +310,16 @@ def old_generate_full_matrix_inputs():
 
 
 # Generate each y as an array of 11 numbers (with possible values between 0 and 1)
-def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True):
+def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True, aditional_letter_attributes=True):
     y = np.zeros((len(content), max_num_vowels))
-    X = np.zeros((len(content), max_word, len(dictionary)))
+    if aditional_letter_attributes:
+        X = np.zeros((len(content), max_word, len(dictionary) + 6))
+        voiced_consonants = get_voiced_consonants()
+        resonant_silent_consonants = get_resonant_silent_consonants()
+        unresonant_silent_consonants = get_unresonant_silent_consonants()
+
+    else:
+        X = np.zeros((len(content), max_word, len(dictionary)))
     print('CREATING OTHER FEATURES...')
     X_other_features = create_X_features(content, feature_dictionary)
     print('OTHER FEATURES CREATED!')
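A quick shape sketch for the allocation in the hunk above (the sizes are toy numbers, not from the dataset): with aditional_letter_attributes=True every letter position carries the one-hot dictionary columns plus 6 extra attribute flags.

# Toy numbers only, e.g. 10000 words, max_word = 20, a 30-letter dictionary:
# X.shape == (10000, 20, 30 + 6)   with the additional letter attributes
# X.shape == (10000, 20, 30)       without them
feature_width = 30 + 6   # i.e. len(dictionary) + 6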
@@ -327,6 +334,20 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
                     X[i][j][index] = 1
                     break
                 index += 1
+            if aditional_letter_attributes:
+                if is_vowel(el[0], j, vowels):
+                    X[i][j][len(dictionary)] = 1
+                else:
+                    X[i][j][len(dictionary) + 1] = 1
+                    if c in voiced_consonants:
+                        X[i][j][len(dictionary) + 2] = 1
+                    else:
+                        X[i][j][len(dictionary) + 3] = 1
+                        if c in resonant_silent_consonants:
+                            X[i][j][len(dictionary) + 4] = 1
+                        elif c in unresonant_silent_consonants:
+                            X[i][j][len(dictionary) + 5] = 1
+
             j += 1
         j = 0
         word_accetuations = []
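The six extra columns set in the hunk above follow the layout [vowel, consonant, voiced, silent, resonant-silent, unresonant-silent]. A minimal, self-contained sketch of that flag assignment follows; it is not the repository's code, the simplified VOWELS set and the helper name letter_attributes are assumptions, and the consonant lists are copied from the helpers referenced further down in this diff.

# Sketch of the 6 additional letter-attribute flags:
# [vowel, consonant, voiced, silent, resonant-silent, unresonant-silent]
VOICED_CONSONANTS = {'m', 'n', 'v', 'l', 'r', 'j', 'y', 'w'}
RESONANT_SILENT_CONSONANTS = {'b', 'd', 'z', 'ž', 'g'}
UNRESONANT_SILENT_CONSONANTS = {'p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c'}
VOWELS = {'a', 'e', 'i', 'o', 'u'}   # simplified; the real code goes through is_vowel()

def letter_attributes(ch):
    flags = [0, 0, 0, 0, 0, 0]
    if ch in VOWELS:
        flags[0] = 1                      # vowel
    else:
        flags[1] = 1                      # consonant
        if ch in VOICED_CONSONANTS:
            flags[2] = 1                  # voiced
        else:
            flags[3] = 1                  # silent (unvoiced)
            if ch in RESONANT_SILENT_CONSONANTS:
                flags[4] = 1              # resonant silent
            elif ch in UNRESONANT_SILENT_CONSONANTS:
                flags[5] = 1              # unresonant silent
    return flags

# letter_attributes('a') -> [1, 0, 0, 0, 0, 0]
# letter_attributes('m') -> [0, 1, 1, 0, 0, 0]
# letter_attributes('t') -> [0, 1, 0, 1, 0, 1]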
@@ -991,7 +1012,6 @@ def dict_occurances_in_dataset_rate(content):
     case_numbers = np.sum(X_other_features, axis=0)
-    print(case_numbers)
 
 
 def get_voiced_consonants():
     return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w']
 
@@ -1003,15 +1023,6 @@ def get_unresonant_silent_consonants():
 
 
 def split_consonants(consonants):
-    # def voiced_consonants():
-    # return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w']
-    #
-    # def resonant_silent_consonants():
-    # return ['b', 'd', 'z', 'ž', 'g']
-    #
-    # def unresonant_silent_consonants():
-    # return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c']
-    # test = get_voiced_consonants()
     voiced_consonants = get_voiced_consonants()
     resonant_silent_consonants = get_resonant_silent_consonants()
     unresonant_silent_consonants = get_unresonant_silent_consonants()
@@ -1146,7 +1157,7 @@ def generate_syllable_X_and_y(dictionary, max_word, max_num_vowels, content, vow
         i += 1
     # print(len(X))
     # print(X[0])
-    X = np.array(X)
+    # X = np.array(X)
     # print(X.shape)
     # print(X[0])
     # print(len(X))
@@ -1174,3 +1185,97 @@ def generate_fake_epoch_syllables(orig_X, orig_X_additional, orig_y, batch_size,
                 gen_orig_X = eye[orig_X[loc:loc + batch_size]]
                 yield([gen_orig_X, orig_X_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
             loc += batch_size
+
+
+def get_max_syllable(syllable_dictionary):
+    max_len = 0
+    for el in syllable_dictionary:
+        if len(el) > max_len:
+            max_len = len(el)
+    return max_len
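A minimal usage sketch for get_max_syllable added above (the toy syllable list is an assumption):

from prepare_data import get_max_syllable

# returns the length of the longest entry in the syllable dictionary
print(get_max_syllable(['a', 'pri', 'mer']))   # -> 3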
+def generate_syllabled_letters_inputs(content_shuffle_vector_location, shuffle_vector_location):
+    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+    train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location)
+    feature_dictionary = create_feature_dictionary()
+    print('CREATING SYLLABLE DICTIONARY...')
+    syllable_dictionary = create_syllables_dictionary(content, vowels)
+    max_syllable = get_max_syllable(syllable_dictionary)
+    print('CREATION SUCCESSFUL!')
+
+    # Generate X and y
+    print('GENERATING X AND y...')
+    X_train, X_other_features_train, y_train = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, train_content, vowels,
+                                                                         accetuated_vowels, feature_dictionary, shuffle_vector_location + '_train.h5')
+    X_test, X_other_features_test, y_test = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, test_content, vowels,
+                                                                      accetuated_vowels, feature_dictionary, shuffle_vector_location + '_test.h5')
+    X_validate, X_other_features_validate, y_validate = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, validate_content,
+                                                                                  vowels, accetuated_vowels, feature_dictionary,
+                                                                                  shuffle_vector_location + '_validate.h5')
+
+    print('GENERATION SUCCESSFUL!')
+    return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate
+# generator for inputs for tracking of data fitting
+def generate_fake_epoch_syllabled_letters(orig_X, orig_X_additional, orig_y, batch_size, syllable_letters_translator):
+    size = orig_X.shape[0]
+    # eye = np.eye(dictionary_size, dtype=int)
+    while 1:
+        loc = 0
+        while loc < size:
+            if loc + batch_size >= size:
+                # [eye[i] for i in range(size-loc)]
+                # gen_orig_X = eye[orig_X[loc:size]]
+                # gen_orig_X = [eye[i] for i in range(size-loc)]
+                gen_orig_X = syllable_letters_translator[orig_X[loc:size]]
+                yield([gen_orig_X, orig_X_additional[loc:size]], orig_y[loc:size])
+            else:
+                gen_orig_X = syllable_letters_translator[orig_X[loc:loc + batch_size]]
+                yield([gen_orig_X, orig_X_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+            loc += batch_size
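A hedged end-to-end sketch of how the pieces above fit together (the file paths, batch size and the Keras model are placeholders, not from this commit; the translator arguments are rebuilt via create_dict()/create_syllables_dictionary() because generate_syllabled_letters_inputs() does not return them, and create_syllable_letters_translator() is the function defined just below):

import prepare_data as pd

(X_train, X_ft_train, y_train,
 X_test, X_ft_test, y_test,
 X_val, X_ft_val, y_val) = pd.generate_syllabled_letters_inputs('content_shuffle.h5', 'shuffle_vector')

# Rebuild what the translator needs; this mirrors the steps inside the loader above.
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = pd.create_dict()
syllable_dictionary = pd.create_syllables_dictionary(content, vowels)
translator = pd.create_syllable_letters_translator(pd.get_max_syllable(syllable_dictionary),
                                                   syllable_dictionary, dictionary, vowels)

batch_size = 16
train_gen = pd.generate_fake_epoch_syllabled_letters(X_train, X_ft_train, y_train, batch_size, translator)
# Each yield is ([syllable_matrix_batch, other_features_batch], y_batch), which fits a
# two-input Keras model, e.g.:
# model.fit_generator(train_gen, steps_per_epoch=len(X_train) // batch_size, epochs=1)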
+def create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels, aditional_letter_attributes=True):
+    if aditional_letter_attributes:
+        voiced_consonants = get_voiced_consonants()
+        resonant_silent_consonants = get_resonant_silent_consonants()
+        unresonant_silent_consonants = get_unresonant_silent_consonants()
+
+    syllable_letters_translator = []
+    for syllable in syllable_dictionary:
+        di_syllable = []
+        for let in range(max_syllable):
+            # di_let = []
+            for a in dictionary:
+                if let < len(syllable) and a == list(syllable)[let]:
+                    di_syllable.append(1)
+                else:
+                    di_syllable.append(0)
+
+            if aditional_letter_attributes:
+                if let >= len(syllable):
+                    di_syllable.extend([0, 0, 0, 0, 0, 0])
+                elif is_vowel(list(syllable), let, vowels):
+                    di_syllable.extend([1, 0, 0, 0, 0, 0])
+                else:
+                    # X[i][j][len(dictionary) + 1] = 1
+                    if list(syllable)[let] in voiced_consonants:
+                        # X[i][j][len(dictionary) + 2] = 1
+                        di_syllable.extend([0, 1, 1, 0, 0, 0])
+                    else:
+                        # X[i][j][len(dictionary) + 3] = 1
+                        if list(syllable)[let] in resonant_silent_consonants:
+                            # X[i][j][len(dictionary) + 4] = 1
+                            di_syllable.extend([0, 1, 0, 1, 1, 0])
+                        elif list(syllable)[let] in unresonant_silent_consonants:
+                            # X[i][j][len(dictionary) + 5] = 1
+                            di_syllable.extend([0, 1, 0, 1, 0, 1])
+                        else:
+                            di_syllable.extend([0, 0, 0, 0, 0, 0])
+            # di_syllable.append(di_let)
+        syllable_letters_translator.append(di_syllable)
+    syllable_letters_translator = np.array(syllable_letters_translator, dtype=int)
+    return syllable_letters_translator
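For orientation, the row width this translator produces (toy numbers, not from the dataset): each of the max_syllable letter slots contributes len(dictionary) one-hot positions plus, with aditional_letter_attributes on, the 6 attribute flags.

# e.g. a 30-letter dictionary and max_syllable = 5:
row_len = 5 * (30 + 6)   # 180 values per syllable row
# so syllable_letters_translator.shape == (len(syllable_dictionary), 180)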