From 38797b37d40f75d10e68989b985e39f0347e0663 Mon Sep 17 00:00:00 2001
From: lkrsnik <krsnik.luka92@gmail.com>
Date: Fri, 21 Jul 2017 10:48:50 +0200
Subject: [PATCH] [MAJOR UPDATE] Added syllable neural networks

---
 .idea/workspace.xml |  19 ++--
 prepare_data.py     | 216 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 219 insertions(+), 16 deletions(-)
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 576c140..5abb6b2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,9 +2,9 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
-      <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/tex_hyphenation.py" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
+      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -35,8 +35,8 @@
       <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/prepare_data.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="284">
-              <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
+            <state relative-caret-position="662">
+              <caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
               <folding>
                 <element signature="e#24#63#0" expanded="true" />
               </folding>
@@ -150,7 +150,6 @@
       <find>shuffle_inputs</find>
       <find>generator</find>
       <find>content, feature_dictionary</find>
-      <find>decode</find>
       <find>create_feature_dictionary</find>
       <find>with</find>
       <find>read</find>
@@ -160,6 +159,10 @@
       <find>dictionary</find>
       <find>create_dict</find>
       <find>split_content</find>
+      <find>decode_position</find>
+      <find>'r'</find>
+      <find>decode</find>
+      <find>complete_feature_dict</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -187,7 +190,7 @@
     <option name="x" value="65" />
     <option name="y" value="24" />
     <option name="width" value="1855" />
-    <option name="height" value="1176" />
+    <option name="height" value="1056" />
   </component>
   <component name="ProjectView">
     <navigator currentView="ProjectPane" proportions="" version="1">
@@ -482,7 +485,7 @@
     <servers />
   </component>
   <component name="ToolWindowManager">
-    <frame x="65" y="24" width="1855" height="1176" extended-state="6" />
+    <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
     <editor active="true" />
     <layout>
       <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -878,8 +881,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/prepare_data.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="284">
-          <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
+        <state relative-caret-position="662">
+          <caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
           <folding>
             <element signature="e#24#63#0" expanded="true" />
           </folding>
diff --git a/prepare_data.py b/prepare_data.py
index afda497..a7b2eb7 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -124,7 +124,7 @@ def read_content():
 def is_vowel(word_list, position, vowels):
     if word_list[position] in vowels:
         return True
-    if word_list[position] == u'r' and     (position - 1 < 0 or word_list[position - 1] not in vowels) and     (position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
+    if (word_list[position] == u'r' or word_list[position] == u'R') and     (position - 1 < 0 or word_list[position - 1] not in vowels) and     (position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
         return True
     return False
 
@@ -310,7 +310,7 @@ def old_generate_full_matrix_inputs():
 
 
 # Generate each y as an array of 11 numbers (with possible values between 0 and 1)
-def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location):
+def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True):
     y = np.zeros((len(content), max_num_vowels))
     X = np.zeros((len(content), max_word, len(dictionary)))
     print('CREATING OTHER FEATURES...')
@@ -350,10 +350,10 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
             y[i][0] = 1
         # y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
         i += 1
-
-    print('SHUFFELING INPUTS...')
-    X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
-    print('INPUTS SHUFFELED!')
+    if shuffle:
+        print('SHUFFELING INPUTS...')
+        X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
+        print('INPUTS SHUFFELED!')
     return X, X_other_features, y
 
 
@@ -484,6 +484,12 @@ def generate_X_and_y_RAM_efficient(name, split_number):
 
 
 # metric for calculation of correct results
+# test with:
+# print(mean_pred(y_validate[pos], predictions[pos]).eval())
+# print(mean_pred(np.array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
+#                           [ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]),
+#                 np.array([[ 0.,  0.51,  0.,  0.51,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
+#                           [ 0.,  0.92,  0.,  0.51,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])).eval())
 def actual_accuracy(y_true, y_pred):
     return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0))
 
@@ -602,7 +608,17 @@ def decode_X_features(feature_dictionary, X_other_features):
     return u''.join(final_word)
 
 
-def decode_position(y, max_num_vowels):
+def decode_position(y):
+    """Return the positions in `y` whose value is at least 0.5.
+
+    `y` may be any iterable of numbers; positions are zero-based and come
+    back in ascending order. An empty list means no value reached the
+    threshold.
+    """
+    return [position for position, value in enumerate(y) if value >= 0.5]
+
+
+def old_decode_position(y, max_num_vowels):
     max_el = 0
     i = 0
     pos = -1
@@ -785,7 +801,7 @@ def convert_to_MULTEXT_east_v4(old_features, feature_dictionary):
             new_features[2] = '-'
         return new_features[:len(feature_dictionary[3]) - 1]
     if old_features[0] == 'N':
-        if len(old_features) > 5:
+        if len(old_features) >= 7:
             new_features[5] = old_features[7]
         return new_features[:len(feature_dictionary[4]) - 1]
     if old_features[0] == 'P':
@@ -974,3 +990,187 @@ def dict_occurances_in_dataset_rate(content):
 
     case_numbers = np.sum(X_other_features, axis=0)
     print(case_numbers)
+
+
+def get_voiced_consonants():
+    return 'm n v l r j y w'.split()  # a fresh list of single letters on every call
+
+def get_resonant_silent_consonants():
+    return 'b d z ž g'.split()  # a fresh list of single letters on every call
+
+def get_unresonant_silent_consonants():
+    return 'p t s š č k f h c'.split()  # a fresh list of single letters on every call
+
+
+def split_consonants(consonants):
+    """Split a consonant cluster between two neighbouring syllables.
+
+    Returns ``(left, right)``. An empty cluster gives ``([''], [''])``; a
+    single consonant or a cluster with no candidate boundary gives
+    ``([''], consonants)``. Otherwise the cluster is cut between the members
+    of the first adjacent pair with the lowest rank: -1 after '-'/'_',
+    0 doubled letter, 1 resonant+resonant, 2 voiced+any silent,
+    3 resonant+unresonant, 4 unresonant+resonant.
+    """
+    if len(consonants) == 0:
+        return [''], ['']
+    if len(consonants) == 1:
+        return [''], consonants
+
+    voiced = get_voiced_consonants()
+    resonant = get_resonant_silent_consonants()
+    unresonant = get_unresonant_silent_consonants()
+    candidates = []
+    for idx, (current, following) in enumerate(zip(consonants, consonants[1:])):
+        rank = None
+        if current in ('-', '_'):
+            rank = -1
+        elif current == following:
+            rank = 0
+        elif current in voiced:
+            if following in resonant or following in unresonant:
+                rank = 2
+        elif current in resonant:
+            if following in resonant:
+                rank = 1
+            elif following in unresonant:
+                rank = 3
+        elif current in unresonant:
+            if following in resonant:
+                rank = 4
+        else:
+            print(consonants)
+            print('UNRECOGNIZED LETTERS!')
+        if rank is not None:
+            candidates.append([idx, rank])
+    if not candidates:
+        return [''], consonants
+    cut = min(candidates, key=lambda option: option[1])[0] + 1
+    return consonants[:cut], consonants[cut:]
+
+
+def create_syllables(word, vowels):
+    """Split `word` into syllables, one per vowel reported by `is_vowel`.
+
+    Consonant clusters between vowels are divided by `split_consonants`.
+    A word with no vowel is returned unchanged as a string, not a list.
+    """
+    letters = list(word)
+    pending = []
+    syllables = []
+    for position, letter in enumerate(letters):
+        if not is_vowel(letters, position, vowels):
+            pending.append(letter)
+            continue
+        if syllables:
+            left, pending = split_consonants(pending)
+            syllables[-1] += ''.join(left)
+        syllables.append(''.join(pending) + letter)
+        pending = []
+    if not syllables:
+        return word
+    syllables[-1] += ''.join(pending)
+    return syllables
+
+
+def create_syllables_dictionary(content, vowels):
+    """Return the sorted list of distinct syllables in `content`, plus ''.
+
+    Uses a set so collecting syllables is linear rather than quadratic.
+    """
+    unique_syllables = {''}
+    for el in content:
+        unique_syllables.update(create_syllables(el[0], vowels))
+    return sorted(unique_syllables)
+
+
+def generate_syllable_inputs(content_shuffle_vector_location, shuffle_vector_location):
+    """Return X, X_other_features and y for the train, test and validate splits, in that order."""
+    _, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+    train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location)
+    feature_dictionary = create_feature_dictionary()
+    print('CREATING SYLLABLE DICTIONARY...')
+    syllable_dictionary = create_syllables_dictionary(content, vowels)
+    print('CREATION SUCCESSFUL!')
+    # One generate_syllable_X_and_y call per split, each with its own shuffle-vector file suffix.
+    print('GENERATING X AND y...')
+    outputs = []
+    for part, suffix in ((train_content, '_train.h5'), (test_content, '_test.h5'), (validate_content, '_validate.h5')):
+        outputs.extend(generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, part, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + suffix))
+    print('GENERATION SUCCESSFUL!')
+    return tuple(outputs)
+
+
+# Encode each word as syllable-dictionary indices (X) and mark its accentuated vowels (y).
+def generate_syllable_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True):
+    """Build syllable-index inputs and accentuation targets for `content`.
+
+    For each entry, `el[0]` is split with `create_syllables` and the position
+    of every syllable in `dictionary` is written to one row of `X`. `el[3]` is
+    scanned for characters from `accetuated_vowels`; `y` gets a 1 at the
+    running vowel count of each match, or at column 0 when there is none.
+
+    `X` (int) and `y` both have shape `(len(content), max_num_vowels)`;
+    `max_word` is accepted but not used. Returns `(X, X_other_features, y)`,
+    passed through `shuffle_inputs` first when `shuffle` is true.
+
+    Raises `KeyError` for a syllable that is missing from `dictionary`.
+    """
+    y = np.zeros((len(content), max_num_vowels))
+    X = np.zeros((len(content), max_num_vowels), dtype=int)
+    print('CREATING OTHER FEATURES...')
+    X_other_features = create_X_features(content, feature_dictionary)
+    print('OTHER FEATURES CREATED!')
+
+    # Map each syllable to its first position once, instead of running
+    # `dictionary.index` (a linear scan) for every syllable of every word.
+    syllable_index = {}
+    for position, syllable in enumerate(dictionary):
+        syllable_index.setdefault(syllable, position)
+
+    for i, el in enumerate(content):
+        # Slots past the last syllable keep the 0 that `np.zeros` put there.
+        # NOTE(review): `X` has `max_num_vowels` columns, so a word that yields
+        # more syllables than that raises IndexError here.
+        for j, syllable in enumerate(create_syllables(el[0], vowels)):
+            X[i][j] = syllable_index[syllable]
+
+        # `num_vowels` counts the vowels seen so far, including the current
+        # character, at the moment a match is recorded.
+        word_accetuations = []
+        num_vowels = 0
+        for j, c in enumerate(el[3]):
+            if is_vowel(el[3], j, vowels):
+                num_vowels += 1
+            if c in accetuated_vowels:
+                word_accetuations.append(num_vowels)
+        if word_accetuations:
+            for accetuation in word_accetuations:
+                y[i][accetuation] = 1
+        else:
+            y[i][0] = 1
+
+    if shuffle:
+        print('SHUFFELING INPUTS...')
+        X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
+        print('INPUTS SHUFFELED!')
+    return X, X_other_features, y
+
+
+# generator for inputs for tracking of data fitting
+def generate_fake_epoch_syllables(orig_X, orig_X_additional, orig_y, batch_size, dictionary_size=5168):
+    """Yield `([one_hot_X, X_additional], y)` batches forever, in order.
+
+    `orig_X` holds integer indices; indexing an identity matrix of size
+    `dictionary_size` with them expands each index into a one-hot row.
+    The last batch of each pass is shorter when `batch_size` does not
+    divide the number of rows, after which iteration restarts at row 0.
+    """
+    size = orig_X.shape[0]
+    eye = np.eye(dictionary_size, dtype=int)
+    while True:
+        start = 0
+        while start < size:
+            stop = min(start + batch_size, size)
+            yield([eye[orig_X[start:stop]], orig_X_additional[start:stop]], orig_y[start:stop])
+            start += batch_size