From 18348b78fc58e0fae9c8eed3b0da455fbb8bf378 Mon Sep 17 00:00:00 2001
From: lkrsnik <krsnik.luka92@gmail.com>
Date: Thu, 27 Jul 2017 18:20:18 +0200
Subject: [PATCH] [MAJOR REFACTOR] Added accent classification (from scratch)
 and deleted unnecessary y output (output where no accent should be employed).
 The X-es for both syllabled input types have also been changed in a similar
 manner.

---
 .idea/workspace.xml |  86 +++++++++++++++-------------
 prepare_data.py     | 136 ++++++++++++++++++++++++++++++++------------
 2 files changed, 146 insertions(+), 76 deletions(-)
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index a07fbfc..3971ed8 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,10 +2,8 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" afterPath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
@@ -37,22 +35,25 @@
       <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/prepare_data.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="568">
-              <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+            <state relative-caret-position="244">
+              <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
               <folding>
                 <element signature="e#24#63#0" expanded="true" />
-                <element signature="e#5524#5637#0" expanded="false" />
-                <element signature="e#5684#6970#0" expanded="false" />
-                <element signature="e#7131#8538#0" expanded="false" />
-                <element signature="e#8626#8921#0" expanded="false" />
-                <element signature="e#13363#13665#0" expanded="false" />
-                <element signature="e#13722#14551#0" expanded="false" />
-                <element signature="e#14615#14961#0" expanded="false" />
-                <element signature="e#16836#17749#0" expanded="false" />
-                <element signature="e#18179#18375#0" expanded="false" />
-                <element signature="e#18436#18627#0" expanded="false" />
-                <element signature="e#18694#19341#0" expanded="false" />
-                <element signature="e#19440#21738#0" expanded="false" />
+                <element signature="e#5658#5771#0" expanded="false" />
+                <element signature="e#5818#7106#0" expanded="false" />
+                <element signature="e#7267#8674#0" expanded="false" />
+                <element signature="e#8762#9057#0" expanded="false" />
+                <element signature="e#13496#13798#0" expanded="false" />
+                <element signature="e#13855#14684#0" expanded="false" />
+                <element signature="e#14748#15094#0" expanded="false" />
+                <element signature="e#16969#17882#0" expanded="false" />
+                <element signature="e#18312#18508#0" expanded="false" />
+                <element signature="e#18569#18760#0" expanded="false" />
+                <element signature="e#18827#19474#0" expanded="false" />
+                <element signature="e#19573#21871#0" expanded="false" />
+                <element signature="e#22137#22836#0" expanded="false" />
+                <element signature="e#29631#29772#0" expanded="false" />
+                <element signature="e#29922#32067#0" expanded="false" />
               </folding>
             </state>
           </provider>
@@ -149,16 +150,6 @@
   </component>
   <component name="FindInProjectRecents">
     <findStrings>
-      <find>_create_X_features</find>
-      <find>raise</find>
-      <find>create_syllables_dictionary</find>
-      <find>decode_</find>
-      <find>create_x_features</find>
-      <find>generate_x_and_y</find>
-      <find>create_syllables</find>
-      <find>split_consonants</find>
-      <find>get_unresonant_silent_consonants</find>
-      <find>dict_occurances_in_dataset_rate</find>
       <find>count_vowels</find>
       <find>shuffle_full_vowel_inputs</find>
       <find>generate_presentable_y</find>
@@ -179,6 +170,16 @@
       <find>_create_syllable_letters_translator</find>
       <find>_get_unresonant_silent_consonants</find>
       <find>el[0]</find>
+      <find>max_num_vowels</find>
+      <find>index</find>
+      <find>accentuated</find>
+      <find>create_syll</find>
+      <find>shuffle_all_inputs</find>
+      <find>accented</find>
+      <find>_accented</find>
+      <find>size</find>
+      <find>decode_x</find>
+      <find>self._input_type ==</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -532,7 +533,7 @@
   </component>
   <component name="XDebuggerManager">
     <breakpoint-manager>
-      <option name="time" value="5" />
+      <option name="time" value="6" />
     </breakpoint-manager>
     <watches-manager />
   </component>
@@ -831,22 +832,25 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/prepare_data.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="568">
-          <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+        <state relative-caret-position="244">
+          <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
           <folding>
             <element signature="e#24#63#0" expanded="true" />
-            <element signature="e#5524#5637#0" expanded="false" />
-            <element signature="e#5684#6970#0" expanded="false" />
-            <element signature="e#7131#8538#0" expanded="false" />
-            <element signature="e#8626#8921#0" expanded="false" />
-            <element signature="e#13363#13665#0" expanded="false" />
-            <element signature="e#13722#14551#0" expanded="false" />
-            <element signature="e#14615#14961#0" expanded="false" />
-            <element signature="e#16836#17749#0" expanded="false" />
-            <element signature="e#18179#18375#0" expanded="false" />
-            <element signature="e#18436#18627#0" expanded="false" />
-            <element signature="e#18694#19341#0" expanded="false" />
-            <element signature="e#19440#21738#0" expanded="false" />
+            <element signature="e#5658#5771#0" expanded="false" />
+            <element signature="e#5818#7106#0" expanded="false" />
+            <element signature="e#7267#8674#0" expanded="false" />
+            <element signature="e#8762#9057#0" expanded="false" />
+            <element signature="e#13496#13798#0" expanded="false" />
+            <element signature="e#13855#14684#0" expanded="false" />
+            <element signature="e#14748#15094#0" expanded="false" />
+            <element signature="e#16969#17882#0" expanded="false" />
+            <element signature="e#18312#18508#0" expanded="false" />
+            <element signature="e#18569#18760#0" expanded="false" />
+            <element signature="e#18827#19474#0" expanded="false" />
+            <element signature="e#19573#21871#0" expanded="false" />
+            <element signature="e#22137#22836#0" expanded="false" />
+            <element signature="e#29631#29772#0" expanded="false" />
+            <element signature="e#29922#32067#0" expanded="false" />
           </folding>
         </state>
       </provider>
diff --git a/prepare_data.py b/prepare_data.py
index d3a882d..9e58988 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -11,13 +11,14 @@ import os.path
 
 class Data:
     def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
-                 additional_letter_attributes=True, reverse_inputs=True):
+                 additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
         self._input_type = input_type
         self._save_generated_data = save_generated_data
         self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
         self._shuffle_all_inputs = shuffle_all_inputs
         self._additional_letter_attributes = additional_letter_attributes
         self._reverse_inputs = reverse_inputs
+        self._accent_classification = accent_classification
 
         self.x_train = None
         self.x_other_features_train = None
@@ -30,14 +31,14 @@ class Data:
         self.y_validate = None
 
     def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
-                      content_name='SlovarIJS_BESEDE_utf8.lex',
+                      force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
                       content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
                       inputs_location='../../internal_representations/inputs/', content_location='../../../data/'):
         content_path = '{}{}'.format(content_location, content_name)
         train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
         test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
         validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
-        if os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
+        if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
             print('LOADING DATA...')
             self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
             self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
@@ -62,7 +63,7 @@ class Data:
         print('CONTENT READ SUCCESSFULLY')
         print('CREATING DICTIONARY...')
         dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
-        if self._input_type == 's' or self._input_type == 'ls':
+        if self._input_type == 's' or self._input_type == 'sl':
             dictionary = self._create_syllables_dictionary(content, vowels)
         print('DICTIONARY CREATION SUCCESSFUL!')
         # test_and_validation_size = 0.1
@@ -125,7 +126,7 @@ class Data:
                 break
             line += 1
         dictionary_input = sorted(dictionary_input)
-        max_num_vowels += 1
+        # max_num_vowels += 1
         return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels
 
     # split content so that there is no overfitting
@@ -230,23 +231,22 @@ class Data:
                 word = word[::-1]
 
             j = 0
-            word_accentuations = []
+            # word_accentuations = []
             num_vowels = 0
             for c in list(word):
                 index = 0
-                if self._is_vowel(word, j, vowels):
-                    num_vowels += 1
                 for d in accentuated_vowels:
                     if c == d:
-                        word_accentuations.append(num_vowels)
+                        if not self._accent_classification:
+                            y[i][num_vowels] = 1
+                        else:
+                            y[i][num_vowels] = index
+                        # word_accentuations.append(num_vowels)
                         break
                     index += 1
+                if self._is_vowel(word, j, vowels):
+                    num_vowels += 1
                 j += 1
-            if len(word_accentuations) > 0:
-                for word_accentuation in word_accentuations:
-                    y[i][word_accentuation] = 1
-            else:
-                y[i][0] = 1
             i += 1
         return y
 
@@ -255,10 +255,10 @@ class Data:
                           shuffle_vector_location):
         if self._input_type == 'l':
             x = self._x_letter_input(content, dictionary, max_word, vowels)
-        elif self._input_type == 's' or self._input_type == 'ls':
+        elif self._input_type == 's' or self._input_type == 'sl':
             x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
         else:
-            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'ls\'.')
+            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
         y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
 
         print('CREATING OTHER FEATURES...')
@@ -476,46 +476,112 @@ class Data:
 
     def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
         if self._input_type == 'l':
-            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size)
+            content = self._read_content(content_path)
+            dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
+            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
         elif self._input_type == 's':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             eye = np.eye(len(syllable_dictionary), dtype=int)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
         elif self._input_type == 'sl':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             max_syllable = self._get_max_syllable(syllable_dictionary)
             syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
 
     # generator for inputs for tracking of data fitting
-    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size):
+    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
         size = orig_x.shape[0]
         while 1:
             loc = 0
-            while loc < size:
-                if loc + batch_size >= size:
-                    yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
-                else:
-                    yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
-                loc += batch_size
+            if self._accent_classification:
+                eye = np.eye(len(accented_vowels), dtype=int)
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
+                input_x_stack = []
+                input_x_other_features_stack = []
+                input_y_stack = []
+                while loc < size:
+                    while len(input_x_stack) < batch_size and loc < size:
+                        accent_loc = 0
+                        for accent in orig_y[loc]:
+                            if accent > 0:
+                                new_orig_x_additional = orig_x_additional[loc]
+                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
+                                input_x_stack.append(orig_x[loc])
+                                input_x_other_features_stack.append(new_orig_x_additional)
+                                input_y_stack.append(eye[int(accent)])
+                            accent_loc += 1
+                        loc += 1
+                    if len(input_x_stack) > batch_size:
+                        yield ([np.array(input_x_stack[:batch_size]),
+                                np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
+                        input_x_stack = input_x_stack[batch_size:]
+                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
+                        input_y_stack = input_y_stack[batch_size:]
+                    else:
+                        # print('BBB')
+                        # print(np.array(input_stack))
+                        # yield (np.array(input_stack))
+                        yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
+                        input_x_stack = []
+                        input_x_other_features_stack = []
+                        input_y_stack = []
+            else:
+                while loc < size:
+                    if loc + batch_size >= size:
+                        yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
+                    else:
+                        yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                    loc += batch_size
 
     # generator for inputs for tracking of data fitting
-    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator):
+    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
         size = orig_x.shape[0]
         while 1:
             loc = 0
-            while loc < size:
-                if loc + batch_size >= size:
-                    gen_orig_x = translator[orig_x[loc:size]]
-                    yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
-                else:
-                    gen_orig_x = translator[orig_x[loc:loc + batch_size]]
-                    yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
-                loc += batch_size
+            if self._accent_classification:
+                eye = np.eye(len(accented_vowels), dtype=int)
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
+                input_x_stack = []
+                input_x_other_features_stack = []
+                input_y_stack = []
+                while loc < size:
+                    while len(input_x_stack) < batch_size and loc < size:
+                        accent_loc = 0
+                        for accent in orig_y[loc]:
+                            if accent > 0:
+                                new_orig_x_additional = orig_x_additional[loc]
+                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
+                                input_x_stack.append(orig_x[loc])
+                                input_x_other_features_stack.append(new_orig_x_additional)
+                                input_y_stack.append(eye[int(accent)])
+                            accent_loc += 1
+                        loc += 1
+                    if len(input_x_stack) > batch_size:
+                        gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
+                        yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
+                        input_x_stack = input_x_stack[batch_size:]
+                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
+                        input_y_stack = input_y_stack[batch_size:]
+                    else:
+                        gen_orig_x = translator[np.array(input_x_stack)]
+                        yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
+                        input_x_stack = []
+                        input_x_other_features_stack = []
+                        input_y_stack = []
+            else:
+                while loc < size:
+                    if loc + batch_size >= size:
+                        gen_orig_x = translator[orig_x[loc:size]]
+                        yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
+                    else:
+                        gen_orig_x = translator[orig_x[loc:loc + batch_size]]
+                        yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                    loc += batch_size
 
     def _get_max_syllable(self, syllable_dictionary):
         max_len = 0