[MAJOR REFACTOR] Added accent classification (from scratch) and removed the unnecessary y output (the output used where no accent should be placed). The X inputs for both syllable-based input types have also been changed in a similar manner.

2017-07-27 18:20:18 +02:00 · 2017-07-27 18:20:18 +02:00 · 18348b78fc
commit 18348b78fc
parent 83584a0c6f
2 changed files with 146 additions and 76 deletions
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,10 +2,8 @@
 <project version="4">
  <component name="ChangeListManager">
    <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" afterPath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" />
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" />
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
    </list>
@@ -37,22 +35,25 @@
      <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
        <entry file="file://$PROJECT_DIR$/prepare_data.py">
          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="568">
+            <state relative-caret-position="244">
-              <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+              <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
              <folding>
                <element signature="e#24#63#0" expanded="true" />
-                <element signature="e#5524#5637#0" expanded="false" />
+                <element signature="e#5658#5771#0" expanded="false" />
-                <element signature="e#5684#6970#0" expanded="false" />
+                <element signature="e#5818#7106#0" expanded="false" />
-                <element signature="e#7131#8538#0" expanded="false" />
+                <element signature="e#7267#8674#0" expanded="false" />
-                <element signature="e#8626#8921#0" expanded="false" />
+                <element signature="e#8762#9057#0" expanded="false" />
-                <element signature="e#13363#13665#0" expanded="false" />
+                <element signature="e#13496#13798#0" expanded="false" />
-                <element signature="e#13722#14551#0" expanded="false" />
+                <element signature="e#13855#14684#0" expanded="false" />
-                <element signature="e#14615#14961#0" expanded="false" />
+                <element signature="e#14748#15094#0" expanded="false" />
-                <element signature="e#16836#17749#0" expanded="false" />
+                <element signature="e#16969#17882#0" expanded="false" />
-                <element signature="e#18179#18375#0" expanded="false" />
+                <element signature="e#18312#18508#0" expanded="false" />
-                <element signature="e#18436#18627#0" expanded="false" />
+                <element signature="e#18569#18760#0" expanded="false" />
-                <element signature="e#18694#19341#0" expanded="false" />
+                <element signature="e#18827#19474#0" expanded="false" />
-                <element signature="e#19440#21738#0" expanded="false" />
+                <element signature="e#19573#21871#0" expanded="false" />
                <element signature="e#22137#22836#0" expanded="false" />
                <element signature="e#29631#29772#0" expanded="false" />
                <element signature="e#29922#32067#0" expanded="false" />
              </folding>
            </state>
          </provider>
@@ -149,16 +150,6 @@
  </component>
  <component name="FindInProjectRecents">
    <findStrings>
      <find>_create_X_features</find>
      <find>raise</find>
      <find>create_syllables_dictionary</find>
      <find>decode_</find>
      <find>create_x_features</find>
      <find>generate_x_and_y</find>
      <find>create_syllables</find>
      <find>split_consonants</find>
      <find>get_unresonant_silent_consonants</find>
      <find>dict_occurances_in_dataset_rate</find>
      <find>count_vowels</find>
      <find>shuffle_full_vowel_inputs</find>
      <find>generate_presentable_y</find>
@@ -179,6 +170,16 @@
      <find>_create_syllable_letters_translator</find>
      <find>_get_unresonant_silent_consonants</find>
      <find>el[0]</find>
      <find>max_num_vowels</find>
      <find>index</find>
      <find>accentuated</find>
      <find>create_syll</find>
      <find>shuffle_all_inputs</find>
      <find>accented</find>
      <find>_accented</find>
      <find>size</find>
      <find>decode_x</find>
      <find>self._input_type ==</find>
    </findStrings>
  </component>
  <component name="Git.Settings">
@@ -532,7 +533,7 @@
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager>
-      <option name="time" value="5" />
+      <option name="time" value="6" />
    </breakpoint-manager>
    <watches-manager />
  </component>
@@ -831,22 +832,25 @@
    </entry>
    <entry file="file://$PROJECT_DIR$/prepare_data.py">
      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="568">
+        <state relative-caret-position="244">
-          <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+          <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
          <folding>
            <element signature="e#24#63#0" expanded="true" />
-            <element signature="e#5524#5637#0" expanded="false" />
+            <element signature="e#5658#5771#0" expanded="false" />
-            <element signature="e#5684#6970#0" expanded="false" />
+            <element signature="e#5818#7106#0" expanded="false" />
-            <element signature="e#7131#8538#0" expanded="false" />
+            <element signature="e#7267#8674#0" expanded="false" />
-            <element signature="e#8626#8921#0" expanded="false" />
+            <element signature="e#8762#9057#0" expanded="false" />
-            <element signature="e#13363#13665#0" expanded="false" />
+            <element signature="e#13496#13798#0" expanded="false" />
-            <element signature="e#13722#14551#0" expanded="false" />
+            <element signature="e#13855#14684#0" expanded="false" />
-            <element signature="e#14615#14961#0" expanded="false" />
+            <element signature="e#14748#15094#0" expanded="false" />
-            <element signature="e#16836#17749#0" expanded="false" />
+            <element signature="e#16969#17882#0" expanded="false" />
-            <element signature="e#18179#18375#0" expanded="false" />
+            <element signature="e#18312#18508#0" expanded="false" />
-            <element signature="e#18436#18627#0" expanded="false" />
+            <element signature="e#18569#18760#0" expanded="false" />
-            <element signature="e#18694#19341#0" expanded="false" />
+            <element signature="e#18827#19474#0" expanded="false" />
-            <element signature="e#19440#21738#0" expanded="false" />
+            <element signature="e#19573#21871#0" expanded="false" />
            <element signature="e#22137#22836#0" expanded="false" />
            <element signature="e#29631#29772#0" expanded="false" />
            <element signature="e#29922#32067#0" expanded="false" />
          </folding>
        </state>
      </provider>
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -11,13 +11,14 @@ import os.path
 class Data:
    def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
-                 additional_letter_attributes=True, reverse_inputs=True):
+                 additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
        self._input_type = input_type
        self._save_generated_data = save_generated_data
        self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
        self._shuffle_all_inputs = shuffle_all_inputs
        self._additional_letter_attributes = additional_letter_attributes
        self._reverse_inputs = reverse_inputs
        self._accent_classification = accent_classification
        self.x_train = None
        self.x_other_features_train = None
@@ -30,14 +31,14 @@ class Data:
        self.y_validate = None
    def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
-                      content_name='SlovarIJS_BESEDE_utf8.lex',
+                      force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
                      content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
                      inputs_location='../../internal_representations/inputs/', content_location='../../../data/'):
        content_path = '{}{}'.format(content_location, content_name)
        train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
        test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
        validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
-        if os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
+        if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
            print('LOADING DATA...')
            self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
            self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
@@ -62,7 +63,7 @@ class Data:
        print('CONTENT READ SUCCESSFULLY')
        print('CREATING DICTIONARY...')
        dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
-        if self._input_type == 's' or self._input_type == 'ls':
+        if self._input_type == 's' or self._input_type == 'sl':
            dictionary = self._create_syllables_dictionary(content, vowels)
        print('DICTIONARY CREATION SUCCESSFUL!')
        # test_and_validation_size = 0.1
@@ -125,7 +126,7 @@ class Data:
                break
            line += 1
        dictionary_input = sorted(dictionary_input)
-        max_num_vowels += 1
+        # max_num_vowels += 1
        return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels
    # split content so that there is no overfitting
@@ -230,23 +231,22 @@ class Data:
                word = word[::-1]
            j = 0
-            word_accentuations = []
+            # word_accentuations = []
            num_vowels = 0
            for c in list(word):
                index = 0
                if self._is_vowel(word, j, vowels):
                    num_vowels += 1
                for d in accentuated_vowels:
                    if c == d:
-                        word_accentuations.append(num_vowels)
+                        if not self._accent_classification:
                            y[i][num_vowels] = 1
                        else:
                            y[i][num_vowels] = index
                        # word_accentuations.append(num_vowels)
                        break
                    index += 1
                if self._is_vowel(word, j, vowels):
                    num_vowels += 1
                j += 1
            if len(word_accentuations) > 0:
                for word_accentuation in word_accentuations:
                    y[i][word_accentuation] = 1
            else:
                y[i][0] = 1
            i += 1
        return y
@@ -255,10 +255,10 @@ class Data:
                          shuffle_vector_location):
        if self._input_type == 'l':
            x = self._x_letter_input(content, dictionary, max_word, vowels)
-        elif self._input_type == 's' or self._input_type == 'ls':
+        elif self._input_type == 's' or self._input_type == 'sl':
            x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
        else:
-            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'ls\'.')
+            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
        y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
        print('CREATING OTHER FEATURES...')
@@ -476,46 +476,112 @@ class Data:
    def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
        if self._input_type == 'l':
-            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size)
+            content = self._read_content(content_path)
            dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
        elif self._input_type == 's':
            content = self._read_content(content_path)
            dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
            syllable_dictionary = self._create_syllables_dictionary(content, vowels)
            eye = np.eye(len(syllable_dictionary), dtype=int)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
        elif self._input_type == 'sl':
            content = self._read_content(content_path)
            dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
            syllable_dictionary = self._create_syllables_dictionary(content, vowels)
            max_syllable = self._get_max_syllable(syllable_dictionary)
            syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
    # generator for inputs for tracking of data fitting
-    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size):
+    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
        size = orig_x.shape[0]
        while 1:
            loc = 0
-            while loc < size:
+            if self._accent_classification:
-                if loc + batch_size >= size:
+                eye = np.eye(len(accented_vowels), dtype=int)
-                    yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
-                else:
+                input_x_stack = []
-                    yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                input_x_other_features_stack = []
-                loc += batch_size
+                input_y_stack = []
                while loc < size:
                    while len(input_x_stack) < batch_size and loc < size:
                        accent_loc = 0
                        for accent in orig_y[loc]:
                            if accent > 0:
                                new_orig_x_additional = orig_x_additional[loc]
                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
                                input_x_stack.append(orig_x[loc])
                                input_x_other_features_stack.append(new_orig_x_additional)
                                input_y_stack.append(eye[int(accent)])
                            accent_loc += 1
                        loc += 1
                    if len(input_x_stack) > batch_size:
                        yield ([np.array(input_x_stack[:batch_size]),
                                np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
                        input_x_stack = input_x_stack[batch_size:]
                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
                        input_y_stack = input_y_stack[batch_size:]
                    else:
                        # print('BBB')
                        # print(np.array(input_stack))
                        # yield (np.array(input_stack))
                        yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
                        input_x_stack = []
                        input_x_other_features_stack = []
                        input_y_stack = []
            else:
                while loc < size:
                    if loc + batch_size >= size:
                        yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
                    else:
                        yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
                    loc += batch_size
    # generator for inputs for tracking of data fitting
-    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator):
+    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
        size = orig_x.shape[0]
        while 1:
            loc = 0
-            while loc < size:
+            if self._accent_classification:
-                if loc + batch_size >= size:
+                eye = np.eye(len(accented_vowels), dtype=int)
-                    gen_orig_x = translator[orig_x[loc:size]]
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
-                    yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
+                input_x_stack = []
-                else:
+                input_x_other_features_stack = []
-                    gen_orig_x = translator[orig_x[loc:loc + batch_size]]
+                input_y_stack = []
-                    yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                while loc < size:
-                loc += batch_size
+                    while len(input_x_stack) < batch_size and loc < size:
                        accent_loc = 0
                        for accent in orig_y[loc]:
                            if accent > 0:
                                new_orig_x_additional = orig_x_additional[loc]
                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
                                input_x_stack.append(orig_x[loc])
                                input_x_other_features_stack.append(new_orig_x_additional)
                                input_y_stack.append(eye[int(accent)])
                            accent_loc += 1
                        loc += 1
                    if len(input_x_stack) > batch_size:
                        gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
                        yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
                        input_x_stack = input_x_stack[batch_size:]
                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
                        input_y_stack = input_y_stack[batch_size:]
                    else:
                        gen_orig_x = translator[np.array(input_x_stack)]
                        yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
                        input_x_stack = []
                        input_x_other_features_stack = []
                        input_y_stack = []
            else:
                while loc < size:
                    if loc + batch_size >= size:
                        gen_orig_x = translator[orig_x[loc:size]]
                        yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
                    else:
                        gen_orig_x = translator[orig_x[loc:loc + batch_size]]
                        yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
                    loc += batch_size
    def _get_max_syllable(self, syllable_dictionary):
        max_len = 0