Commit before major update addressing RAM shortage

master
lkrsnik 6 years ago
parent a316574314
commit 9edad0ad07

.idea/workspace.xml
@@ -2,11 +2,19 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
@@ -36,19 +44,28 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
<state relative-caret-position="-1034">
<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding>
</state>
</provider>
@@ -77,8 +94,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
<state relative-caret-position="1710">
<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -165,23 +182,6 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
<find>force</find>
<find>test_and_validation_size</find>
<find>self.y_train</find>
<find>_additional_letter_attributes</find>
<find>np.random.seed</find>
<find>round</find>
<find>is_vow</find>
<find>self._input_type == 'l'</find>
@@ -195,6 +195,23 @@
<find>_generator</find>
<find>_create_syllable_letters_translator</find>
<find>_accent_classification</find>
<find>wrong</find>
<find>wrong_word</find>
<find>predict</find>
<find>get_ensemble_type_predictions</find>
<find>_convert_to_multext_east_v4</find>
<find>_split_consonants</find>
<find>UNRECOGNIZED</find>
<find>word_glob_num</find>
<find>convert_multext</find>
<find>_syllable_generator</find>
<find>generator</find>
<find>generate_data</find>
<find>_x</find>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>_generate_x_and_y</find>
</findStrings>
</component>
<component name="Git.Settings">
@@ -216,13 +233,14 @@
<option value="$PROJECT_DIR$/notes" />
<option value="$PROJECT_DIR$/workbench.xrsl" />
<option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="65" />
<option name="y" value="144" />
<option name="y" value="24" />
<option name="width" value="1855" />
<option name="height" value="1056" />
</component>
@@ -241,8 +259,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<PATH>
@@ -257,11 +273,13 @@
</PATH>
</subPane>
</pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="last_opened_file_path" value="$USER_HOME$/miniconda3/bin/python" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
@@ -519,7 +537,7 @@
<servers />
</component>
<component name="ToolWindowManager">
<frame x="65" y="144" width="1855" height="1056" extended-state="6" />
<frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="true" />
<layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -574,17 +592,6 @@
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368">
@@ -920,8 +927,19 @@
</entry>
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
<state relative-caret-position="1710">
<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../adventofcode/2017/2/1.py" />
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -930,19 +948,16 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
<state relative-caret-position="-1034">
<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>

prepare_data.py
@@ -9,10 +9,20 @@ import keras.backend as K
import os.path
import codecs
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.models import load_model
class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False):
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
convert_multext=True, bidirectional_basic_input=False):
self._input_type = input_type
self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@@ -21,6 +31,8 @@ class Data:
self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables
self._convert_multext = convert_multext
self._bidirectional_basic_input = bidirectional_basic_input
self.x_train = None
self.x_other_features_train = None
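# Editor's note on the two new flags: convert_multext toggles the
# _convert_to_multext_east_v4 mapping of MSD tags before the feature lookup
# (the ensemble prediction helpers below construct Data(...) with
# convert_multext=False), and bidirectional_basic_input doubles the letter
# matrix to 2 * max_word columns so each word is encoded from both ends.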
@@ -169,14 +181,20 @@ class Data:
def _x_letter_input(self, content, dictionary, max_word, vowels):
if self._additional_letter_attributes:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
# print('HERE!!!')
else:
# print('HERE!!!')
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)
i = 0
for el in content:
@@ -185,25 +203,44 @@ class Data:
word = word[::-1]
j = 0
for c in list(word):
if j >= max_word:
continue
index = 0
if self._bidirectional_basic_input:
j2 = max_word + (len(word) - j - 1)
for d in dictionary:
if c == d:
x[i][j][index] = 1
if self._bidirectional_basic_input:
x[i][j2][index] = 1
break
index += 1
if self._additional_letter_attributes:
if self._is_vowel(word, j, vowels):
x[i][j][len(dictionary)] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary)] = 1
else:
x[i][j][len(dictionary) + 1] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 1] = 1
if c in voiced_consonants:
x[i][j][len(dictionary) + 2] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 2] = 1
else:
x[i][j][len(dictionary) + 3] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 3] = 1
if c in resonant_silent_consonants:
x[i][j][len(dictionary) + 4] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 4] = 1
elif c in nonresonant_silent_consonants:
x[i][j][len(dictionary) + 5] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 5] = 1
j += 1
i += 1
return x
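# Layout sketch (editor's illustration, not part of the commit): with
# max_word = 5 and the iterated word "dan", column j holds the forward copy
# and j2 = max_word + (len(word) - j - 1) the mirrored one:
#   cols 0..4: d a n . .   |   cols 5..9: n a d . .
# The j >= max_word guard added above simply truncates words longer than
# the window instead of indexing out of bounds.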
@@ -218,6 +255,8 @@ class Data:
if self._reverse_inputs:
syllables = syllables[::-1]
for syllable in syllables:
if j >= max_num_vowels:
continue
if syllable in dictionary:
index = dictionary.index(syllable)
else:
@@ -297,7 +336,7 @@ class Data:
consonants.append(word_list[i])
syllables.append(''.join(consonants))
else:
left_consonants, right_consonants = self._split_consonants(consonants)
left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
syllables[-1] += ''.join(left_consonants)
right_consonants.append(word_list[i])
syllables.append(''.join(right_consonants))
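# Editor's note: _split_consonants is now fed a lowercased copy of the
# cluster so uppercase consonants still match the lookup sets; as the next
# hunk shows, unrecognized clusters no longer print a warning and instead
# fall through to the [''], consonants fallback (the whole cluster attaches
# to the following syllable).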
@@ -344,9 +383,7 @@ class Data:
elif consonants[i] in unresonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 4])
else:
print(consonants)
print('UNRECOGNIZED LETTERS!')
if split_options == []:
return [''], consonants
else:
@@ -358,7 +395,10 @@ class Data:
x_other_features = []
for el in content:
x_el_other_features = []
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
if self._convert_multext:
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
else:
converted_el = el[2]
for feature in feature_dictionary:
if converted_el[0] == feature[1]:
x_el_other_features.append(1)
@@ -582,6 +622,15 @@ class Data:
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
#print('-------------------------------------------------------------------------------------------')
#if dictionary is not None:
# print(self.decode_x(word_encoded, dictionary))
#print(input_x_stack)
#print(input_x_other_features_stack)
#print(input_y_stack)
#print(loc)
if len(input_x_stack) == 0:
continue
gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
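# Editor's note: the new len(input_x_stack) == 0 check keeps the generator
# from yielding an empty trailing batch; the commented prints above it are
# leftover debugging for inspecting a single yielded batch.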
@@ -1005,6 +1054,310 @@ class Data:
else:
return ''.join(word_list[::-1])
def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
words = []
accentuation_index = 0
for i in range(len(y)):
wrong_word = word[i][::-1]
for j in range(len(y[i])):
if y[i][j] > 0:
stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
if np.max(possible_predictions) != 0:
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
accentuation_index += 1
words.append(wrong_word[::-1])
return words
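# Editor's note: the possible_places indices line up with the 13-entry
# accented_vowels list the driver script uses
# (['ŕ','á','ä','é','ë','ě','í','î','ó','ô','ö','ú','ü']), i.e.
# r -> {0}, a -> {1,2}, e -> {3,4,5}, i -> {6,7}, o -> {8,9,10}, u -> {11,12}.
# Masking the type predictions with this vector restricts the classifier to
# accent marks that are valid for the stressed letter.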
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
conv_input_shape = (10, 5168)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letters_location_model.load_weights(syllabled_letters_path)
return letter_location_model, syllable_location_model, syllabled_letters_location_model
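# Editor's note: only the letter model is restored with load_model; the two
# syllable variants are rebuilt layer by layer and filled via load_weights,
# so the hard-coded shapes ((10, 5168) and (10, 252) conv inputs, 140 other
# features, 10 output positions) must match the saved checkpoints exactly.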
@staticmethod
def load_type_models(letters_path, syllables_path, syllabled_letters_path):
nn_output_dim = 13
# letters
conv_input_shape = (23, 36)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# letters
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
# syllabled letters
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_type_model.load_weights(letters_path)
conv_input_shape = (10, 5168)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_type_model.load_weights(syllables_path)
# syllabled letters
conv_input_shape = (10, 252)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letter_type_model.load_weights(syllabled_letters_path)
return letter_type_model, syllable_type_model, syllabled_letter_type_model
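# Editor's note: same pattern as load_location_models, but with 13 outputs
# (one per accent type), 150 other features, and a two-layer Conv1D stack
# over a (23, 36) input for the letter branch.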
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('s', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)
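# Editor's note: the ensemble is a plain mean over the three input
# representations (letters, syllables, syllabled letters). The step count
# len(x) / batch_size is a true division, which implicitly assumes the
# number of words is a multiple of 16.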
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
accentuation_length = (y_array > 0).sum()
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)
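# Editor's note: unlike the location ensemble, the type models only see
# stressed positions: location_y replaces the fake targets in the
# generators, and the step count is accentuation_length (the number of
# nonzero entries in location_y) divided by the batch size.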
def assign_location_stress(self, word, locations, vowels):
# word = list(word)
word_list = list(word)
for loc in locations:
vowel_num = 0
# if loc == 0:
# return word
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if word_list[i] == 'a' and vowel_num == loc:
word_list[i] = 'á'
elif word_list[i] == 'e' and vowel_num == loc:
word_list[i] = 'é'
elif word_list[i] == 'i' and vowel_num == loc:
word_list[i] = 'í'
elif word_list[i] == 'o' and vowel_num == loc:
word_list[i] = 'ó'
elif word_list[i] == 'u' and vowel_num == loc:
word_list[i] = 'ú'
elif word_list[i] == 'r' and vowel_num == loc:
word_list[i] = 'ŕ'
elif word_list[i] == 'A' and vowel_num == loc:
word_list[i] = 'Á'
elif word_list[i] == 'E' and vowel_num == loc:
word_list[i] = 'É'
elif word_list[i] == 'I' and vowel_num == loc:
word_list[i] = 'Í'
elif word_list[i] == 'O' and vowel_num == loc:
word_list[i] = 'Ó'
elif word_list[i] == 'U' and vowel_num == loc:
word_list[i] = 'Ú'
elif word_list[i] == 'R' and vowel_num == loc:
word_list[i] = 'Ŕ'
vowel_num += 1
# print(word_list)
return ''.join(word_list)
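# Example (editor's illustration): with locations == [1] the second vowel
# gets the acute mark, e.g. 'malina' -> 'malína'. Callers pass the word
# reversed and flip the result back (see accentuate_word below).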
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(input_words))]
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
only_words = [el[0] for el in input_words]
accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)
return location_accented_words, accented_words
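# Pipeline summary (editor's note): the location ensemble scores the ten
# candidate stress positions, decode_y / np.around fix the positions, and
# the type ensemble, masked per stressed letter in assign_stress_types,
# chooses the accented character. The new sloleks_accentuation.py below
# shows the intended call sequence end to end.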
# def count_vowels(content, vowels):
# num_all_vowels = 0
# for el in content:

sloleks_accentuation.py
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
from keras.models import load_model
import sys
from prepare_data import *
np.random.seed(7)
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')
letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')
from lxml import etree
def xml_words_generator(xml_path):
for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
words = []
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
msd = wf.attrib['val']
elif wf.tag == 'FormRepresentation':
for form_rep in wf:
if form_rep.attrib['att'] == 'zapis_oblike':
word = form_rep.attrib['val']
# if msd is not None and word is not None:
# pass
# else:
# print('NOOOOO')
words.append([word, '', msd, word])
yield words
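# Editor's note: each LexicalEntry yields a list of [form, '', msd, form]
# entries; downstream code reads el[0] as the word and el[2] as the MSD tag,
# apparently mirroring the .lex layout consumed by _read_content.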
gen = xml_words_generator('data/Sloleks_v1.2.xml')
# Words processed: 650250
# Word index: 50023
# Word number: 50023
from lxml import etree
import time
gen = xml_words_generator('data/Sloleks_v1.2.xml')
word_glob_num = 0
word_limit = 0
iter_num = 50000
word_index = 0
start_timer = time.time()
iter_index = 0
words = []
lexical_entries_load_number = 0
lexical_entries_save_number = 0
# INSIDE
word_glob_num = 1500686
word_limit = 50000
iter_index = 30
done_lexical_entries = 33522
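# Editor's note: the four values above overwrite the fresh counters so an
# interrupted run can resume; the skip branch below fast-forwards the
# generator past the first done_lexical_entries entries before any new
# words are accentuated.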
import gc
with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
# LOAD NEW WORDS AND ACCENTUATE THEM
# print("HERE")
if lexical_entries_save_number < done_lexical_entries:
g = next(gen)
# print(lexical_entries_save_number)
lexical_entries_save_number += 1
lexical_entries_load_number += 1
print(lexical_entries_save_number)
del g
gc.collect()
continue
if word_glob_num >= word_limit:
myfile2.close()
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
iter_index += 1
print("Words proccesed: " + str(word_glob_num))
print("Word indeks: " + str(word_index))
print("Word number: " + str(len(words)))
print("lexical_entries_load_number: " + str(lexical_entries_load_number))
print("lexical_entries_save_number: " + str(lexical_entries_save_number))
end_timer = time.time()
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
word_index = 0
words = []
while len(words) < iter_num:
try:
words.extend(next(gen))
lexical_entries_load_number += 1
except:
break
# if word_glob_num > 1:
# break
data = Data('l', shuffle_all_inputs=False)
location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels,
feature_dictionary, syllable_dictionary)
word_limit += len(words)
# READ DATA
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if wf.tag == 'FormRepresentation':
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglasna_mesta_oblike'
new_element.attrib['val'] = location_accented_words[word_index]
wf.append(new_element)
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglašena_oblika'
new_element.attrib['val'] = accented_words[word_index]
wf.append(new_element)
word_glob_num += 1
word_index += 1
# print(etree.tostring(element, encoding="UTF-8"))
myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
element.clear()
lexical_entries_save_number += 1

File diff suppressed because one or more lines are too long