Commit before major RAM-shortage update

lkrsnik 2018-03-21 11:35:05 +01:00
parent a316574314
commit 9edad0ad07
4 changed files with 1367 additions and 1108 deletions

.idea/workspace.xml

@@ -2,11 +2,19 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" /> <option name="TRACKING_ENABLED" value="true" />
@@ -36,19 +44,28 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true"> <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298"> <state relative-caret-position="-1034">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" /> <caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>
@@ -77,8 +94,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false"> <file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/workbench.py"> <entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044"> <state relative-caret-position="1710">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" /> <caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@@ -165,23 +182,6 @@
</component> </component>
<component name="FindInProjectRecents"> <component name="FindInProjectRecents">
<findStrings> <findStrings>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
<find>force</find>
<find>test_and_validation_size</find>
<find>self.y_train</find>
<find>_additional_letter_attributes</find>
<find>np.random.seed</find>
<find>round</find> <find>round</find>
<find>is_vow</find> <find>is_vow</find>
<find>self._input_type == 'l'</find> <find>self._input_type == 'l'</find>
@@ -195,6 +195,23 @@
<find>_generator</find> <find>_generator</find>
<find>_create_syllable_letters_translator</find> <find>_create_syllable_letters_translator</find>
<find>_accent_classification</find> <find>_accent_classification</find>
<find>wrong</find>
<find>wrong_word</find>
<find>predict</find>
<find>get_ensemble_type_predictions</find>
<find>_convert_to_multext_east_v4</find>
<find>_split_consonants</find>
<find>UNRECOGNIZED</find>
<find>word_glob_num</find>
<find>convert_multext</find>
<find>_syllable_generator</find>
<find>generator</find>
<find>generate_data</find>
<find>_x</find>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>_generate_x_and_y</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@@ -216,13 +233,14 @@
<option value="$PROJECT_DIR$/notes" /> <option value="$PROJECT_DIR$/notes" />
<option value="$PROJECT_DIR$/workbench.xrsl" /> <option value="$PROJECT_DIR$/workbench.xrsl" />
<option value="$PROJECT_DIR$/workbench.py" /> <option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" /> <option value="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
</option> </option>
</component> </component>
<component name="ProjectFrameBounds"> <component name="ProjectFrameBounds">
<option name="x" value="65" /> <option name="x" value="65" />
<option name="y" value="144" /> <option name="y" value="24" />
<option name="width" value="1855" /> <option name="width" value="1855" />
<option name="height" value="1056" /> <option name="height" value="1056" />
</component> </component>
@@ -241,8 +259,6 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane"> <pane id="ProjectPane">
<subPane> <subPane>
<PATH> <PATH>
@@ -257,11 +273,13 @@
</PATH> </PATH>
</subPane> </subPane>
</pane> </pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" /> <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="last_opened_file_path" value="$USER_HOME$/miniconda3/bin/python" /> <property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component> </component>
<component name="RecentsManager"> <component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS"> <key name="CopyFile.RECENT_KEYS">
@@ -519,7 +537,7 @@
<servers /> <servers />
</component> </component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="65" y="144" width="1855" height="1056" extended-state="6" /> <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" /> <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -574,17 +592,6 @@
<watches-manager /> <watches-manager />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368"> <state relative-caret-position="1368">
@@ -920,8 +927,19 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/workbench.py"> <entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044"> <state relative-caret-position="1710">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" /> <caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../adventofcode/2017/2/1.py" />
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@@ -930,19 +948,16 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298"> <state relative-caret-position="-1034">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" /> <caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" /> <element signature="e#6485#7773#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" /> <element signature="e#9429#9724#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" /> <element signature="e#15725#16027#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" /> <element signature="e#17000#17346#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" /> <element signature="e#21415#22062#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" /> <element signature="e#32751#32892#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>

prepare_data.py

@@ -9,10 +9,20 @@ import keras.backend as K
import os.path import os.path
import codecs import codecs
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.models import load_model
class Data: class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True, def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False): additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
convert_multext=True, bidirectional_basic_input=False):
self._input_type = input_type self._input_type = input_type
self._save_generated_data = save_generated_data self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@@ -21,6 +31,8 @@ class Data:
self._reverse_inputs = reverse_inputs self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables self._number_of_syllables = number_of_syllables
self._convert_multext = convert_multext
self._bidirectional_basic_input = bidirectional_basic_input
self.x_train = None self.x_train = None
self.x_other_features_train = None self.x_other_features_train = None
@@ -169,14 +181,20 @@ class Data:
def _x_letter_input(self, content, dictionary, max_word, vowels): def _x_letter_input(self, content, dictionary, max_word, vowels):
if self._additional_letter_attributes: if self._additional_letter_attributes:
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int) x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
voiced_consonants = self._get_voiced_consonants() voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants() resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants() nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
# print('HERE!!!') # print('HERE!!!')
else: else:
# print('HERE!!!') # print('HERE!!!')
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int) x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)
i = 0 i = 0
for el in content: for el in content:
@@ -185,25 +203,44 @@ class Data:
word = word[::-1] word = word[::-1]
j = 0 j = 0
for c in list(word): for c in list(word):
if j >= max_word:
continue
index = 0 index = 0
if self._bidirectional_basic_input:
j2 = max_word + (len(word) - j - 1)
for d in dictionary: for d in dictionary:
if c == d: if c == d:
x[i][j][index] = 1 x[i][j][index] = 1
if self._bidirectional_basic_input:
x[i][j2][index] = 1
break break
index += 1 index += 1
if self._additional_letter_attributes: if self._additional_letter_attributes:
if self._is_vowel(word, j, vowels): if self._is_vowel(word, j, vowels):
x[i][j][len(dictionary)] = 1 x[i][j][len(dictionary)] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary)] = 1
else: else:
x[i][j][len(dictionary) + 1] = 1 x[i][j][len(dictionary) + 1] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 1] = 1
if c in voiced_consonants: if c in voiced_consonants:
x[i][j][len(dictionary) + 2] = 1 x[i][j][len(dictionary) + 2] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 2] = 1
else: else:
x[i][j][len(dictionary) + 3] = 1 x[i][j][len(dictionary) + 3] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 3] = 1
if c in resonant_silent_consonants: if c in resonant_silent_consonants:
x[i][j][len(dictionary) + 4] = 1 x[i][j][len(dictionary) + 4] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 4] = 1
elif c in nonresonant_silent_consonants: elif c in nonresonant_silent_consonants:
x[i][j][len(dictionary) + 5] = 1 x[i][j][len(dictionary) + 5] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 5] = 1
j += 1 j += 1
i += 1 i += 1
return x return x
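The new bidirectional_basic_input option doubles the time axis of the one-hot matrix: rows [0, max_word) hold the word as read, and rows [max_word, 2*max_word) hold the same encoding mirrored through j2 = max_word + (len(word) - j - 1). A minimal sketch of just that layout, with a made-up dictionary and word:

import numpy as np

dictionary = ['a', 'b', 'c']   # toy dictionary, not the real one
max_word = 5
word = 'cab'
x = np.zeros((2 * max_word, len(dictionary)), dtype=int)
for j, c in enumerate(word):
    j2 = max_word + (len(word) - j - 1)  # mirrored slot in the second half
    x[j][dictionary.index(c)] = 1        # forward copy
    x[j2][dictionary.index(c)] = 1       # reversed copy
# Rows 0-2 now encode 'c', 'a', 'b'; rows 5-7 encode 'b', 'a', 'c'.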
@@ -218,6 +255,8 @@ class Data:
if self._reverse_inputs: if self._reverse_inputs:
syllables = syllables[::-1] syllables = syllables[::-1]
for syllable in syllables: for syllable in syllables:
if j >= max_num_vowels:
continue
if syllable in dictionary: if syllable in dictionary:
index = dictionary.index(syllable) index = dictionary.index(syllable)
else: else:
@@ -297,7 +336,7 @@ class Data:
consonants.append(word_list[i]) consonants.append(word_list[i])
syllables.append(''.join(consonants)) syllables.append(''.join(consonants))
else: else:
left_consonants, right_consonants = self._split_consonants(consonants) left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
syllables[-1] += ''.join(left_consonants) syllables[-1] += ''.join(left_consonants)
right_consonants.append(word_list[i]) right_consonants.append(word_list[i])
syllables.append(''.join(right_consonants)) syllables.append(''.join(right_consonants))
@@ -344,9 +383,7 @@ class Data:
elif consonants[i] in unresonant_silent_consonants: elif consonants[i] in unresonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants: if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 4]) split_options.append([i, 4])
else:
print(consonants)
print('UNRECOGNIZED LETTERS!')
if split_options == []: if split_options == []:
return [''], consonants return [''], consonants
else: else:
@@ -358,7 +395,10 @@ class Data:
x_other_features = [] x_other_features = []
for el in content: for el in content:
x_el_other_features = [] x_el_other_features = []
if self._convert_multext:
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary)) converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
else:
converted_el = el[2]
for feature in feature_dictionary: for feature in feature_dictionary:
if converted_el[0] == feature[1]: if converted_el[0] == feature[1]:
x_el_other_features.append(1) x_el_other_features.append(1)
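The convert_multext flag added above lets callers pass morphosyntactic tags that are already in MULTEXT-East v4 form, bypassing _convert_to_multext_east_v4; the ensemble helpers below construct Data exactly this way. A hedged usage sketch (the tag value is illustrative):

# Tags arrive pre-converted, so el[2] is used verbatim.
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
words = [['gospodar', '', 'Somei', 'gospodar']]  # hypothetical [word, '', msd, word] row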
@@ -582,6 +622,15 @@ class Data:
input_x_other_features_stack = input_x_other_features_stack[batch_size:] input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:] input_y_stack = input_y_stack[batch_size:]
else: else:
#print('-------------------------------------------------------------------------------------------')
#if dictionary is not None:
# print(self.decode_x(word_encoded, dictionary))
#print(input_x_stack)
#print(input_x_other_features_stack)
#print(input_y_stack)
#print(loc)
if len(input_x_stack) == 0:
continue
gen_orig_x = translator[np.array(input_x_stack)] gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack)) yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = [] input_x_stack = []
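The new len(input_x_stack) == 0 guard covers the case where the dataset size is an exact multiple of batch_size: the final leftover batch is then empty, and yielding zero-length arrays to predict_generator would fail. Illustration with made-up sizes:

examples = list(range(32))            # hypothetical: size is a multiple of the batch
batch_size = 16
leftover = examples[2 * batch_size:]  # [] -- nothing remains for a final batch
if len(leftover) == 0:
    pass                              # the guard skips yielding this empty batch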
@@ -1005,6 +1054,310 @@ class Data:
else: else:
return ''.join(word_list[::-1]) return ''.join(word_list[::-1])
def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
words = []
accentuation_index = 0
for i in range(len(y)):
wrong_word = word[i][::-1]
for j in range(len(y[i])):
if y[i][j] > 0:
stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
if np.max(possible_predictions) != 0:
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
accentuation_index += 1
words.append(wrong_word[::-1])
return words
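assign_stress_types masks the 13 stress-type outputs down to the slots compatible with the stressed letter before taking the argmax ('r' has a single accented form, 'e' and 'o' have three each). The same masking restated table-style; the dict and helper are illustrative, not part of the commit:

import numpy as np

PLACES_BY_LETTER = {'r': [0], 'a': [1, 2], 'e': [3, 4, 5],
                    'i': [6, 7], 'o': [8, 9, 10], 'u': [11, 12]}

def mask_predictions(stressed_letter, predictions):
    possible = np.zeros(len(predictions))
    possible[PLACES_BY_LETTER.get(stressed_letter, [])] = 1
    return predictions * possible  # zero out incompatible stress types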
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
conv_input_shape = (10, 5168)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllables
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letters_location_model.load_weights(syllabled_letters_path)
return letter_location_model, syllable_location_model, syllabled_letters_location_model
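Usage sketch for the loader, using the checkpoint paths that the new sloleks_accentuation.py passes in below:

letter_m, syllable_m, syllabled_m = Data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')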
@staticmethod
def load_type_models(letters_path, syllables_path, syllabled_letters_path):
nn_output_dim = 13
# letters
conv_input_shape = (23, 36)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# letters
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_type_model.load_weights(letters_path)
conv_input_shape = (10, 5168)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_type_model.load_weights(syllables_path)
# syllabled letters
conv_input_shape = (10, 252)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letter_type_model.load_weights(syllabled_letters_path)
return letter_type_model, syllable_type_model, syllabled_letter_type_model
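The four syllable and syllabled-letter networks above are architecturally identical (Conv1D(200) → MaxPooling1D → Flatten, then three Dense(256)/Dropout(0.3) pairs into a sigmoid output), differing only in input shape and output width. A hedged refactoring sketch that could collapse the repetition, reusing the imports added at the top of the file; build_head is hypothetical and actual_accuracy is the module-level metric already referenced above:

def build_head(conv_shape, othr_shape, out_dim, weights_path):
    conv_input = Input(shape=conv_shape, name='conv_input')
    x_conv = Conv1D(200, 2, padding='same', activation='relu')(conv_input)
    x_conv = MaxPooling1D(pool_size=2)(x_conv)
    x_conv = Flatten()(x_conv)
    othr_input = Input(shape=othr_shape, name='othr_input')
    x = concatenate([x_conv, othr_input])
    for _ in range(3):
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
    x = Dense(out_dim, activation='sigmoid')(x)
    model = Model(inputs=[conv_input, othr_input], outputs=x)
    opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy])
    model.load_weights(weights_path)
    return model

# e.g. the syllable type model above: build_head((10, 5168), (150,), 13, syllables_path)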
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('s', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)
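The ensemble is a plain element-wise mean of the three models' per-position stress probabilities; a quick numeric check of the np.mean(..., axis=0) call with made-up outputs:

import numpy as np

letter    = np.array([[0.9, 0.1]])  # made-up per-model probabilities
syllable  = np.array([[0.7, 0.2]])
syllabled = np.array([[0.8, 0.3]])
print(np.mean(np.array([letter, syllable, syllabled]), axis=0))  # [[0.8 0.2]]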
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
accentuation_length = (y_array > 0).sum()
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)
def assign_location_stress(self, word, locations, vowels):
# word = list(word)
word_list = list(word)
for loc in locations:
vowel_num = 0
# if loc == 0:
# return word
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if word_list[i] == 'a' and vowel_num == loc:
word_list[i] = 'á'
elif word_list[i] == 'e' and vowel_num == loc:
word_list[i] = 'é'
elif word_list[i] == 'i' and vowel_num == loc:
word_list[i] = 'í'
elif word_list[i] == 'o' and vowel_num == loc:
word_list[i] = 'ó'
elif word_list[i] == 'u' and vowel_num == loc:
word_list[i] = 'ú'
elif word_list[i] == 'r' and vowel_num == loc:
word_list[i] = 'ŕ'
elif word_list[i] == 'A' and vowel_num == loc:
word_list[i] = 'Á'
elif word_list[i] == 'E' and vowel_num == loc:
word_list[i] = 'É'
elif word_list[i] == 'I' and vowel_num == loc:
word_list[i] = 'Í'
elif word_list[i] == 'O' and vowel_num == loc:
word_list[i] = 'Ó'
elif word_list[i] == 'U' and vowel_num == loc:
word_list[i] = 'Ú'
elif word_list[i] == 'R' and vowel_num == loc:
word_list[i] = 'Ŕ'
vowel_num += 1
# print(word_list)
return ''.join(word_list)
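assign_location_stress acute-accents the loc-th vowel through the elif chain above; the same mapping in table form, an equivalent sketch rather than the committed code:

ACUTE = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'r': 'ŕ',
         'A': 'Á', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 'R': 'Ŕ'}

def accent_at(word_list, i, vowel_num, loc):
    # Replace the loc-th vowel (counting vowels from 0) with its acute form.
    if vowel_num == loc and word_list[i] in ACUTE:
        word_list[i] = ACUTE[word_list[i]]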
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(input_words))]
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
only_words = [el[0] for el in input_words]
accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)
return location_accented_words, accented_words
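accentuate_word is the new single entry point: location ensemble, rounded stress locations, type ensemble, fully accented forms. Minimal call shape, matching how sloleks_accentuation.py below drives it (the input row is illustrative):

words = [['gospodar', '', 'Somei', 'gospodar']]  # hypothetical tagged input row
location_accented, accented = data.accentuate_word(
    words, letter_location_model, syllable_location_model,
    syllabled_letters_location_model, letter_type_model,
    syllable_type_model, syllabled_letter_type_model,
    dictionary, max_word, max_num_vowels, vowels, accented_vowels,
    feature_dictionary, syllable_dictionary)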
# def count_vowels(content, vowels): # def count_vowels(content, vowels):
# num_all_vowels = 0 # num_all_vowels = 0
# for el in content: # for el in content:

sloleks_accentuation.py (new file, 161 lines)

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
from keras.models import load_model
import sys
from prepare_data import *
np.random.seed(7)
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')
letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')
from lxml import etree
def xml_words_generator(xml_path):
for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
words = []
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
msd = wf.attrib['val']
elif wf.tag == 'FormRepresentation':
for form_rep in wf:
if form_rep.attrib['att'] == 'zapis_oblike':
word = form_rep.attrib['val']
# if msd is not None and word is not None:
# pass
# else:
# print('NOOOOO')
words.append([word, '', msd, word])
yield words
gen = xml_words_generator('data/Sloleks_v1.2.xml')
# Words processed: 650250
# Word index: 50023
# Word number: 50023
from lxml import etree
import time
gen = xml_words_generator('data/Sloleks_v1.2.xml')
word_glob_num = 0
word_limit = 0
iter_num = 50000
word_index = 0
start_timer = time.time()
iter_index = 0
words = []
lexical_entries_load_number = 0
lexical_entries_save_number = 0
# INSIDE
word_glob_num = 1500686
word_limit = 50000
iter_index = 30
done_lexical_entries = 33522
import gc
with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
# LOAD NEW WORDS AND ACCENTUATE THEM
# print("HERE")
if lexical_entries_save_number < done_lexical_entries:
g = next(gen)
# print(lexical_entries_save_number)
lexical_entries_save_number += 1
lexical_entries_load_number += 1
print(lexical_entries_save_number)
del g
gc.collect()
continue
if word_glob_num >= word_limit:
myfile2.close()
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
iter_index += 1
print("Words proccesed: " + str(word_glob_num))
print("Word indeks: " + str(word_index))
print("Word number: " + str(len(words)))
print("lexical_entries_load_number: " + str(lexical_entries_load_number))
print("lexical_entries_save_number: " + str(lexical_entries_save_number))
end_timer = time.time()
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
word_index = 0
words = []
while len(words) < iter_num:
try:
words.extend(next(gen))
lexical_entries_load_number += 1
except StopIteration:
break
# if word_glob_num > 1:
# break
data = Data('l', shuffle_all_inputs=False)
location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels,
feature_dictionary, syllable_dictionary)
word_limit += len(words)
# READ DATA
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if wf.tag == 'FormRepresentation':
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglasna_mesta_oblike'
new_element.attrib['val'] = location_accented_words[word_index]
wf.append(new_element)
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglašena_oblika'
new_element.attrib['val'] = accented_words[word_index]
wf.append(new_element)
word_glob_num += 1
word_index += 1
# print(etree.tostring(element, encoding="UTF-8"))
myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
element.clear()
lexical_entries_save_number += 1
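The script is resumable: the hard-coded counters above (word_glob_num = 1500686, done_lexical_entries = 33522) record where a previous run stopped, and the first branch in the loop fast-forwards past entries already written, freeing each skipped chunk so iterparse memory stays bounded. A compact restatement of just the generator fast-forward; in the script it runs inside the iterparse loop so the matching XML elements are skipped as well:

# Consume generator output for entries already present in new_sloleks.xml.
while lexical_entries_save_number < done_lexical_entries:
    next(gen)
    lexical_entries_save_number += 1
    lexical_entries_load_number += 1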

File diff suppressed because one or more lines are too long