Commit before major update addressing RAM shortage

master
lkrsnik 6 years ago
parent a316574314
commit 9edad0ad07

.idea/workspace.xml
@@ -2,11 +2,19 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
@@ -36,19 +44,28 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
<state relative-caret-position="-1034">
<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding>
</state>
</provider>
@@ -77,8 +94,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
<state relative-caret-position="1710">
<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -165,23 +182,6 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
<find>force</find>
<find>test_and_validation_size</find>
<find>self.y_train</find>
<find>_additional_letter_attributes</find>
<find>np.random.seed</find>
<find>round</find>
<find>is_vow</find>
<find>self._input_type == 'l'</find>
@@ -195,6 +195,23 @@
<find>_generator</find>
<find>_create_syllable_letters_translator</find>
<find>_accent_classification</find>
<find>wrong</find>
<find>wrong_word</find>
<find>predict</find>
<find>get_ensemble_type_predictions</find>
<find>_convert_to_multext_east_v4</find>
<find>_split_consonants</find>
<find>UNRECOGNIZED</find>
<find>word_glob_num</find>
<find>convert_multext</find>
<find>_syllable_generator</find>
<find>generator</find>
<find>generate_data</find>
<find>_x</find>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>_generate_x_and_y</find>
</findStrings>
</component>
<component name="Git.Settings">
@@ -216,13 +233,14 @@
<option value="$PROJECT_DIR$/notes" />
<option value="$PROJECT_DIR$/workbench.xrsl" />
<option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="65" />
<option name="y" value="144" />
<option name="y" value="24" />
<option name="width" value="1855" />
<option name="height" value="1056" />
</component>
@@ -241,8 +259,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<PATH>
@@ -257,11 +273,13 @@
</PATH>
</subPane>
</pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="last_opened_file_path" value="$USER_HOME$/miniconda3/bin/python" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
@@ -519,7 +537,7 @@
<servers />
</component>
<component name="ToolWindowManager">
<frame x="65" y="144" width="1855" height="1056" extended-state="6" />
<frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="true" />
<layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -574,17 +592,6 @@
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368">
@@ -920,8 +927,19 @@
</entry>
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
<state relative-caret-position="1710">
<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../adventofcode/2017/2/1.py" />
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@@ -930,19 +948,16 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
<state relative-caret-position="-1034">
<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>

prepare_data.py
@@ -9,10 +9,20 @@ import keras.backend as K
import os.path
import codecs
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.models import load_model
class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False):
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
convert_multext=True, bidirectional_basic_input=False):
self._input_type = input_type
self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@@ -21,6 +31,8 @@ class Data:
self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables
self._convert_multext = convert_multext
self._bidirectional_basic_input = bidirectional_basic_input
self.x_train = None
self.x_other_features_train = None
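# Editor's note on the two new flags: convert_multext toggles the
# _convert_to_multext_east_v4 mapping of MSD tags before the feature lookup
# (the ensemble prediction helpers below construct Data(...) with
# convert_multext=False), and bidirectional_basic_input doubles the letter
# matrix to 2 * max_word columns so each word is encoded from both ends.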
@@ -169,14 +181,20 @@ class Data:
def _x_letter_input(self, content, dictionary, max_word, vowels):
if self._additional_letter_attributes:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
# print('HERE!!!')
else:
# print('HERE!!!')
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)
i = 0
for el in content:
@@ -185,25 +203,44 @@ class Data:
word = word[::-1]
j = 0
for c in list(word):
if j >= max_word:
continue
index = 0
if self._bidirectional_basic_input:
j2 = max_word + (len(word) - j - 1)
for d in dictionary:
if c == d:
x[i][j][index] = 1
if self._bidirectional_basic_input:
x[i][j2][index] = 1
break
index += 1
if self._additional_letter_attributes:
if self._is_vowel(word, j, vowels):
x[i][j][len(dictionary)] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary)] = 1
else:
x[i][j][len(dictionary) + 1] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 1] = 1
if c in voiced_consonants:
x[i][j][len(dictionary) + 2] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 2] = 1
else:
x[i][j][len(dictionary) + 3] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 3] = 1
if c in resonant_silent_consonants:
x[i][j][len(dictionary) + 4] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 4] = 1
elif c in nonresonant_silent_consonants:
x[i][j][len(dictionary) + 5] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 5] = 1
j += 1
i += 1
return x
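# Layout sketch (editor's illustration, not part of the commit): with
# max_word = 5 and the iterated word "dan", column j holds the forward copy
# and j2 = max_word + (len(word) - j - 1) the mirrored one:
#   cols 0..4: d a n . .   |   cols 5..9: n a d . .
# The j >= max_word guard added above simply truncates words longer than
# the window instead of indexing out of bounds.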
@@ -218,6 +255,8 @@ class Data:
if self._reverse_inputs:
syllables = syllables[::-1]
for syllable in syllables:
if j >= max_num_vowels:
continue
if syllable in dictionary:
index = dictionary.index(syllable)
else:
@@ -297,7 +336,7 @@ class Data:
consonants.append(word_list[i])
syllables.append(''.join(consonants))
else:
left_consonants, right_consonants = self._split_consonants(consonants)
left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
syllables[-1] += ''.join(left_consonants)
right_consonants.append(word_list[i])
syllables.append(''.join(right_consonants))
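# Editor's note: _split_consonants is now fed a lowercased copy of the
# cluster so uppercase consonants still match the lookup sets; as the next
# hunk shows, unrecognized clusters no longer print a warning and instead
# fall through to the [''], consonants fallback (the whole cluster attaches
# to the following syllable).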
@@ -344,9 +383,7 @@ class Data:
elif consonants[i] in unresonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 4])
else:
print(consonants)
print('UNRECOGNIZED LETTERS!')
if split_options == []:
return [''], consonants
else:
@@ -358,7 +395,10 @@ class Data:
x_other_features = []
for el in content:
x_el_other_features = []
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
if self._convert_multext:
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
else:
converted_el = el[2]
for feature in feature_dictionary:
if converted_el[0] == feature[1]:
x_el_other_features.append(1)
@@ -582,6 +622,15 @@ class Data:
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
#print('-------------------------------------------------------------------------------------------')
#if dictionary is not None:
# print(self.decode_x(word_encoded, dictionary))
#print(input_x_stack)
#print(input_x_other_features_stack)
#print(input_y_stack)
#print(loc)
if len(input_x_stack) == 0:
continue
gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
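# Editor's note: the new len(input_x_stack) == 0 check keeps the generator
# from yielding an empty trailing batch; the commented prints above it are
# leftover debugging for inspecting a single yielded batch.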
@@ -1005,6 +1054,310 @@ class Data:
else:
return ''.join(word_list[::-1])
def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
words = []
accentuation_index = 0
for i in range(len(y)):
wrong_word = word[i][::-1]
for j in range(len(y[i])):
if y[i][j] > 0:
stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
if np.max(possible_predictions) != 0:
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
accentuation_index += 1
words.append(wrong_word[::-1])
return words
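# Editor's note: the possible_places indices line up with the 13-entry
# accented_vowels list the driver script uses
# (['ŕ','á','ä','é','ë','ě','í','î','ó','ô','ö','ú','ü']), i.e.
# r -> {0}, a -> {1,2}, e -> {3,4,5}, i -> {6,7}, o -> {8,9,10}, u -> {11,12}.
# Masking the type predictions with this vector restricts the classifier to
# accent marks that are valid for the stressed letter.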
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
conv_input_shape = (10, 5168)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letters_location_model.load_weights(syllabled_letters_path)
return letter_location_model, syllable_location_model, syllabled_letters_location_model
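# Editor's note: only the letter model is restored with load_model; the two
# syllable variants are rebuilt layer by layer and filled via load_weights,
# so the hard-coded shapes ((10, 5168) and (10, 252) conv inputs, 140 other
# features, 10 output positions) must match the saved checkpoints exactly.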
@staticmethod
def load_type_models(letters_path, syllables_path, syllabled_letters_path):
nn_output_dim = 13
# letters
conv_input_shape = (23, 36)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# letters
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
# syllabled letters
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_type_model.load_weights(letters_path)
conv_input_shape = (10, 5168)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_type_model.load_weights(syllables_path)
# syllabled letters
conv_input_shape = (10, 252)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letter_type_model.load_weights(syllabled_letters_path)
return letter_type_model, syllable_type_model, syllabled_letter_type_model
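# Editor's note: same pattern as load_location_models, but with 13 outputs
# (one per accent type), 150 other features, and a two-layer Conv1D stack
# over a (23, 36) input for the letter branch.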
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('s', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)
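# Editor's note: the ensemble is a plain mean over the three input
# representations (letters, syllables, syllabled letters). The step count
# len(x) / batch_size is a true division, which implicitly assumes the
# number of words is a multiple of 16.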
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
accentuation_length = (y_array > 0).sum()
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)
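# Editor's note: unlike the location ensemble, the type models only see
# stressed positions: location_y replaces the fake targets in the
# generators, and the step count is accentuation_length (the number of
# nonzero entries in location_y) divided by the batch size.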
def assign_location_stress(self, word, locations, vowels):
# word = list(word)
word_list = list(word)
for loc in locations:
vowel_num = 0
# if loc == 0:
# return word
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if word_list[i] == 'a' and vowel_num == loc:
word_list[i] = 'á'
elif word_list[i] == 'e' and vowel_num == loc:
word_list[i] = 'é'
elif word_list[i] == 'i' and vowel_num == loc:
word_list[i] = 'í'
elif word_list[i] == 'o' and vowel_num == loc:
word_list[i] = 'ó'
elif word_list[i] == 'u' and vowel_num == loc:
word_list[i] = 'ú'
elif word_list[i] == 'r' and vowel_num == loc:
word_list[i] = 'ŕ'
elif word_list[i] == 'A' and vowel_num == loc:
word_list[i] = 'Á'
elif word_list[i] == 'E' and vowel_num == loc:
word_list[i] = 'É'
elif word_list[i] == 'I' and vowel_num == loc:
word_list[i] = 'Í'
elif word_list[i] == 'O' and vowel_num == loc:
word_list[i] = 'Ó'
elif word_list[i] == 'U' and vowel_num == loc:
word_list[i] = 'Ú'
elif word_list[i] == 'R' and vowel_num == loc:
word_list[i] = 'Ŕ'
vowel_num += 1
# print(word_list)
return ''.join(word_list)
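# Example (editor's illustration): with locations == [1] the second vowel
# gets the acute mark, e.g. 'malina' -> 'malína'. Callers pass the word
# reversed and flip the result back (see accentuate_word below).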
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(input_words))]
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
only_words = [el[0] for el in input_words]
accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)
return location_accented_words, accented_words
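# Pipeline summary (editor's note): the location ensemble scores the ten
# candidate stress positions, decode_y / np.around fix the positions, and
# the type ensemble, masked per stressed letter in assign_stress_types,
# chooses the accented character. The new sloleks_accentuation.py below
# shows the intended call sequence end to end.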
# def count_vowels(content, vowels):
# num_all_vowels = 0
# for el in content:

sloleks_accentuation.py
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
from keras.models import load_model
import sys
from prepare_data import *
np.random.seed(7)
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')
letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')
from lxml import etree
def xml_words_generator(xml_path):
for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
words = []
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
msd = wf.attrib['val']
elif wf.tag == 'FormRepresentation':
for form_rep in wf:
if form_rep.attrib['att'] == 'zapis_oblike':
word = form_rep.attrib['val']
# if msd is not None and word is not None:
# pass
# else:
# print('NOOOOO')
words.append([word, '', msd, word])
yield words
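# Editor's note: each LexicalEntry yields a list of [form, '', msd, form]
# entries; downstream code reads el[0] as the word and el[2] as the MSD tag,
# apparently mirroring the .lex layout consumed by _read_content.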
gen = xml_words_generator('data/Sloleks_v1.2.xml')
# Words processed: 650250
# Word index: 50023
# Word number: 50023
from lxml import etree
import time
gen = xml_words_generator('data/Sloleks_v1.2.xml')
word_glob_num = 0
word_limit = 0
iter_num = 50000
word_index = 0
start_timer = time.time()
iter_index = 0
words = []
lexical_entries_load_number = 0
lexical_entries_save_number = 0
# INSIDE
word_glob_num = 1500686
word_limit = 50000
iter_index = 30
done_lexical_entries = 33522
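# Editor's note: the four values above overwrite the fresh counters so an
# interrupted run can resume; the skip branch below fast-forwards the
# generator past the first done_lexical_entries entries before any new
# words are accentuated.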
import gc
with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
# LOAD NEW WORDS AND ACCENTUATE THEM
# print("HERE")
if lexical_entries_save_number < done_lexical_entries:
g = next(gen)
# print(lexical_entries_save_number)
lexical_entries_save_number += 1
lexical_entries_load_number += 1
print(lexical_entries_save_number)
del g
gc.collect()
continue
if word_glob_num >= word_limit:
myfile2.close()
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
iter_index += 1
print("Words proccesed: " + str(word_glob_num))
print("Word indeks: " + str(word_index))
print("Word number: " + str(len(words)))
print("lexical_entries_load_number: " + str(lexical_entries_load_number))
print("lexical_entries_save_number: " + str(lexical_entries_save_number))
end_timer = time.time()
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
word_index = 0
words = []
while len(words) < iter_num:
try:
words.extend(next(gen))
lexical_entries_load_number += 1
except:
break
# if word_glob_num > 1:
# break
data = Data('l', shuffle_all_inputs=False)
location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels,
feature_dictionary, syllable_dictionary)
word_limit += len(words)
# READ DATA
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if wf.tag == 'FormRepresentation':
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglasna_mesta_oblike'
new_element.attrib['val'] = location_accented_words[word_index]
wf.append(new_element)
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglašena_oblika'
new_element.attrib['val'] = accented_words[word_index]
wf.append(new_element)
word_glob_num += 1
word_index += 1
# print(etree.tostring(element, encoding="UTF-8"))
myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
element.clear()
lexical_entries_save_number += 1

File diff suppressed because one or more lines are too long