Commit before major RAM lack update
This commit is contained in:
parent a316574314
commit 9edad0ad07
.idea/workspace.xml
@@ -2,11 +2,19 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />

@@ -36,19 +44,28 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
-<state relative-caret-position="298">
-<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
+<state relative-caret-position="-1034">
+<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding>
</state>
</provider>

@@ -77,8 +94,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
-<state relative-caret-position="1044">
-<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
+<state relative-caret-position="1710">
+<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>

@@ -165,23 +182,6 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
<find>force</find>
<find>test_and_validation_size</find>
<find>self.y_train</find>
<find>_additional_letter_attributes</find>
<find>np.random.seed</find>
<find>round</find>
<find>is_vow</find>
<find>self._input_type == 'l'</find>

@@ -195,6 +195,23 @@
<find>_generator</find>
<find>_create_syllable_letters_translator</find>
<find>_accent_classification</find>
<find>wrong</find>
<find>wrong_word</find>
<find>predict</find>
<find>get_ensemble_type_predictions</find>
<find>_convert_to_multext_east_v4</find>
<find>_split_consonants</find>
<find>UNRECOGNIZED</find>
<find>word_glob_num</find>
<find>convert_multext</find>
<find>_syllable_generator</find>
<find>generator</find>
<find>generate_data</find>
<find>_x</find>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>_generate_x_and_y</find>
</findStrings>
</component>
<component name="Git.Settings">

@@ -216,13 +233,14 @@
<option value="$PROJECT_DIR$/notes" />
<option value="$PROJECT_DIR$/workbench.xrsl" />
<option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="65" />
-<option name="y" value="144" />
+<option name="y" value="24" />
<option name="width" value="1855" />
<option name="height" value="1056" />
</component>

@@ -241,8 +259,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
-<pane id="Scratches" />
-<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<PATH>

@@ -257,11 +273,13 @@
</PATH>
</subPane>
</pane>
+<pane id="Scratches" />
+<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
-<property name="last_opened_file_path" value="$USER_HOME$/miniconda3/bin/python" />
+<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">

@@ -519,7 +537,7 @@
<servers />
</component>
<component name="ToolWindowManager">
-<frame x="65" y="144" width="1855" height="1056" extended-state="6" />
+<frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="true" />
<layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />

@@ -574,17 +592,6 @@
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368">

@@ -920,8 +927,19 @@
</entry>
<entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor">
-<state relative-caret-position="1044">
-<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" />
+<state relative-caret-position="1710">
+<caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../adventofcode/2017/2/1.py" />
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>

@@ -930,19 +948,16 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
-<state relative-caret-position="298">
-<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" />
+<state relative-caret-position="-1034">
+<caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
363 prepare_data.py
@@ -9,10 +9,20 @@ import keras.backend as K
import os.path
import codecs

+from keras import optimizers
+from keras.models import Model
+from keras.layers import Dense, Dropout, Input
+from keras.layers.merge import concatenate
+from keras.layers.convolutional import Conv1D
+from keras.layers.convolutional import MaxPooling1D
+from keras.layers import Flatten
from keras.models import load_model


class Data:
    def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
-                additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False):
+                additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
+                convert_multext=True, bidirectional_basic_input=False):
        self._input_type = input_type
        self._save_generated_data = save_generated_data
        self._allow_shuffle_vector_generation = allow_shuffle_vector_generation

@@ -21,6 +31,8 @@ class Data:
        self._reverse_inputs = reverse_inputs
        self._accent_classification = accent_classification
        self._number_of_syllables = number_of_syllables
+        self._convert_multext = convert_multext
+        self._bidirectional_basic_input = bidirectional_basic_input

        self.x_train = None
        self.x_other_features_train = None
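A minimal usage sketch of the two constructor flags added above (the keyword values are hypothetical here, but they mirror calls made later in this diff, where the ensemble helpers construct Data(..., convert_multext=False)):

    # Sketch only: exercise the new flags added in this commit.
    data = Data('l', shuffle_all_inputs=False,
                convert_multext=False,           # keep the MSD tag as-is instead of converting it
                bidirectional_basic_input=True)  # also encode each word mirrored in a second half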
@@ -169,14 +181,20 @@ class Data:

    def _x_letter_input(self, content, dictionary, max_word, vowels):
        if self._additional_letter_attributes:
+            if not self._bidirectional_basic_input:
                x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
+            else:
+                x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
            voiced_consonants = self._get_voiced_consonants()
            resonant_silent_consonants = self._get_resonant_silent_consonants()
            nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
            # print('HERE!!!')
        else:
            # print('HERE!!!')
+            if not self._bidirectional_basic_input:
                x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
+            else:
+                x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)

        i = 0
        for el in content:

@@ -185,25 +203,44 @@ class Data:
                word = word[::-1]
            j = 0
            for c in list(word):
                if j >= max_word:
                    continue
                index = 0
                if self._bidirectional_basic_input:
                    j2 = max_word + (len(word) - j - 1)
                for d in dictionary:
                    if c == d:
                        x[i][j][index] = 1
                        if self._bidirectional_basic_input:
                            x[i][j2][index] = 1
                        break
                    index += 1
                if self._additional_letter_attributes:
                    if self._is_vowel(word, j, vowels):
                        x[i][j][len(dictionary)] = 1
                        if self._bidirectional_basic_input:
                            x[i][j2][len(dictionary)] = 1
                    else:
                        x[i][j][len(dictionary) + 1] = 1
                        if self._bidirectional_basic_input:
                            x[i][j2][len(dictionary) + 1] = 1
                        if c in voiced_consonants:
                            x[i][j][len(dictionary) + 2] = 1
                            if self._bidirectional_basic_input:
                                x[i][j2][len(dictionary) + 2] = 1
                        else:
                            x[i][j][len(dictionary) + 3] = 1
                            if self._bidirectional_basic_input:
                                x[i][j2][len(dictionary) + 3] = 1

                        if c in resonant_silent_consonants:
                            x[i][j][len(dictionary) + 4] = 1
                            if self._bidirectional_basic_input:
                                x[i][j2][len(dictionary) + 4] = 1
                        elif c in nonresonant_silent_consonants:
                            x[i][j][len(dictionary) + 5] = 1
                            if self._bidirectional_basic_input:
                                x[i][j2][len(dictionary) + 5] = 1
                j += 1
            i += 1
        return x
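The bidirectional branch above duplicates every letter feature at a mirrored offset, so the second max_word rows hold the same word read in the opposite direction. A standalone sketch of just that indexing (hypothetical helper, assuming len(word) <= max_word as the surrounding code does):

    import numpy as np

    def bidirectional_one_hot(word, alphabet, max_word):
        # First half: letters at position j (the word arrives already reversed upstream).
        # Second half: the same letters mirrored to j2, i.e. the word in the other order.
        x = np.zeros((2 * max_word, len(alphabet)), dtype=int)
        for j, c in enumerate(word):
            if j >= max_word:
                break
            j2 = max_word + (len(word) - j - 1)  # mirrored position
            if c in alphabet:
                k = alphabet.index(c)
                x[j][k] = 1
                x[j2][k] = 1
        return x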
@@ -218,6 +255,8 @@ class Data:
            if self._reverse_inputs:
                syllables = syllables[::-1]
            for syllable in syllables:
+                if j >= max_num_vowels:
+                    continue
                if syllable in dictionary:
                    index = dictionary.index(syllable)
                else:

@@ -297,7 +336,7 @@ class Data:
                    consonants.append(word_list[i])
                syllables.append(''.join(consonants))
            else:
-                left_consonants, right_consonants = self._split_consonants(consonants)
+                left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
                syllables[-1] += ''.join(left_consonants)
                right_consonants.append(word_list[i])
                syllables.append(''.join(right_consonants))

@@ -344,9 +383,7 @@ class Data:
            elif consonants[i] in unresonant_silent_consonants:
                if consonants[i + 1] in resonant_silent_consonants:
                    split_options.append([i, 4])
                else:
                    print(consonants)
                    print('UNRECOGNIZED LETTERS!')

        if split_options == []:
            return [''], consonants
        else:

@@ -358,7 +395,10 @@ class Data:
        x_other_features = []
        for el in content:
            x_el_other_features = []
+            if self._convert_multext:
                converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
+            else:
+                converted_el = el[2]
            for feature in feature_dictionary:
                if converted_el[0] == feature[1]:
                    x_el_other_features.append(1)

@@ -582,6 +622,15 @@ class Data:
                    input_x_other_features_stack = input_x_other_features_stack[batch_size:]
                    input_y_stack = input_y_stack[batch_size:]
                else:
+                    # print('-------------------------------------------------------------------------------------------')
+                    # if dictionary is not None:
+                    #     print(self.decode_x(word_encoded, dictionary))
+                    # print(input_x_stack)
+                    # print(input_x_other_features_stack)
+                    # print(input_y_stack)
+                    # print(loc)
+                    if len(input_x_stack) == 0:
+                        continue
                    gen_orig_x = translator[np.array(input_x_stack)]
                    yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
                    input_x_stack = []
@@ -1005,6 +1054,310 @@ class Data:
        else:
            return ''.join(word_list[::-1])

    def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
        words = []
        accentuation_index = 0
        for i in range(len(y)):
            wrong_word = word[i][::-1]

            for j in range(len(y[i])):
                if y[i][j] > 0:
                    stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
                    possible_places = np.zeros(len(predictions[accentuation_index]))
                    if stressed_letter == 'r':
                        possible_places[0] = 1
                    elif stressed_letter == 'a':
                        possible_places[1] = 1
                        possible_places[2] = 1
                    elif stressed_letter == 'e':
                        possible_places[3] = 1
                        possible_places[4] = 1
                        possible_places[5] = 1
                    elif stressed_letter == 'i':
                        possible_places[6] = 1
                        possible_places[7] = 1
                    elif stressed_letter == 'o':
                        possible_places[8] = 1
                        possible_places[9] = 1
                        possible_places[10] = 1
                    elif stressed_letter == 'u':
                        possible_places[11] = 1
                        possible_places[12] = 1
                    possible_predictions = predictions[accentuation_index] * possible_places

                    arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)

                    arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1

                    if np.max(possible_predictions) != 0:
                        wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
                                                                        syllables=self._input_type != 'l', debug=i == 313)

                    accentuation_index += 1

            words.append(wrong_word[::-1])
        return words

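The if/elif ladder in assign_stress_types encodes a fixed map from the stressed letter to the indices of the stress types that are possible for it (indices into the 13-way accented_vowels order used elsewhere in this commit). An equivalent table-driven sketch (hypothetical refactor, behaviour unchanged):

    import numpy as np

    # Indices follow accented_vowels = ['ŕ','á','ä','é','ë','ě','í','î','ó','ô','ö','ú','ü'].
    CANDIDATE_TYPES = {'r': [0], 'a': [1, 2], 'e': [3, 4, 5],
                       'i': [6, 7], 'o': [8, 9, 10], 'u': [11, 12]}

    def mask_type_predictions(predictions, stressed_letter):
        possible_places = np.zeros(len(predictions))
        idx = CANDIDATE_TYPES.get(stressed_letter, [])
        if idx:
            possible_places[idx] = 1
        return predictions * possible_places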
    @staticmethod
    def load_location_models(letters_path, syllables_path, syllabled_letters_path):
        ############################ LOCATION ########################
        letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})

        # num_examples = len(data.x_train) # training set size
        nn_output_dim = 10

        conv_input_shape = (10, 5168)
        othr_input = (140,)
        conv_input = Input(shape=conv_input_shape, name='conv_input')

        # syllables
        x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input, name='othr_input')

        x = concatenate([x_conv, othr_input])
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
        syllable_location_model.load_weights(syllables_path)

        conv_input_shape = (10, 252)
        othr_input = (140,)

        conv_input = Input(shape=conv_input_shape, name='conv_input')

        # syllabled letters
        x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input, name='othr_input')

        x = concatenate([x_conv, othr_input])
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
        syllabled_letters_location_model.load_weights(syllabled_letters_path)

        return letter_location_model, syllable_location_model, syllabled_letters_location_model

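load_location_models and load_type_models rebuild the same Conv1D → MaxPooling1D → Flatten → concatenate → three Dense(256)+Dropout(0.3) → sigmoid head six times with different shapes. A hypothetical factory (not part of this commit) that would collapse the duplication, reusing the imports above and the actual_accuracy metric defined in this file:

    def build_conv_head(conv_input_shape, othr_input_shape, nn_output_dim, weights_path,
                        filters=200, kernel_size=2):
        # Shapes and hyper-parameters match the blocks above; everything else is shared.
        conv_input = Input(shape=conv_input_shape, name='conv_input')
        x_conv = Conv1D(filters, kernel_size, padding='same', activation='relu')(conv_input)
        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input_shape, name='othr_input')
        x = concatenate([x_conv, othr_input])
        for _ in range(3):
            x = Dense(256, activation='relu')(x)
            x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy])
        model.load_weights(weights_path)
        return model

    # e.g. syllable_location_model = build_conv_head((10, 5168), (140,), 10, syllables_path)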
    @staticmethod
    def load_type_models(letters_path, syllables_path, syllabled_letters_path):
        nn_output_dim = 13

        # letters
        conv_input_shape = (23, 36)
        othr_input = (150,)
        conv_input = Input(shape=conv_input_shape, name='conv_input')
        # letters
        x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
        x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)

        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input, name='othr_input')
        x = concatenate([x_conv, othr_input])
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
        letter_type_model.load_weights(letters_path)

        # syllables
        conv_input_shape = (10, 5168)
        othr_input = (150,)
        conv_input = Input(shape=conv_input_shape, name='conv_input')

        x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input, name='othr_input')
        x = concatenate([x_conv, othr_input])
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
        syllable_type_model.load_weights(syllables_path)

        # syllabled letters
        conv_input_shape = (10, 252)
        othr_input = (150,)
        conv_input = Input(shape=conv_input_shape, name='conv_input')

        x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
        x_conv = MaxPooling1D(pool_size=2)(x_conv)
        x_conv = Flatten()(x_conv)

        othr_input = Input(shape=othr_input, name='othr_input')
        x = concatenate([x_conv, othr_input])
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(nn_output_dim, activation='sigmoid')(x)

        syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
        opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
        syllabled_letter_type_model.load_weights(syllabled_letters_path)

        return letter_type_model, syllable_type_model, syllabled_letter_type_model

    @staticmethod
    def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
                                          dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
        batch_size = 16
        # print(tagged_input_words[pos])

        data = Data('l', shuffle_all_inputs=False, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
                                                             feature_dictionary, 'who cares')
        generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
        letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))

        data = Data('s', shuffle_all_inputs=False, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
                                                             accented_vowels, feature_dictionary, 'who cares')
        eye = np.eye(len(syllable_dictionary), dtype=int)
        generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
        syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))

        data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
                                                             accented_vowels, feature_dictionary, 'who cares')
        max_syllable = data._get_max_syllable(syllable_dictionary)
        syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
        generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
        syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))

        return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)

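The predict_generator calls above pass len(x) / batch_size, which is a float under Python 3 and may not cover the final partial batch, depending on the Keras version. If the intent is to run every example through each model, an explicit rounded-up integer step count states it directly (a sketch, an assumption about intent, not part of this commit):

    import math

    steps = int(math.ceil(len(x) / float(batch_size)))  # covers the final partial batch
    letter_location_predictions = letter_location_model.predict_generator(generator, steps)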
    @staticmethod
    def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
                                      dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
        batch_size = 16
        y_array = np.asarray(location_y)
        accentuation_length = (y_array > 0).sum()

        data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
                                                             feature_dictionary, 'who cares')
        generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
        letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))

        data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
                                                             accented_vowels, feature_dictionary, 'who cares')
        eye = np.eye(len(syllable_dictionary), dtype=int)
        generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
        syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))

        data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
        x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
                                                             accented_vowels, feature_dictionary, 'who cares')
        max_syllable = data._get_max_syllable(syllable_dictionary)
        syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
        generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
        syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)

        return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)

    def assign_location_stress(self, word, locations, vowels):
        # word = list(word)
        word_list = list(word)
        for loc in locations:
            vowel_num = 0
            # if loc == 0:
            #     return word
            for i in range(len(word_list)):
                if self._is_vowel(word_list, i, vowels):
                    if word_list[i] == 'a' and vowel_num == loc:
                        word_list[i] = 'á'
                    elif word_list[i] == 'e' and vowel_num == loc:
                        word_list[i] = 'é'
                    elif word_list[i] == 'i' and vowel_num == loc:
                        word_list[i] = 'í'
                    elif word_list[i] == 'o' and vowel_num == loc:
                        word_list[i] = 'ó'
                    elif word_list[i] == 'u' and vowel_num == loc:
                        word_list[i] = 'ú'
                    elif word_list[i] == 'r' and vowel_num == loc:
                        word_list[i] = 'ŕ'
                    elif word_list[i] == 'A' and vowel_num == loc:
                        word_list[i] = 'Á'
                    elif word_list[i] == 'E' and vowel_num == loc:
                        word_list[i] = 'É'
                    elif word_list[i] == 'I' and vowel_num == loc:
                        word_list[i] = 'Í'
                    elif word_list[i] == 'O' and vowel_num == loc:
                        word_list[i] = 'Ó'
                    elif word_list[i] == 'U' and vowel_num == loc:
                        word_list[i] = 'Ú'
                    elif word_list[i] == 'R' and vowel_num == loc:
                        word_list[i] = 'Ŕ'
                    vowel_num += 1
        # print(word_list)
        return ''.join(word_list)

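assign_location_stress spells out one branch per vowel/case pair, but the ladder is a fixed character substitution, so a dict can carry it (hypothetical refactor, behaviour unchanged):

    STRESS_MAP = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'r': 'ŕ',
                  'A': 'Á', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 'R': 'Ŕ'}

    def stress_char(c):
        # Replaces only the if/elif ladder; the vowel_num == loc bookkeeping stays as-is.
        return STRESS_MAP.get(c, c)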
    def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
                        letter_type_model, syllable_type_model, syllabled_letter_type_model,
                        dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
        predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
                                                             syllabled_letters_location_model,
                                                             dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
                                                             syllable_dictionary)
        if 'A' not in vowels:
            vowels.extend(['A', 'E', 'I', 'O', 'U'])
        location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
                                   range(len(input_words))]

        location_y = np.around(predictions)
        type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
                                                              syllabled_letter_type_model,
                                                              dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
                                                              syllable_dictionary)

        only_words = [el[0] for el in input_words]
        accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)

        return location_accented_words, accented_words

# def count_vowels(content, vowels):
#     num_all_vowels = 0
#     for el in content:
161 sloleks_accentuation.py (new file)
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import numpy as np
from keras.models import load_model
import sys

from prepare_data import *

np.random.seed(7)

data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']


letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')

letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
    'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
    'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')

from lxml import etree


def xml_words_generator(xml_path):
    for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
        words = []
        for child in element:
            if child.tag == 'WordForm':
                msd = None
                word = None
                for wf in child:
                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
                        msd = wf.attrib['val']
                    elif wf.tag == 'FormRepresentation':
                        for form_rep in wf:
                            if form_rep.attrib['att'] == 'zapis_oblike':
                                word = form_rep.attrib['val']
                        # if msd is not None and word is not None:
                        #     pass
                        # else:
                        #     print('NOOOOO')
                words.append([word, '', msd, word])
        yield words


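xml_words_generator walks a multi-gigabyte Sloleks dump, and etree.iterparse keeps every parsed element in memory unless it is cleared; only the main loop further down calls element.clear(). Given the "RAM lack" in the commit message, the standard lxml clearing idiom applied inside the generator itself is worth noting (a sketch, an assumption about intent):

    def iterparse_low_memory(xml_path):
        for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
            yield element                        # caller does its per-entry work here
            element.clear()                      # drop the subtree that was just consumed
            while element.getprevious() is not None:
                del element.getparent()[0]       # drop already-processed siblings held by the root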
gen = xml_words_generator('data/Sloleks_v1.2.xml')

# Words processed: 650250
# Word index: 50023
# Word number: 50023

from lxml import etree
import time

gen = xml_words_generator('data/Sloleks_v1.2.xml')
word_glob_num = 0
word_limit = 0
iter_num = 50000
word_index = 0
start_timer = time.time()
iter_index = 0
words = []

lexical_entries_load_number = 0
lexical_entries_save_number = 0

# INSIDE
word_glob_num = 1500686

word_limit = 50000
iter_index = 30

done_lexical_entries = 33522

import gc

with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
    myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
        # LOAD NEW WORDS AND ACCENTUATE THEM
        # print("HERE")

        if lexical_entries_save_number < done_lexical_entries:
            g = next(gen)
            # print(lexical_entries_save_number)
            lexical_entries_save_number += 1
            lexical_entries_load_number += 1
            print(lexical_entries_save_number)
            del g
            gc.collect()
            continue

        if word_glob_num >= word_limit:
            myfile2.close()
            myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
            iter_index += 1
            print("Words processed: " + str(word_glob_num))

            print("Word index: " + str(word_index))
            print("Word number: " + str(len(words)))

            print("lexical_entries_load_number: " + str(lexical_entries_load_number))
            print("lexical_entries_save_number: " + str(lexical_entries_save_number))

            end_timer = time.time()
            print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")

            word_index = 0
            words = []

            while len(words) < iter_num:
                try:
                    words.extend(next(gen))
                    lexical_entries_load_number += 1
                except:
                    break
            # if word_glob_num > 1:
            #     break

            data = Data('l', shuffle_all_inputs=False)
            location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
                                                                           syllabled_letters_location_model,
                                                                           letter_type_model, syllable_type_model, syllabled_letter_type_model,
                                                                           dictionary, max_word, max_num_vowels, vowels, accented_vowels,
                                                                           feature_dictionary, syllable_dictionary)

            word_limit += len(words)

        # READ DATA
        for child in element:
            if child.tag == 'WordForm':
                msd = None
                word = None
                for wf in child:
                    if wf.tag == 'FormRepresentation':
                        new_element = etree.Element('feat')
                        new_element.attrib['att'] = 'naglasna_mesta_oblike'
                        new_element.attrib['val'] = location_accented_words[word_index]
                        wf.append(new_element)

                        new_element = etree.Element('feat')
                        new_element.attrib['att'] = 'naglašena_oblika'
                        new_element.attrib['val'] = accented_words[word_index]
                        wf.append(new_element)
                word_glob_num += 1
                word_index += 1

        # print(etree.tostring(element, encoding="UTF-8"))
        myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
        element.clear()
        lexical_entries_save_number += 1
File diff suppressed because one or more lines are too long