Commit before major RAM-shortage update

lkrsnik 2018-03-21 11:35:05 +01:00
parent a316574314
commit 9edad0ad07
4 changed files with 1367 additions and 1108 deletions

.idea/workspace.xml

@@ -2,11 +2,19 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/ensemble_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_error.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables_word_accetuation_test_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/connected_text_accetuation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v3_10/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" /> <option name="TRACKING_ENABLED" value="true" />
@@ -36,19 +44,28 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true"> <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298"> <state relative-caret-position="-1034">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" /> <caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#6485#7773#0" expanded="false" />
<element signature="e#9429#9724#0" expanded="false" />
<element signature="e#15725#16027#0" expanded="false" />
<element signature="e#17000#17346#0" expanded="false" />
<element signature="e#21415#22062#0" expanded="false" />
<element signature="e#32751#32892#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>
@@ -77,8 +94,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false"> <file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/workbench.py"> <entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044"> <state relative-caret-position="1710">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" /> <caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@@ -165,23 +182,6 @@
</component> </component>
<component name="FindInProjectRecents"> <component name="FindInProjectRecents">
<findStrings> <findStrings>
<find>predict</find>
<find>_reverse_inputs</find>
<find>_letter_generator</find>
<find>_create_feature_dictionary</find>
<find>generate_data</find>
<find>Data</find>
<find>shuffle_vector</find>
<find>shuffle_vector_path</find>
<find>fit_generator</find>
<find>../../../data/</find>
<find>self.x_other_features_train</find>
<find>_create_x_features</find>
<find>force</find>
<find>test_and_validation_size</find>
<find>self.y_train</find>
<find>_additional_letter_attributes</find>
<find>np.random.seed</find>
<find>round</find> <find>round</find>
<find>is_vow</find> <find>is_vow</find>
<find>self._input_type == 'l'</find> <find>self._input_type == 'l'</find>
@@ -195,6 +195,23 @@
<find>_generator</find> <find>_generator</find>
<find>_create_syllable_letters_translator</find> <find>_create_syllable_letters_translator</find>
<find>_accent_classification</find> <find>_accent_classification</find>
<find>wrong</find>
<find>wrong_word</find>
<find>predict</find>
<find>get_ensemble_type_predictions</find>
<find>_convert_to_multext_east_v4</find>
<find>_split_consonants</find>
<find>UNRECOGNIZED</find>
<find>word_glob_num</find>
<find>convert_multext</find>
<find>_syllable_generator</find>
<find>generator</find>
<find>generate_data</find>
<find>_x</find>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>_generate_x_and_y</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@@ -216,13 +233,14 @@
<option value="$PROJECT_DIR$/notes" /> <option value="$PROJECT_DIR$/notes" />
<option value="$PROJECT_DIR$/workbench.xrsl" /> <option value="$PROJECT_DIR$/workbench.xrsl" />
<option value="$PROJECT_DIR$/workbench.py" /> <option value="$PROJECT_DIR$/workbench.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation.py" />
<option value="$PROJECT_DIR$/prepare_data.py" /> <option value="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
</option> </option>
</component> </component>
<component name="ProjectFrameBounds"> <component name="ProjectFrameBounds">
<option name="x" value="65" /> <option name="x" value="65" />
<option name="y" value="144" /> <option name="y" value="24" />
<option name="width" value="1855" /> <option name="width" value="1855" />
<option name="height" value="1056" /> <option name="height" value="1056" />
</component> </component>
@@ -241,8 +259,6 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane"> <pane id="ProjectPane">
<subPane> <subPane>
<PATH> <PATH>
@@ -257,11 +273,13 @@
</PATH> </PATH>
</subPane> </subPane>
</pane> </pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" /> <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="last_opened_file_path" value="$USER_HOME$/miniconda3/bin/python" /> <property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component> </component>
<component name="RecentsManager"> <component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS"> <key name="CopyFile.RECENT_KEYS">
@@ -519,7 +537,7 @@
<servers /> <servers />
</component> </component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="65" y="144" width="1855" height="1056" extended-state="6" /> <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" /> <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.12227074" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -574,17 +592,6 @@
<watches-manager /> <watches-manager />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368"> <state relative-caret-position="1368">
@@ -920,8 +927,19 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/workbench.py"> <entry file="file://$PROJECT_DIR$/workbench.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1044"> <state relative-caret-position="1710">
<caret line="69" column="22" lean-forward="false" selection-start-line="69" selection-start-column="22" selection-end-line="69" selection-end-column="22" /> <caret line="106" column="30" lean-forward="false" selection-start-line="106" selection-start-column="30" selection-end-line="106" selection-end-column="39" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../adventofcode/2017/2/1.py" />
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
</folding> </folding>
@@ -930,19 +948,16 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="298"> <state relative-caret-position="-1034">
<caret line="514" column="33" lean-forward="false" selection-start-line="514" selection-start-column="20" selection-end-line="514" selection-end-column="33" /> <caret line="71" column="114" lean-forward="true" selection-start-line="68" selection-start-column="12" selection-end-line="71" selection-end-column="114" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5979#7267#0" expanded="false" /> <element signature="e#6485#7773#0" expanded="false" />
<element signature="e#8923#9218#0" expanded="false" /> <element signature="e#9429#9724#0" expanded="false" />
<element signature="e#13768#14070#0" expanded="false" /> <element signature="e#15725#16027#0" expanded="false" />
<element signature="e#14127#14956#0" expanded="false" /> <element signature="e#17000#17346#0" expanded="false" />
<element signature="e#15020#15366#0" expanded="false" /> <element signature="e#21415#22062#0" expanded="false" />
<element signature="e#18933#19129#0" expanded="false" /> <element signature="e#32751#32892#0" expanded="false" />
<element signature="e#19448#20095#0" expanded="false" />
<element signature="e#20194#22492#0" expanded="false" />
<element signature="e#30252#30393#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>

prepare_data.py

@@ -9,10 +9,20 @@ import keras.backend as K
import os.path import os.path
import codecs import codecs
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.models import load_model
class Data: class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True, def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False): additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
convert_multext=True, bidirectional_basic_input=False):
self._input_type = input_type self._input_type = input_type
self._save_generated_data = save_generated_data self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
@@ -21,6 +31,8 @@ class Data:
self._reverse_inputs = reverse_inputs self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables self._number_of_syllables = number_of_syllables
self._convert_multext = convert_multext
self._bidirectional_basic_input = bidirectional_basic_input
self.x_train = None self.x_train = None
self.x_other_features_train = None self.x_other_features_train = None
@@ -169,14 +181,20 @@ class Data:
def _x_letter_input(self, content, dictionary, max_word, vowels): def _x_letter_input(self, content, dictionary, max_word, vowels):
if self._additional_letter_attributes: if self._additional_letter_attributes:
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int) x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
voiced_consonants = self._get_voiced_consonants() voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants() resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants() nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
# print('HERE!!!') # print('HERE!!!')
else: else:
# print('HERE!!!') # print('HERE!!!')
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int) x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)
i = 0 i = 0
for el in content: for el in content:
@@ -185,25 +203,44 @@ class Data:
word = word[::-1] word = word[::-1]
j = 0 j = 0
for c in list(word): for c in list(word):
if j >= max_word:
continue
index = 0 index = 0
if self._bidirectional_basic_input:
j2 = max_word + (len(word) - j - 1)
for d in dictionary: for d in dictionary:
if c == d: if c == d:
x[i][j][index] = 1 x[i][j][index] = 1
if self._bidirectional_basic_input:
x[i][j2][index] = 1
break break
index += 1 index += 1
if self._additional_letter_attributes: if self._additional_letter_attributes:
if self._is_vowel(word, j, vowels): if self._is_vowel(word, j, vowels):
x[i][j][len(dictionary)] = 1 x[i][j][len(dictionary)] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary)] = 1
else: else:
x[i][j][len(dictionary) + 1] = 1 x[i][j][len(dictionary) + 1] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 1] = 1
if c in voiced_consonants: if c in voiced_consonants:
x[i][j][len(dictionary) + 2] = 1 x[i][j][len(dictionary) + 2] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 2] = 1
else: else:
x[i][j][len(dictionary) + 3] = 1 x[i][j][len(dictionary) + 3] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 3] = 1
if c in resonant_silent_consonants: if c in resonant_silent_consonants:
x[i][j][len(dictionary) + 4] = 1 x[i][j][len(dictionary) + 4] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 4] = 1
elif c in nonresonant_silent_consonants: elif c in nonresonant_silent_consonants:
x[i][j][len(dictionary) + 5] = 1 x[i][j][len(dictionary) + 5] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 5] = 1
j += 1 j += 1
i += 1 i += 1
return x return x
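The new bidirectional_basic_input option doubles the time axis of the one-hot matrix: rows [0, max_word) hold the word as read, and rows [max_word, 2*max_word) hold the same encoding mirrored through j2 = max_word + (len(word) - j - 1). A minimal sketch of just that layout, with a made-up dictionary and word:

import numpy as np

dictionary = ['a', 'b', 'c']   # toy dictionary, not the real one
max_word = 5
word = 'cab'
x = np.zeros((2 * max_word, len(dictionary)), dtype=int)
for j, c in enumerate(word):
    j2 = max_word + (len(word) - j - 1)  # mirrored slot in the second half
    x[j][dictionary.index(c)] = 1        # forward copy
    x[j2][dictionary.index(c)] = 1       # reversed copy
# Rows 0-2 now encode 'c', 'a', 'b'; rows 5-7 encode 'b', 'a', 'c'.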
@@ -218,6 +255,8 @@ class Data:
if self._reverse_inputs: if self._reverse_inputs:
syllables = syllables[::-1] syllables = syllables[::-1]
for syllable in syllables: for syllable in syllables:
if j >= max_num_vowels:
continue
if syllable in dictionary: if syllable in dictionary:
index = dictionary.index(syllable) index = dictionary.index(syllable)
else: else:
@@ -297,7 +336,7 @@ class Data:
consonants.append(word_list[i]) consonants.append(word_list[i])
syllables.append(''.join(consonants)) syllables.append(''.join(consonants))
else: else:
left_consonants, right_consonants = self._split_consonants(consonants) left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
syllables[-1] += ''.join(left_consonants) syllables[-1] += ''.join(left_consonants)
right_consonants.append(word_list[i]) right_consonants.append(word_list[i])
syllables.append(''.join(right_consonants)) syllables.append(''.join(right_consonants))
@@ -344,9 +383,7 @@ class Data:
elif consonants[i] in unresonant_silent_consonants: elif consonants[i] in unresonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants: if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 4]) split_options.append([i, 4])
else:
print(consonants)
print('UNRECOGNIZED LETTERS!')
if split_options == []: if split_options == []:
return [''], consonants return [''], consonants
else: else:
@@ -358,7 +395,10 @@ class Data:
x_other_features = [] x_other_features = []
for el in content: for el in content:
x_el_other_features = [] x_el_other_features = []
if self._convert_multext:
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary)) converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
else:
converted_el = el[2]
for feature in feature_dictionary: for feature in feature_dictionary:
if converted_el[0] == feature[1]: if converted_el[0] == feature[1]:
x_el_other_features.append(1) x_el_other_features.append(1)
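The convert_multext flag added above lets callers pass morphosyntactic tags that are already in MULTEXT-East v4 form, bypassing _convert_to_multext_east_v4; the ensemble helpers below construct Data exactly this way. A hedged usage sketch (the tag value is illustrative):

# Tags arrive pre-converted, so el[2] is used verbatim.
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
words = [['gospodar', '', 'Somei', 'gospodar']]  # hypothetical [word, '', msd, word] row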
@@ -582,6 +622,15 @@ class Data:
input_x_other_features_stack = input_x_other_features_stack[batch_size:] input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:] input_y_stack = input_y_stack[batch_size:]
else: else:
#print('-------------------------------------------------------------------------------------------')
#if dictionary is not None:
# print(self.decode_x(word_encoded, dictionary))
#print(input_x_stack)
#print(input_x_other_features_stack)
#print(input_y_stack)
#print(loc)
if len(input_x_stack) == 0:
continue
gen_orig_x = translator[np.array(input_x_stack)] gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack)) yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = [] input_x_stack = []
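The new len(input_x_stack) == 0 guard covers the case where the dataset size is an exact multiple of batch_size: the final leftover batch is then empty, and yielding zero-length arrays to predict_generator would fail. Illustration with made-up sizes:

examples = list(range(32))            # hypothetical: size is a multiple of the batch
batch_size = 16
leftover = examples[2 * batch_size:]  # [] -- nothing remains for a final batch
if len(leftover) == 0:
    pass                              # the guard skips yielding this empty batch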
@@ -1005,6 +1054,310 @@ class Data:
else: else:
return ''.join(word_list[::-1]) return ''.join(word_list[::-1])
def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
words = []
accentuation_index = 0
for i in range(len(y)):
wrong_word = word[i][::-1]
for j in range(len(y[i])):
if y[i][j] > 0:
stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
if np.max(possible_predictions) != 0:
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
accentuation_index += 1
words.append(wrong_word[::-1])
return words
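assign_stress_types masks the 13 stress-type outputs down to the slots compatible with the stressed letter before taking the argmax ('r' has a single accented form, 'e' and 'o' have three each). The same masking restated table-style; the dict and helper are illustrative, not part of the commit:

import numpy as np

PLACES_BY_LETTER = {'r': [0], 'a': [1, 2], 'e': [3, 4, 5],
                    'i': [6, 7], 'o': [8, 9, 10], 'u': [11, 12]}

def mask_predictions(stressed_letter, predictions):
    possible = np.zeros(len(predictions))
    possible[PLACES_BY_LETTER.get(stressed_letter, [])] = 1
    return predictions * possible  # zero out incompatible stress types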
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
conv_input_shape = (10, 5168)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllables
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letters_location_model.load_weights(syllabled_letters_path)
return letter_location_model, syllable_location_model, syllabled_letters_location_model
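Usage sketch for the loader, using the checkpoint paths that the new sloleks_accentuation.py passes in below:

letter_m, syllable_m, syllabled_m = Data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')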
@staticmethod
def load_type_models(letters_path, syllables_path, syllabled_letters_path):
nn_output_dim = 13
# letters
conv_input_shape = (23, 36)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# letters
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_type_model.load_weights(letters_path)
conv_input_shape = (10, 5168)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_type_model.load_weights(syllables_path)
# syllabled letters
conv_input_shape = (10, 252)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letter_type_model.load_weights(syllabled_letters_path)
return letter_type_model, syllable_type_model, syllabled_letter_type_model
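The four syllable and syllabled-letter networks above are architecturally identical (Conv1D(200) → MaxPooling1D → Flatten, then three Dense(256)/Dropout(0.3) pairs into a sigmoid output), differing only in input shape and output width. A hedged refactoring sketch that could collapse the repetition, reusing the imports added at the top of the file; build_head is hypothetical and actual_accuracy is the module-level metric already referenced above:

def build_head(conv_shape, othr_shape, out_dim, weights_path):
    conv_input = Input(shape=conv_shape, name='conv_input')
    x_conv = Conv1D(200, 2, padding='same', activation='relu')(conv_input)
    x_conv = MaxPooling1D(pool_size=2)(x_conv)
    x_conv = Flatten()(x_conv)
    othr_input = Input(shape=othr_shape, name='othr_input')
    x = concatenate([x_conv, othr_input])
    for _ in range(3):
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
    x = Dense(out_dim, activation='sigmoid')(x)
    model = Model(inputs=[conv_input, othr_input], outputs=x)
    opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy])
    model.load_weights(weights_path)
    return model

# e.g. the syllable type model above: build_head((10, 5168), (150,), 13, syllables_path)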
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('s', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)
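The ensemble is a plain element-wise mean of the three models' per-position stress probabilities; a quick numeric check of the np.mean(..., axis=0) call with made-up outputs:

import numpy as np

letter    = np.array([[0.9, 0.1]])  # made-up per-model probabilities
syllable  = np.array([[0.7, 0.2]])
syllabled = np.array([[0.8, 0.3]])
print(np.mean(np.array([letter, syllable, syllabled]), axis=0))  # [[0.8 0.2]]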
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
accentuation_length = (y_array > 0).sum()
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)
def assign_location_stress(self, word, locations, vowels):
# word = list(word)
word_list = list(word)
for loc in locations:
vowel_num = 0
# if loc == 0:
# return word
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if word_list[i] == 'a' and vowel_num == loc:
word_list[i] = 'á'
elif word_list[i] == 'e' and vowel_num == loc:
word_list[i] = 'é'
elif word_list[i] == 'i' and vowel_num == loc:
word_list[i] = 'í'
elif word_list[i] == 'o' and vowel_num == loc:
word_list[i] = 'ó'
elif word_list[i] == 'u' and vowel_num == loc:
word_list[i] = 'ú'
elif word_list[i] == 'r' and vowel_num == loc:
word_list[i] = 'ŕ'
elif word_list[i] == 'A' and vowel_num == loc:
word_list[i] = 'Á'
elif word_list[i] == 'E' and vowel_num == loc:
word_list[i] = 'É'
elif word_list[i] == 'I' and vowel_num == loc:
word_list[i] = 'Í'
elif word_list[i] == 'O' and vowel_num == loc:
word_list[i] = 'Ó'
elif word_list[i] == 'U' and vowel_num == loc:
word_list[i] = 'Ú'
elif word_list[i] == 'R' and vowel_num == loc:
word_list[i] = 'Ŕ'
vowel_num += 1
# print(word_list)
return ''.join(word_list)
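assign_location_stress acute-accents the loc-th vowel through the elif chain above; the same mapping in table form, an equivalent sketch rather than the committed code:

ACUTE = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'r': 'ŕ',
         'A': 'Á', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 'R': 'Ŕ'}

def accent_at(word_list, i, vowel_num, loc):
    # Replace the loc-th vowel (counting vowels from 0) with its acute form.
    if vowel_num == loc and word_list[i] in ACUTE:
        word_list[i] = ACUTE[word_list[i]]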
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(input_words))]
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
only_words = [el[0] for el in input_words]
accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)
return location_accented_words, accented_words
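accentuate_word is the new single entry point: location ensemble, rounded stress locations, type ensemble, fully accented forms. Minimal call shape, matching how sloleks_accentuation.py below drives it (the input row is illustrative):

words = [['gospodar', '', 'Somei', 'gospodar']]  # hypothetical tagged input row
location_accented, accented = data.accentuate_word(
    words, letter_location_model, syllable_location_model,
    syllabled_letters_location_model, letter_type_model,
    syllable_type_model, syllabled_letter_type_model,
    dictionary, max_word, max_num_vowels, vowels, accented_vowels,
    feature_dictionary, syllable_dictionary)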
# def count_vowels(content, vowels): # def count_vowels(content, vowels):
# num_all_vowels = 0 # num_all_vowels = 0
# for el in content: # for el in content:

sloleks_accentuation.py (new file, 161 lines)

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
from keras.models import load_model
import sys
from prepare_data import *
np.random.seed(7)
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')
letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')
from lxml import etree
def xml_words_generator(xml_path):
for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
words = []
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
msd = wf.attrib['val']
elif wf.tag == 'FormRepresentation':
for form_rep in wf:
if form_rep.attrib['att'] == 'zapis_oblike':
word = form_rep.attrib['val']
# if msd is not None and word is not None:
# pass
# else:
# print('NOOOOO')
words.append([word, '', msd, word])
yield words
gen = xml_words_generator('data/Sloleks_v1.2.xml')
# Words processed: 650250
# Word index: 50023
# Word number: 50023
from lxml import etree
import time
gen = xml_words_generator('data/Sloleks_v1.2.xml')
word_glob_num = 0
word_limit = 0
iter_num = 50000
word_index = 0
start_timer = time.time()
iter_index = 0
words = []
lexical_entries_load_number = 0
lexical_entries_save_number = 0
# INSIDE
word_glob_num = 1500686
word_limit = 50000
iter_index = 30
done_lexical_entries = 33522
import gc
with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
# LOAD NEW WORDS AND ACCENTUATE THEM
# print("HERE")
if lexical_entries_save_number < done_lexical_entries:
g = next(gen)
# print(lexical_entries_save_number)
lexical_entries_save_number += 1
lexical_entries_load_number += 1
print(lexical_entries_save_number)
del g
gc.collect()
continue
if word_glob_num >= word_limit:
myfile2.close()
myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
iter_index += 1
print("Words proccesed: " + str(word_glob_num))
print("Word indeks: " + str(word_index))
print("Word number: " + str(len(words)))
print("lexical_entries_load_number: " + str(lexical_entries_load_number))
print("lexical_entries_save_number: " + str(lexical_entries_save_number))
end_timer = time.time()
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
word_index = 0
words = []
while len(words) < iter_num:
try:
words.extend(next(gen))
lexical_entries_load_number += 1
except StopIteration:
break
# if word_glob_num > 1:
# break
data = Data('l', shuffle_all_inputs=False)
location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels,
feature_dictionary, syllable_dictionary)
word_limit += len(words)
# READ DATA
for child in element:
if child.tag == 'WordForm':
msd = None
word = None
for wf in child:
if wf.tag == 'FormRepresentation':
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglasna_mesta_oblike'
new_element.attrib['val'] = location_accented_words[word_index]
wf.append(new_element)
new_element = etree.Element('feat')
new_element.attrib['att'] = 'naglašena_oblika'
new_element.attrib['val'] = accented_words[word_index]
wf.append(new_element)
word_glob_num += 1
word_index += 1
# print(etree.tostring(element, encoding="UTF-8"))
myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
element.clear()
lexical_entries_save_number += 1
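The script is resumable: the hard-coded counters above (word_glob_num = 1500686, done_lexical_entries = 33522) record where a previous run stopped, and the first branch in the loop fast-forwards past entries already written, freeing each skipped chunk so iterparse memory stays bounded. A compact restatement of just the generator fast-forward; in the script it runs inside the iterparse loop so the matching XML elements are skipped as well:

# Consume generator output for entries already present in new_sloleks.xml.
while lexical_entries_save_number < done_lexical_entries:
    next(gen)
    lexical_entries_save_number += 1
    lexical_entries_load_number += 1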

File diff suppressed because one or more lines are too long