[MAJOR REFACTOR] Added accent classification (from scratch) and deleted the unnecessary y output (the output used where no accent should be employed). The X-es for both syllabled inputs have also been changed in a similar manner.

This commit is contained in:
lkrsnik 2017-07-27 18:20:18 +02:00
parent 83584a0c6f
commit 18348b78fc
2 changed files with 146 additions and 76 deletions

View File

@ -2,10 +2,8 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" afterPath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
@ -37,22 +35,25 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true"> <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="568"> <state relative-caret-position="244">
<caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" /> <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5524#5637#0" expanded="false" /> <element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5684#6970#0" expanded="false" /> <element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7131#8538#0" expanded="false" /> <element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8626#8921#0" expanded="false" /> <element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13363#13665#0" expanded="false" /> <element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13722#14551#0" expanded="false" /> <element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14615#14961#0" expanded="false" /> <element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16836#17749#0" expanded="false" /> <element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18179#18375#0" expanded="false" /> <element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18436#18627#0" expanded="false" /> <element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18694#19341#0" expanded="false" /> <element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19440#21738#0" expanded="false" /> <element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>
@ -149,16 +150,6 @@
</component> </component>
<component name="FindInProjectRecents"> <component name="FindInProjectRecents">
<findStrings> <findStrings>
<find>_create_X_features</find>
<find>raise</find>
<find>create_syllables_dictionary</find>
<find>decode_</find>
<find>create_x_features</find>
<find>generate_x_and_y</find>
<find>create_syllables</find>
<find>split_consonants</find>
<find>get_unresonant_silent_consonants</find>
<find>dict_occurances_in_dataset_rate</find>
<find>count_vowels</find> <find>count_vowels</find>
<find>shuffle_full_vowel_inputs</find> <find>shuffle_full_vowel_inputs</find>
<find>generate_presentable_y</find> <find>generate_presentable_y</find>
@ -179,6 +170,16 @@
<find>_create_syllable_letters_translator</find> <find>_create_syllable_letters_translator</find>
<find>_get_unresonant_silent_consonants</find> <find>_get_unresonant_silent_consonants</find>
<find>el[0]</find> <find>el[0]</find>
<find>max_num_vowels</find>
<find>index</find>
<find>accentuated</find>
<find>create_syll</find>
<find>shuffle_all_inputs</find>
<find>accented</find>
<find>_accented</find>
<find>size</find>
<find>decode_x</find>
<find>self._input_type ==</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -532,7 +533,7 @@
</component> </component>
<component name="XDebuggerManager"> <component name="XDebuggerManager">
<breakpoint-manager> <breakpoint-manager>
<option name="time" value="5" /> <option name="time" value="6" />
</breakpoint-manager> </breakpoint-manager>
<watches-manager /> <watches-manager />
</component> </component>
@ -831,22 +832,25 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py"> <entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="568"> <state relative-caret-position="244">
<caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" /> <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<folding> <folding>
<element signature="e#24#63#0" expanded="true" /> <element signature="e#24#63#0" expanded="true" />
<element signature="e#5524#5637#0" expanded="false" /> <element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5684#6970#0" expanded="false" /> <element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7131#8538#0" expanded="false" /> <element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8626#8921#0" expanded="false" /> <element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13363#13665#0" expanded="false" /> <element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13722#14551#0" expanded="false" /> <element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14615#14961#0" expanded="false" /> <element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16836#17749#0" expanded="false" /> <element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18179#18375#0" expanded="false" /> <element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18436#18627#0" expanded="false" /> <element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18694#19341#0" expanded="false" /> <element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19440#21738#0" expanded="false" /> <element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
</folding> </folding>
</state> </state>
</provider> </provider>

View File

@ -11,13 +11,14 @@ import os.path
class Data: class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True, def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True): additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
self._input_type = input_type self._input_type = input_type
self._save_generated_data = save_generated_data self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
self._shuffle_all_inputs = shuffle_all_inputs self._shuffle_all_inputs = shuffle_all_inputs
self._additional_letter_attributes = additional_letter_attributes self._additional_letter_attributes = additional_letter_attributes
self._reverse_inputs = reverse_inputs self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self.x_train = None self.x_train = None
self.x_other_features_train = None self.x_other_features_train = None
@ -30,14 +31,14 @@ class Data:
self.y_validate = None self.y_validate = None
def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1, def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
content_name='SlovarIJS_BESEDE_utf8.lex', force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector', content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
inputs_location='../../internal_representations/inputs/', content_location='../../../data/'): inputs_location='../../internal_representations/inputs/', content_location='../../../data/'):
content_path = '{}{}'.format(content_location, content_name) content_path = '{}{}'.format(content_location, content_name)
train_path = '{}{}.h5'.format(inputs_location, train_inputs_name) train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
test_path = '{}{}.h5'.format(inputs_location, test_inputs_name) test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name) validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
if os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path): if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
print('LOADING DATA...') print('LOADING DATA...')
self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path) self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path) self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
@ -62,7 +63,7 @@ class Data:
print('CONTENT READ SUCCESSFULLY') print('CONTENT READ SUCCESSFULLY')
print('CREATING DICTIONARY...') print('CREATING DICTIONARY...')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
if self._input_type == 's' or self._input_type == 'ls': if self._input_type == 's' or self._input_type == 'sl':
dictionary = self._create_syllables_dictionary(content, vowels) dictionary = self._create_syllables_dictionary(content, vowels)
print('DICTIONARY CREATION SUCCESSFUL!') print('DICTIONARY CREATION SUCCESSFUL!')
# test_and_validation_size = 0.1 # test_and_validation_size = 0.1
@ -125,7 +126,7 @@ class Data:
break break
line += 1 line += 1
dictionary_input = sorted(dictionary_input) dictionary_input = sorted(dictionary_input)
max_num_vowels += 1 # max_num_vowels += 1
return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels
# split content so that there is no overfitting # split content so that there is no overfitting
@ -230,23 +231,22 @@ class Data:
word = word[::-1] word = word[::-1]
j = 0 j = 0
word_accentuations = [] # word_accentuations = []
num_vowels = 0 num_vowels = 0
for c in list(word): for c in list(word):
index = 0 index = 0
if self._is_vowel(word, j, vowels):
num_vowels += 1
for d in accentuated_vowels: for d in accentuated_vowels:
if c == d: if c == d:
word_accentuations.append(num_vowels) if not self._accent_classification:
y[i][num_vowels] = 1
else:
y[i][num_vowels] = index
# word_accentuations.append(num_vowels)
break break
index += 1 index += 1
if self._is_vowel(word, j, vowels):
num_vowels += 1
j += 1 j += 1
if len(word_accentuations) > 0:
for word_accentuation in word_accentuations:
y[i][word_accentuation] = 1
else:
y[i][0] = 1
i += 1 i += 1
return y return y
@ -255,10 +255,10 @@ class Data:
shuffle_vector_location): shuffle_vector_location):
if self._input_type == 'l': if self._input_type == 'l':
x = self._x_letter_input(content, dictionary, max_word, vowels) x = self._x_letter_input(content, dictionary, max_word, vowels)
elif self._input_type == 's' or self._input_type == 'ls': elif self._input_type == 's' or self._input_type == 'sl':
x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels) x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
else: else:
raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'ls\'.') raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels) y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
print('CREATING OTHER FEATURES...') print('CREATING OTHER FEATURES...')
@ -476,46 +476,112 @@ class Data:
def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path): def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
if self._input_type == 'l': if self._input_type == 'l':
return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size) content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
elif self._input_type == 's': elif self._input_type == 's':
content = self._read_content(content_path) content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels) syllable_dictionary = self._create_syllables_dictionary(content, vowels)
eye = np.eye(len(syllable_dictionary), dtype=int) eye = np.eye(len(syllable_dictionary), dtype=int)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye) return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
elif self._input_type == 'sl': elif self._input_type == 'sl':
content = self._read_content(content_path) content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels) syllable_dictionary = self._create_syllables_dictionary(content, vowels)
max_syllable = self._get_max_syllable(syllable_dictionary) max_syllable = self._get_max_syllable(syllable_dictionary)
syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator) return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
# generator for inputs for tracking of data fitting # generator for inputs for tracking of data fitting
def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size): def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
size = orig_x.shape[0] size = orig_x.shape[0]
while 1: while 1:
loc = 0 loc = 0
while loc < size: if self._accent_classification:
if loc + batch_size >= size: eye = np.eye(len(accented_vowels), dtype=int)
yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size]) eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
else: input_x_stack = []
yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) input_x_other_features_stack = []
loc += batch_size input_y_stack = []
while loc < size:
while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
yield ([np.array(input_x_stack[:batch_size]),
np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
# print('BBB')
# print(np.array(input_stack))
# yield (np.array(input_stack))
yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
else:
yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
# generator for inputs for tracking of data fitting # generator for inputs for tracking of data fitting
def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator): def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
size = orig_x.shape[0] size = orig_x.shape[0]
while 1: while 1:
loc = 0 loc = 0
while loc < size: if self._accent_classification:
if loc + batch_size >= size: eye = np.eye(len(accented_vowels), dtype=int)
gen_orig_x = translator[orig_x[loc:size]] eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size]) input_x_stack = []
else: input_x_other_features_stack = []
gen_orig_x = translator[orig_x[loc:loc + batch_size]] input_y_stack = []
yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) while loc < size:
loc += batch_size while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
gen_orig_x = translator[orig_x[loc:size]]
yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
else:
gen_orig_x = translator[orig_x[loc:loc + batch_size]]
yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
def _get_max_syllable(self, syllable_dictionary): def _get_max_syllable(self, syllable_dictionary):
max_len = 0 max_len = 0