[MAJOR REFACTOR] Added accent classification (from scratch) and deleted the unnecessary y output (the output used where no accent should be employed). The X-es for both syllabled inputs have also been changed in a similar manner.

master
lkrsnik 7 years ago
parent 83584a0c6f
commit 18348b78fc

@ -2,10 +2,8 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" afterPath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list>
@ -37,22 +35,25 @@
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="568">
<caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
<state relative-caret-position="244">
<caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5524#5637#0" expanded="false" />
<element signature="e#5684#6970#0" expanded="false" />
<element signature="e#7131#8538#0" expanded="false" />
<element signature="e#8626#8921#0" expanded="false" />
<element signature="e#13363#13665#0" expanded="false" />
<element signature="e#13722#14551#0" expanded="false" />
<element signature="e#14615#14961#0" expanded="false" />
<element signature="e#16836#17749#0" expanded="false" />
<element signature="e#18179#18375#0" expanded="false" />
<element signature="e#18436#18627#0" expanded="false" />
<element signature="e#18694#19341#0" expanded="false" />
<element signature="e#19440#21738#0" expanded="false" />
<element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
</folding>
</state>
</provider>
@ -149,16 +150,6 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>_create_X_features</find>
<find>raise</find>
<find>create_syllables_dictionary</find>
<find>decode_</find>
<find>create_x_features</find>
<find>generate_x_and_y</find>
<find>create_syllables</find>
<find>split_consonants</find>
<find>get_unresonant_silent_consonants</find>
<find>dict_occurances_in_dataset_rate</find>
<find>count_vowels</find>
<find>shuffle_full_vowel_inputs</find>
<find>generate_presentable_y</find>
@ -179,6 +170,16 @@
<find>_create_syllable_letters_translator</find>
<find>_get_unresonant_silent_consonants</find>
<find>el[0]</find>
<find>max_num_vowels</find>
<find>index</find>
<find>accentuated</find>
<find>create_syll</find>
<find>shuffle_all_inputs</find>
<find>accented</find>
<find>_accented</find>
<find>size</find>
<find>decode_x</find>
<find>self._input_type ==</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -532,7 +533,7 @@
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<option name="time" value="5" />
<option name="time" value="6" />
</breakpoint-manager>
<watches-manager />
</component>
@ -831,22 +832,25 @@
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="568">
<caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
<state relative-caret-position="244">
<caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#5524#5637#0" expanded="false" />
<element signature="e#5684#6970#0" expanded="false" />
<element signature="e#7131#8538#0" expanded="false" />
<element signature="e#8626#8921#0" expanded="false" />
<element signature="e#13363#13665#0" expanded="false" />
<element signature="e#13722#14551#0" expanded="false" />
<element signature="e#14615#14961#0" expanded="false" />
<element signature="e#16836#17749#0" expanded="false" />
<element signature="e#18179#18375#0" expanded="false" />
<element signature="e#18436#18627#0" expanded="false" />
<element signature="e#18694#19341#0" expanded="false" />
<element signature="e#19440#21738#0" expanded="false" />
<element signature="e#5658#5771#0" expanded="false" />
<element signature="e#5818#7106#0" expanded="false" />
<element signature="e#7267#8674#0" expanded="false" />
<element signature="e#8762#9057#0" expanded="false" />
<element signature="e#13496#13798#0" expanded="false" />
<element signature="e#13855#14684#0" expanded="false" />
<element signature="e#14748#15094#0" expanded="false" />
<element signature="e#16969#17882#0" expanded="false" />
<element signature="e#18312#18508#0" expanded="false" />
<element signature="e#18569#18760#0" expanded="false" />
<element signature="e#18827#19474#0" expanded="false" />
<element signature="e#19573#21871#0" expanded="false" />
<element signature="e#22137#22836#0" expanded="false" />
<element signature="e#29631#29772#0" expanded="false" />
<element signature="e#29922#32067#0" expanded="false" />
</folding>
</state>
</provider>

@ -11,13 +11,14 @@ import os.path
class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True):
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
self._input_type = input_type
self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
self._shuffle_all_inputs = shuffle_all_inputs
self._additional_letter_attributes = additional_letter_attributes
self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self.x_train = None
self.x_other_features_train = None
@ -30,14 +31,14 @@ class Data:
self.y_validate = None
def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
content_name='SlovarIJS_BESEDE_utf8.lex',
force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
inputs_location='../../internal_representations/inputs/', content_location='../../../data/'):
content_path = '{}{}'.format(content_location, content_name)
train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
if os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
print('LOADING DATA...')
self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
@ -62,7 +63,7 @@ class Data:
print('CONTENT READ SUCCESSFULLY')
print('CREATING DICTIONARY...')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
if self._input_type == 's' or self._input_type == 'ls':
if self._input_type == 's' or self._input_type == 'sl':
dictionary = self._create_syllables_dictionary(content, vowels)
print('DICTIONARY CREATION SUCCESSFUL!')
# test_and_validation_size = 0.1
@ -125,7 +126,7 @@ class Data:
break
line += 1
dictionary_input = sorted(dictionary_input)
max_num_vowels += 1
# max_num_vowels += 1
return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels
# split content so that there is no overfitting
@ -230,23 +231,22 @@ class Data:
word = word[::-1]
j = 0
word_accentuations = []
# word_accentuations = []
num_vowels = 0
for c in list(word):
index = 0
if self._is_vowel(word, j, vowels):
num_vowels += 1
for d in accentuated_vowels:
if c == d:
word_accentuations.append(num_vowels)
if not self._accent_classification:
y[i][num_vowels] = 1
else:
y[i][num_vowels] = index
# word_accentuations.append(num_vowels)
break
index += 1
if self._is_vowel(word, j, vowels):
num_vowels += 1
j += 1
if len(word_accentuations) > 0:
for word_accentuation in word_accentuations:
y[i][word_accentuation] = 1
else:
y[i][0] = 1
i += 1
return y
@ -255,10 +255,10 @@ class Data:
shuffle_vector_location):
if self._input_type == 'l':
x = self._x_letter_input(content, dictionary, max_word, vowels)
elif self._input_type == 's' or self._input_type == 'ls':
elif self._input_type == 's' or self._input_type == 'sl':
x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
else:
raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'ls\'.')
raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)
print('CREATING OTHER FEATURES...')
@ -476,46 +476,112 @@ class Data:
def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
if self._input_type == 'l':
return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size)
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
elif self._input_type == 's':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
eye = np.eye(len(syllable_dictionary), dtype=int)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
elif self._input_type == 'sl':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
max_syllable = self._get_max_syllable(syllable_dictionary)
syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
# generator for inputs for tracking of data fitting
def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size):
def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
size = orig_x.shape[0]
while 1:
loc = 0
while loc < size:
if loc + batch_size >= size:
yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
else:
yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
if self._accent_classification:
eye = np.eye(len(accented_vowels), dtype=int)
eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
while loc < size:
while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
yield ([np.array(input_x_stack[:batch_size]),
np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
# print('BBB')
# print(np.array(input_stack))
# yield (np.array(input_stack))
yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
else:
yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
# generator for inputs for tracking of data fitting
def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator):
def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
size = orig_x.shape[0]
while 1:
loc = 0
while loc < size:
if loc + batch_size >= size:
gen_orig_x = translator[orig_x[loc:size]]
yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
else:
gen_orig_x = translator[orig_x[loc:loc + batch_size]]
yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
if self._accent_classification:
eye = np.eye(len(accented_vowels), dtype=int)
eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
while loc < size:
while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
gen_orig_x = translator[np.array(input_x_stack)]
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
gen_orig_x = translator[orig_x[loc:size]]
yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
else:
gen_orig_x = translator[orig_x[loc:loc + batch_size]]
yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
def _get_max_syllable(self, syllable_dictionary):
max_len = 0

Loading…
Cancel
Save