[MAJOR REFACTOR] Added accent classification (from scratch) and deleted the unnecessary y output (the output used where no accent should be employed). X-es for both syllabled input types have also been changed in a similar manner.
This commit is contained in:
parent
83584a0c6f
commit
18348b78fc
.idea/workspace.xml

@@ -2,10 +2,8 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" afterPath="$PROJECT_DIR$/.idea/dictionaries/luka.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/syllables/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
@@ -37,22 +35,25 @@
 <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
   <entry file="file://$PROJECT_DIR$/prepare_data.py">
     <provider selected="true" editor-type-id="text-editor">
-      <state relative-caret-position="568">
-        <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+      <state relative-caret-position="244">
+        <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
         <folding>
           <element signature="e#24#63#0" expanded="true" />
-          <element signature="e#5524#5637#0" expanded="false" />
-          <element signature="e#5684#6970#0" expanded="false" />
-          <element signature="e#7131#8538#0" expanded="false" />
-          <element signature="e#8626#8921#0" expanded="false" />
-          <element signature="e#13363#13665#0" expanded="false" />
-          <element signature="e#13722#14551#0" expanded="false" />
-          <element signature="e#14615#14961#0" expanded="false" />
-          <element signature="e#16836#17749#0" expanded="false" />
-          <element signature="e#18179#18375#0" expanded="false" />
-          <element signature="e#18436#18627#0" expanded="false" />
-          <element signature="e#18694#19341#0" expanded="false" />
-          <element signature="e#19440#21738#0" expanded="false" />
+          <element signature="e#5658#5771#0" expanded="false" />
+          <element signature="e#5818#7106#0" expanded="false" />
+          <element signature="e#7267#8674#0" expanded="false" />
+          <element signature="e#8762#9057#0" expanded="false" />
+          <element signature="e#13496#13798#0" expanded="false" />
+          <element signature="e#13855#14684#0" expanded="false" />
+          <element signature="e#14748#15094#0" expanded="false" />
+          <element signature="e#16969#17882#0" expanded="false" />
+          <element signature="e#18312#18508#0" expanded="false" />
+          <element signature="e#18569#18760#0" expanded="false" />
+          <element signature="e#18827#19474#0" expanded="false" />
+          <element signature="e#19573#21871#0" expanded="false" />
+          <element signature="e#22137#22836#0" expanded="false" />
+          <element signature="e#29631#29772#0" expanded="false" />
+          <element signature="e#29922#32067#0" expanded="false" />
         </folding>
       </state>
     </provider>
@@ -149,16 +150,6 @@
 </component>
 <component name="FindInProjectRecents">
   <findStrings>
-    <find>_create_X_features</find>
-    <find>raise</find>
-    <find>create_syllables_dictionary</find>
-    <find>decode_</find>
-    <find>create_x_features</find>
-    <find>generate_x_and_y</find>
-    <find>create_syllables</find>
-    <find>split_consonants</find>
-    <find>get_unresonant_silent_consonants</find>
-    <find>dict_occurances_in_dataset_rate</find>
     <find>count_vowels</find>
     <find>shuffle_full_vowel_inputs</find>
     <find>generate_presentable_y</find>
@@ -179,6 +170,16 @@
     <find>_create_syllable_letters_translator</find>
     <find>_get_unresonant_silent_consonants</find>
     <find>el[0]</find>
+    <find>max_num_vowels</find>
+    <find>index</find>
+    <find>accentuated</find>
+    <find>create_syll</find>
+    <find>shuffle_all_inputs</find>
+    <find>accented</find>
+    <find>_accented</find>
+    <find>size</find>
+    <find>decode_x</find>
+    <find>self._input_type ==</find>
   </findStrings>
 </component>
 <component name="Git.Settings">
@@ -532,7 +533,7 @@
 </component>
 <component name="XDebuggerManager">
   <breakpoint-manager>
-    <option name="time" value="5" />
+    <option name="time" value="6" />
   </breakpoint-manager>
   <watches-manager />
 </component>
@@ -831,22 +832,25 @@
 </entry>
 <entry file="file://$PROJECT_DIR$/prepare_data.py">
   <provider selected="true" editor-type-id="text-editor">
-    <state relative-caret-position="568">
-      <caret line="482" column="74" lean-forward="true" selection-start-line="482" selection-start-column="74" selection-end-line="482" selection-end-column="74" />
+    <state relative-caret-position="244">
+      <caret line="252" column="38" lean-forward="true" selection-start-line="252" selection-start-column="38" selection-end-line="252" selection-end-column="38" />
       <folding>
         <element signature="e#24#63#0" expanded="true" />
-        <element signature="e#5524#5637#0" expanded="false" />
-        <element signature="e#5684#6970#0" expanded="false" />
-        <element signature="e#7131#8538#0" expanded="false" />
-        <element signature="e#8626#8921#0" expanded="false" />
-        <element signature="e#13363#13665#0" expanded="false" />
-        <element signature="e#13722#14551#0" expanded="false" />
-        <element signature="e#14615#14961#0" expanded="false" />
-        <element signature="e#16836#17749#0" expanded="false" />
-        <element signature="e#18179#18375#0" expanded="false" />
-        <element signature="e#18436#18627#0" expanded="false" />
-        <element signature="e#18694#19341#0" expanded="false" />
-        <element signature="e#19440#21738#0" expanded="false" />
+        <element signature="e#5658#5771#0" expanded="false" />
+        <element signature="e#5818#7106#0" expanded="false" />
+        <element signature="e#7267#8674#0" expanded="false" />
+        <element signature="e#8762#9057#0" expanded="false" />
+        <element signature="e#13496#13798#0" expanded="false" />
+        <element signature="e#13855#14684#0" expanded="false" />
+        <element signature="e#14748#15094#0" expanded="false" />
+        <element signature="e#16969#17882#0" expanded="false" />
+        <element signature="e#18312#18508#0" expanded="false" />
+        <element signature="e#18569#18760#0" expanded="false" />
+        <element signature="e#18827#19474#0" expanded="false" />
+        <element signature="e#19573#21871#0" expanded="false" />
+        <element signature="e#22137#22836#0" expanded="false" />
+        <element signature="e#29631#29772#0" expanded="false" />
+        <element signature="e#29922#32067#0" expanded="false" />
       </folding>
     </state>
   </provider>

prepare_data.py (136 changed lines)

@@ -11,13 +11,14 @@ import os.path

 class Data:
     def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
-                 additional_letter_attributes=True, reverse_inputs=True):
+                 additional_letter_attributes=True, reverse_inputs=True, accent_classification=False):
         self._input_type = input_type
         self._save_generated_data = save_generated_data
         self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
         self._shuffle_all_inputs = shuffle_all_inputs
         self._additional_letter_attributes = additional_letter_attributes
         self._reverse_inputs = reverse_inputs
+        self._accent_classification = accent_classification

         self.x_train = None
         self.x_other_features_train = None
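The refactor threads the new accent_classification flag through the constructor, so enabling the new mode is a one-argument change. A minimal usage sketch (the import path and argument values are assumptions based on this diff, not taken from the repository's notebooks):

# Hypothetical usage of the refactored constructor (sketch).
from prepare_data import Data

# accent_classification=False (the default) keeps the old accent-position output;
# True switches y to the accent-type class targets introduced by this commit.
data = Data('l', accent_classification=True)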
@ -30,14 +31,14 @@ class Data:
|
|||
self.y_validate = None
|
||||
|
||||
def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
|
||||
content_name='SlovarIJS_BESEDE_utf8.lex',
|
||||
force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
|
||||
content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
|
||||
inputs_location='../../internal_representations/inputs/', content_location='../../../data/'):
|
||||
content_path = '{}{}'.format(content_location, content_name)
|
||||
train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
|
||||
test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
|
||||
validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
|
||||
if os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
|
||||
if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
|
||||
print('LOADING DATA...')
|
||||
self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
|
||||
self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
|
||||
|
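The new force_override parameter lets callers rebuild the cached .h5 inputs even when all three files already exist on disk, instead of silently loading the stale ones. A hedged call sketch (the input names are illustrative):

# Regenerate inputs from the lexicon, skipping the cached-files shortcut:
data.generate_data('train_inputs', 'test_inputs', 'validate_inputs', force_override=True)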
@@ -62,7 +63,7 @@ class Data:
         print('CONTENT READ SUCCESSFULLY')
         print('CREATING DICTIONARY...')
         dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
-        if self._input_type == 's' or self._input_type == 'ls':
+        if self._input_type == 's' or self._input_type == 'sl':
             dictionary = self._create_syllables_dictionary(content, vowels)
         print('DICTIONARY CREATION SUCCESSFUL!')
         # test_and_validation_size = 0.1
@@ -125,7 +126,7 @@ class Data:
                 break
             line += 1
         dictionary_input = sorted(dictionary_input)
-        max_num_vowels += 1
+        # max_num_vowels += 1
         return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels

     # split content so that there is no overfitting
@@ -230,23 +231,22 @@ class Data:
                 word = word[::-1]

             j = 0
-            word_accentuations = []
+            # word_accentuations = []
             num_vowels = 0
             for c in list(word):
                 index = 0
+                if self._is_vowel(word, j, vowels):
+                    num_vowels += 1
                 for d in accentuated_vowels:
                     if c == d:
-                        word_accentuations.append(num_vowels)
+                        if not self._accent_classification:
+                            y[i][num_vowels] = 1
+                        else:
+                            y[i][num_vowels] = index
+                        # word_accentuations.append(num_vowels)
                         break
                     index += 1
-                if self._is_vowel(word, j, vowels):
-                    num_vowels += 1
                 j += 1
-            if len(word_accentuations) > 0:
-                for word_accentuation in word_accentuations:
-                    y[i][word_accentuation] = 1
-            else:
-                y[i][0] = 1
             i += 1
         return y

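Together with the commented-out max_num_vowels += 1 above, this drops the fallback slot that the old code reserved via y[i][0] = 1 for words with no accent, matching the commit message. A standalone sketch of the two output encodings the new branch produces (vector width and values are illustrative, not from the repository):

import numpy as np

max_num_vowels = 5                 # assumed output width
y = np.zeros((1, max_num_vowels))
num_vowels, index = 2, 3           # say the 2nd vowel carries accented-vowel index 3

accent_classification = False
if not accent_classification:
    y[0][num_vowels] = 1           # position target: which vowel is accented
else:
    y[0][num_vowels] = index       # class target: which accent type it carries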
@@ -255,10 +255,10 @@ class Data:
                           shuffle_vector_location):
         if self._input_type == 'l':
             x = self._x_letter_input(content, dictionary, max_word, vowels)
-        elif self._input_type == 's' or self._input_type == 'ls':
+        elif self._input_type == 's' or self._input_type == 'sl':
             x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels)
         else:
-            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'ls\'.')
+            raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.')
         y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels)

         print('CREATING OTHER FEATURES...')
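After this rename only 'l', 's' and 'sl' select a branch anywhere in the class. A quick reference sketch (variable name illustrative):

# Valid input_type values after this commit:
#   'l' = letters, 's' = syllables, 'sl' = syllabled letters
data_sl = Data('sl')  # the old spelling 'ls' would now fall through to the ValueError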
@@ -476,46 +476,112 @@ class Data:

     def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
         if self._input_type == 'l':
-            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size)
+            content = self._read_content(content_path)
+            dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
+            return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
         elif self._input_type == 's':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             eye = np.eye(len(syllable_dictionary), dtype=int)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
         elif self._input_type == 'sl':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             max_syllable = self._get_max_syllable(syllable_dictionary)
             syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)

     # generator for inputs for tracking of data fitting
-    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size):
+    def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
         size = orig_x.shape[0]
         while 1:
             loc = 0
-            while loc < size:
-                if loc + batch_size >= size:
-                    yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
-                else:
-                    yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
-                loc += batch_size
+            if self._accent_classification:
+                eye = np.eye(len(accented_vowels), dtype=int)
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
+                input_x_stack = []
+                input_x_other_features_stack = []
+                input_y_stack = []
+                while loc < size:
+                    while len(input_x_stack) < batch_size and loc < size:
+                        accent_loc = 0
+                        for accent in orig_y[loc]:
+                            if accent > 0:
+                                new_orig_x_additional = orig_x_additional[loc]
+                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
+                                input_x_stack.append(orig_x[loc])
+                                input_x_other_features_stack.append(new_orig_x_additional)
+                                input_y_stack.append(eye[int(accent)])
+                            accent_loc += 1
+                        loc += 1
+                    if len(input_x_stack) > batch_size:
+                        yield ([np.array(input_x_stack[:batch_size]),
+                                np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
+                        input_x_stack = input_x_stack[batch_size:]
+                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
+                        input_y_stack = input_y_stack[batch_size:]
+                    else:
+                        # print('BBB')
+                        # print(np.array(input_stack))
+                        # yield (np.array(input_stack))
+                        yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
+                        input_x_stack = []
+                        input_x_other_features_stack = []
+                        input_y_stack = []
+            else:
+                while loc < size:
+                    if loc + batch_size >= size:
+                        yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
+                    else:
+                        yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                    loc += batch_size

     # generator for inputs for tracking of data fitting
-    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator):
+    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
         size = orig_x.shape[0]
         while 1:
             loc = 0
-            while loc < size:
-                if loc + batch_size >= size:
-                    gen_orig_x = translator[orig_x[loc:size]]
-                    yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
-                else:
-                    gen_orig_x = translator[orig_x[loc:loc + batch_size]]
-                    yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
-                loc += batch_size
+            if self._accent_classification:
+                eye = np.eye(len(accented_vowels), dtype=int)
+                eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
+                input_x_stack = []
+                input_x_other_features_stack = []
+                input_y_stack = []
+                while loc < size:
+                    while len(input_x_stack) < batch_size and loc < size:
+                        accent_loc = 0
+                        for accent in orig_y[loc]:
+                            if accent > 0:
+                                new_orig_x_additional = orig_x_additional[loc]
+                                new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
+                                input_x_stack.append(orig_x[loc])
+                                input_x_other_features_stack.append(new_orig_x_additional)
+                                input_y_stack.append(eye[int(accent)])
+                            accent_loc += 1
+                        loc += 1
+                    if len(input_x_stack) > batch_size:
+                        gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
+                        yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
+                        input_x_stack = input_x_stack[batch_size:]
+                        input_x_other_features_stack = input_x_other_features_stack[batch_size:]
+                        input_y_stack = input_y_stack[batch_size:]
+                    else:
+                        gen_orig_x = translator[np.array(input_x_stack)]
+                        yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
+                        input_x_stack = []
+                        input_x_other_features_stack = []
+                        input_y_stack = []
+            else:
+                while loc < size:
+                    if loc + batch_size >= size:
+                        gen_orig_x = translator[orig_x[loc:size]]
+                        yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
+                    else:
+                        gen_orig_x = translator[orig_x[loc:loc + batch_size]]
+                        yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
+                    loc += batch_size

     def _get_max_syllable(self, syllable_dictionary):
         max_len = 0
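In accent-classification mode both generators expand each word into one sample per accented vowel: the vowel's position is appended to the other-features vector as a one-hot row, and the target becomes the accent-type one-hot. A self-contained sketch of that expansion (shapes and values are illustrative, not from the repository):

import numpy as np

orig_y = np.array([[0, 2, 0, 1]])           # one word: accent types 2 and 1 on vowels 1 and 3
orig_x_additional = np.array([[0.5, 0.5]])  # its other-features row
eye = np.eye(4, dtype=int)                  # accent-type one-hots (4 types assumed)
eye_input_accent = np.eye(orig_y.shape[1], dtype=int)

samples = []
for accent_loc, accent in enumerate(orig_y[0]):
    if accent > 0:
        # mark which vowel this sample asks about, then target its accent type
        features = np.concatenate((orig_x_additional[0], eye_input_accent[accent_loc]))
        samples.append((features, eye[int(accent)]))

print(len(samples))  # -> 2: one training sample per accented vowel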