[MAJOR UPDATE] Added syllable neural networks
This commit is contained in:
parent
f0d263e429
commit
38797b37d4
|
@ -2,9 +2,9 @@
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
|
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
|
||||||
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/tex_hyphenation.py" />
|
|
||||||
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
|
||||||
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
|
||||||
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/results_presentation.ipynb" />
|
||||||
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
|
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
|
||||||
</list>
|
</list>
|
||||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||||
|
@ -35,8 +35,8 @@
|
||||||
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
|
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
|
||||||
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="284">
|
<state relative-caret-position="662">
|
||||||
<caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
|
<caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#24#63#0" expanded="true" />
|
<element signature="e#24#63#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
@ -150,7 +150,6 @@
|
||||||
<find>shuffle_inputs</find>
|
<find>shuffle_inputs</find>
|
||||||
<find>generator</find>
|
<find>generator</find>
|
||||||
<find>content, feature_dictionary</find>
|
<find>content, feature_dictionary</find>
|
||||||
<find>decode</find>
|
|
||||||
<find>create_feature_dictionary</find>
|
<find>create_feature_dictionary</find>
|
||||||
<find>with</find>
|
<find>with</find>
|
||||||
<find>read</find>
|
<find>read</find>
|
||||||
|
@ -160,6 +159,10 @@
|
||||||
<find>dictionary</find>
|
<find>dictionary</find>
|
||||||
<find>create_dict</find>
|
<find>create_dict</find>
|
||||||
<find>split_content</find>
|
<find>split_content</find>
|
||||||
|
<find>decode_position</find>
|
||||||
|
<find>'r'</find>
|
||||||
|
<find>decode</find>
|
||||||
|
<find>complete_feature_dict</find>
|
||||||
</findStrings>
|
</findStrings>
|
||||||
</component>
|
</component>
|
||||||
<component name="Git.Settings">
|
<component name="Git.Settings">
|
||||||
|
@ -187,7 +190,7 @@
|
||||||
<option name="x" value="65" />
|
<option name="x" value="65" />
|
||||||
<option name="y" value="24" />
|
<option name="y" value="24" />
|
||||||
<option name="width" value="1855" />
|
<option name="width" value="1855" />
|
||||||
<option name="height" value="1176" />
|
<option name="height" value="1056" />
|
||||||
</component>
|
</component>
|
||||||
<component name="ProjectView">
|
<component name="ProjectView">
|
||||||
<navigator currentView="ProjectPane" proportions="" version="1">
|
<navigator currentView="ProjectPane" proportions="" version="1">
|
||||||
|
@ -482,7 +485,7 @@
|
||||||
<servers />
|
<servers />
|
||||||
</component>
|
</component>
|
||||||
<component name="ToolWindowManager">
|
<component name="ToolWindowManager">
|
||||||
<frame x="65" y="24" width="1855" height="1176" extended-state="6" />
|
<frame x="65" y="24" width="1855" height="1056" extended-state="6" />
|
||||||
<editor active="true" />
|
<editor active="true" />
|
||||||
<layout>
|
<layout>
|
||||||
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
|
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
|
||||||
|
@ -878,8 +881,8 @@
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
<entry file="file://$PROJECT_DIR$/prepare_data.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="284">
|
<state relative-caret-position="662">
|
||||||
<caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
|
<caret line="1166" column="15" lean-forward="false" selection-start-line="1166" selection-start-column="15" selection-end-line="1166" selection-end-column="15" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#24#63#0" expanded="true" />
|
<element signature="e#24#63#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
|
216
prepare_data.py
216
prepare_data.py
|
@ -124,7 +124,7 @@ def read_content():
|
||||||
def is_vowel(word_list, position, vowels):
|
def is_vowel(word_list, position, vowels):
|
||||||
if word_list[position] in vowels:
|
if word_list[position] in vowels:
|
||||||
return True
|
return True
|
||||||
if word_list[position] == u'r' and (position - 1 < 0 or word_list[position - 1] not in vowels) and (position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
|
if (word_list[position] == u'r' or word_list[position] == u'R') and (position - 1 < 0 or word_list[position - 1] not in vowels) and (position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -310,7 +310,7 @@ def old_generate_full_matrix_inputs():
|
||||||
|
|
||||||
|
|
||||||
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
|
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
|
||||||
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location):
|
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True):
|
||||||
y = np.zeros((len(content), max_num_vowels))
|
y = np.zeros((len(content), max_num_vowels))
|
||||||
X = np.zeros((len(content), max_word, len(dictionary)))
|
X = np.zeros((len(content), max_word, len(dictionary)))
|
||||||
print('CREATING OTHER FEATURES...')
|
print('CREATING OTHER FEATURES...')
|
||||||
|
@ -350,10 +350,10 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
|
||||||
y[i][0] = 1
|
y[i][0] = 1
|
||||||
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||||||
i += 1
|
i += 1
|
||||||
|
if shuffle:
|
||||||
print('SHUFFELING INPUTS...')
|
print('SHUFFELING INPUTS...')
|
||||||
X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
|
X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
|
||||||
print('INPUTS SHUFFELED!')
|
print('INPUTS SHUFFELED!')
|
||||||
return X, X_other_features, y
|
return X, X_other_features, y
|
||||||
|
|
||||||
|
|
||||||
|
@ -484,6 +484,12 @@ def generate_X_and_y_RAM_efficient(name, split_number):
|
||||||
|
|
||||||
|
|
||||||
# metric for calculation of correct results
|
# metric for calculation of correct results
|
||||||
|
# test with:
|
||||||
|
# print(mean_pred(y_validate[pos], predictions[pos]).eval())
|
||||||
|
# print(mean_pred(np.array([[ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
|
||||||
|
# [ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]),
|
||||||
|
# np.array([[ 0., 0.51, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.],
|
||||||
|
# [ 0., 0.92, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.]])).eval())
|
||||||
def actual_accuracy(y_true, y_pred):
|
def actual_accuracy(y_true, y_pred):
|
||||||
return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0))
|
return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0))
|
||||||
|
|
||||||
|
@ -602,7 +608,17 @@ def decode_X_features(feature_dictionary, X_other_features):
|
||||||
return u''.join(final_word)
|
return u''.join(final_word)
|
||||||
|
|
||||||
|
|
||||||
def decode_position(y, max_num_vowels):
|
def decode_position(y):
|
||||||
|
i = 0
|
||||||
|
res = []
|
||||||
|
for el in y:
|
||||||
|
if el >= 0.5:
|
||||||
|
res.append(i)
|
||||||
|
i += 1
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def old_decode_position(y, max_num_vowels):
|
||||||
max_el = 0
|
max_el = 0
|
||||||
i = 0
|
i = 0
|
||||||
pos = -1
|
pos = -1
|
||||||
|
@ -785,7 +801,7 @@ def convert_to_MULTEXT_east_v4(old_features, feature_dictionary):
|
||||||
new_features[2] = '-'
|
new_features[2] = '-'
|
||||||
return new_features[:len(feature_dictionary[3]) - 1]
|
return new_features[:len(feature_dictionary[3]) - 1]
|
||||||
if old_features[0] == 'N':
|
if old_features[0] == 'N':
|
||||||
if len(old_features) > 5:
|
if len(old_features) >= 7:
|
||||||
new_features[5] = old_features[7]
|
new_features[5] = old_features[7]
|
||||||
return new_features[:len(feature_dictionary[4]) - 1]
|
return new_features[:len(feature_dictionary[4]) - 1]
|
||||||
if old_features[0] == 'P':
|
if old_features[0] == 'P':
|
||||||
|
@ -974,3 +990,187 @@ def dict_occurances_in_dataset_rate(content):
|
||||||
|
|
||||||
case_numbers = np.sum(X_other_features, axis=0)
|
case_numbers = np.sum(X_other_features, axis=0)
|
||||||
print(case_numbers)
|
print(case_numbers)
|
||||||
|
|
||||||
|
|
||||||
|
def get_voiced_consonants():
|
||||||
|
return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w']
|
||||||
|
|
||||||
|
def get_resonant_silent_consonants():
|
||||||
|
return ['b', 'd', 'z', 'ž', 'g']
|
||||||
|
|
||||||
|
def get_unresonant_silent_consonants():
|
||||||
|
return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c']
|
||||||
|
|
||||||
|
|
||||||
|
def split_consonants(consonants):
|
||||||
|
# def voiced_consonants():
|
||||||
|
# return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w']
|
||||||
|
#
|
||||||
|
# def resonant_silent_consonants():
|
||||||
|
# return ['b', 'd', 'z', 'ž', 'g']
|
||||||
|
#
|
||||||
|
# def unresonant_silent_consonants():
|
||||||
|
# return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c']
|
||||||
|
# test = get_voiced_consonants()
|
||||||
|
voiced_consonants = get_voiced_consonants()
|
||||||
|
resonant_silent_consonants = get_resonant_silent_consonants()
|
||||||
|
unresonant_silent_consonants = get_unresonant_silent_consonants()
|
||||||
|
if len(consonants) == 0:
|
||||||
|
return [''], ['']
|
||||||
|
elif len(consonants) == 1:
|
||||||
|
return [''], consonants
|
||||||
|
else:
|
||||||
|
split_options = []
|
||||||
|
for i in range(len(consonants)-1):
|
||||||
|
if consonants[i] == '-' or consonants[i] == '_':
|
||||||
|
split_options.append([i, -1])
|
||||||
|
elif consonants[i] == consonants[i+1]:
|
||||||
|
split_options.append([i, 0])
|
||||||
|
elif consonants[i] in voiced_consonants:
|
||||||
|
if consonants[i+1] in resonant_silent_consonants or consonants[i+1] in unresonant_silent_consonants:
|
||||||
|
split_options.append([i, 2])
|
||||||
|
elif consonants[i] in resonant_silent_consonants:
|
||||||
|
if consonants[i+1] in resonant_silent_consonants:
|
||||||
|
split_options.append([i, 1])
|
||||||
|
elif consonants[i+1] in unresonant_silent_consonants:
|
||||||
|
split_options.append([i, 3])
|
||||||
|
elif consonants[i] in unresonant_silent_consonants:
|
||||||
|
if consonants[i+1] in resonant_silent_consonants:
|
||||||
|
split_options.append([i, 4])
|
||||||
|
else:
|
||||||
|
print(consonants)
|
||||||
|
print('UNRECOGNIZED LETTERS!')
|
||||||
|
if split_options == []:
|
||||||
|
return [''], consonants
|
||||||
|
else:
|
||||||
|
split = min(split_options, key=lambda x:x[1])
|
||||||
|
return consonants[:split[0]+1], consonants[split[0]+1:]
|
||||||
|
# print(consonants)
|
||||||
|
return [''], ['']
|
||||||
|
|
||||||
|
|
||||||
|
def create_syllables(word, vowels):
|
||||||
|
word_list = list(word)
|
||||||
|
consonants = []
|
||||||
|
syllables = []
|
||||||
|
for i in range(len(word_list)):
|
||||||
|
if is_vowel(word_list, i, vowels):
|
||||||
|
if syllables == []:
|
||||||
|
consonants.append(word_list[i])
|
||||||
|
syllables.append(''.join(consonants))
|
||||||
|
else:
|
||||||
|
left_consonants, right_consonants = split_consonants(consonants)
|
||||||
|
syllables[-1] += ''.join(left_consonants)
|
||||||
|
right_consonants.append(word_list[i])
|
||||||
|
syllables.append(''.join(right_consonants))
|
||||||
|
consonants = []
|
||||||
|
else:
|
||||||
|
consonants.append(word_list[i])
|
||||||
|
if len(syllables) < 1:
|
||||||
|
return word
|
||||||
|
syllables[-1] += ''.join(consonants)
|
||||||
|
|
||||||
|
return syllables
|
||||||
|
|
||||||
|
|
||||||
|
def create_syllables_dictionary(content, vowels):
|
||||||
|
dictionary = []
|
||||||
|
for el in content:
|
||||||
|
syllables = create_syllables(el[0], vowels)
|
||||||
|
for syllable in syllables:
|
||||||
|
if syllable not in dictionary:
|
||||||
|
dictionary.append(syllable)
|
||||||
|
dictionary.append('')
|
||||||
|
return sorted(dictionary)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_syllable_inputs(content_shuffle_vector_location, shuffle_vector_location):
|
||||||
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||||||
|
train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location)
|
||||||
|
feature_dictionary = create_feature_dictionary()
|
||||||
|
print('CREATING SYLLABLE DICTIONARY...')
|
||||||
|
syllable_dictionary = create_syllables_dictionary(content, vowels)
|
||||||
|
print('CREATION SUCCESSFUL!')
|
||||||
|
|
||||||
|
# Generate X and y
|
||||||
|
print('GENERATING X AND y...')
|
||||||
|
X_train, X_other_features_train, y_train = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_train.h5')
|
||||||
|
X_test, X_other_features_test, y_test = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, test_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_test.h5')
|
||||||
|
X_validate, X_other_features_validate, y_validate = generate_syllable_X_and_y(syllable_dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_validate.h5')
|
||||||
|
print('GENERATION SUCCESSFUL!')
|
||||||
|
return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate
|
||||||
|
|
||||||
|
|
||||||
|
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
|
||||||
|
def generate_syllable_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location, shuffle=True):
|
||||||
|
y = np.zeros((len(content), max_num_vowels))
|
||||||
|
X = np.zeros((len(content), max_num_vowels), dtype=int)
|
||||||
|
# X = []
|
||||||
|
print('CREATING OTHER FEATURES...')
|
||||||
|
X_other_features = create_X_features(content, feature_dictionary)
|
||||||
|
print('OTHER FEATURES CREATED!')
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
for el in content:
|
||||||
|
j = 0
|
||||||
|
syllables = create_syllables(el[0], vowels)
|
||||||
|
# X_el = [''] * max_num_vowels
|
||||||
|
for syllable in syllables:
|
||||||
|
|
||||||
|
index = dictionary.index(syllable)
|
||||||
|
X[i][j] = index
|
||||||
|
# X[i][j][index] = 1
|
||||||
|
j += 1
|
||||||
|
# X.append(X_el)
|
||||||
|
j = 0
|
||||||
|
word_accetuations = []
|
||||||
|
num_vowels = 0
|
||||||
|
for c in list(el[3]):
|
||||||
|
index = 0
|
||||||
|
if is_vowel(el[3], j, vowels):
|
||||||
|
num_vowels += 1
|
||||||
|
for d in accetuated_vowels:
|
||||||
|
if c == d:
|
||||||
|
word_accetuations.append(num_vowels)
|
||||||
|
break
|
||||||
|
index += 1
|
||||||
|
j += 1
|
||||||
|
if len(word_accetuations) > 0:
|
||||||
|
y_value = 1/len(word_accetuations)
|
||||||
|
for el in word_accetuations:
|
||||||
|
# y[i][el] = y_value
|
||||||
|
y[i][el] = 1
|
||||||
|
else:
|
||||||
|
y[i][0] = 1
|
||||||
|
# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||||||
|
i += 1
|
||||||
|
# print(len(X))
|
||||||
|
# print(X[0])
|
||||||
|
X = np.array(X)
|
||||||
|
# print(X.shape)
|
||||||
|
# print(X[0])
|
||||||
|
# print(len(X))
|
||||||
|
if shuffle:
|
||||||
|
print('SHUFFELING INPUTS...')
|
||||||
|
X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
|
||||||
|
print('INPUTS SHUFFELED!')
|
||||||
|
return X, X_other_features, y
|
||||||
|
|
||||||
|
|
||||||
|
# generator for inputs for tracking of data fitting
|
||||||
|
def generate_fake_epoch_syllables(orig_X, orig_X_additional, orig_y, batch_size, dictionary_size=5168):
|
||||||
|
size = orig_X.shape[0]
|
||||||
|
eye = np.eye(dictionary_size, dtype=int)
|
||||||
|
while 1:
|
||||||
|
loc = 0
|
||||||
|
while loc < size:
|
||||||
|
if loc + batch_size >= size:
|
||||||
|
# [eye[i] for i in range(size-loc)]
|
||||||
|
# gen_orig_X = eye[orig_X[loc:size]]
|
||||||
|
# gen_orig_X = [eye[i] for i in range(size-loc)]
|
||||||
|
gen_orig_X = eye[orig_X[loc:size]]
|
||||||
|
yield([gen_orig_X, orig_X_additional[loc:size]], orig_y[loc:size])
|
||||||
|
else:
|
||||||
|
gen_orig_X = eye[orig_X[loc:loc + batch_size]]
|
||||||
|
yield([gen_orig_X, orig_X_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
|
||||||
|
loc += batch_size
|
||||||
|
|
Loading…
Reference in New Issue
Block a user