diff --git a/.gitignore b/.gitignore
index 16a6d2e..b7f581f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,4 @@ new_sloleks.xml
 grid_results/
 .idea/
 cnn/word_accetuation/svm/data/
+data_merge.ipynb
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index dc1ebcc..57f20f2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
[IDE workspace state only (search history, open editors); hunks omitted]
diff --git a/prepare_data.py b/prepare_data.py
index 9132711..74a6a91 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -580,20 +580,20 @@ class Data:
     # generator for inputs for tracking of data fitting
     def generator(self, data_type, batch_size, x=None, x_other_features_validate=None, y_validate=None, content_name='SlovarIJS_BESEDE_utf8.lex',
-                  content_location='../../../data/'):
+                  content_location='../../../data/', oversampling=np.ones(13, dtype=int)):
         content_path = '{}{}'.format(content_location, content_name)
         if data_type == 'train':
-            return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path)
+            return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path, oversampling)
         elif data_type == 'test':
-            return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path)
+            return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path, oversampling)
         elif data_type == 'validate':
-            return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path)
+            return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path, oversampling)
         else:
            return self._generator_instance(x, x_other_features_validate, y_validate, batch_size)

     # if self._input_type
-    def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
+    def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path, oversampling):
         if self._input_type == 'l':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
@@ -603,14 +603,14 @@ class Data:
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             eye = np.eye(len(syllable_dictionary), dtype=int)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels, oversampling)
         elif self._input_type == 'sl':
             content = self._read_content(content_path)
             dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
             syllable_dictionary = self._create_syllables_dictionary(content, vowels)
             max_syllable = self._get_max_syllable(syllable_dictionary)
             syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
-            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
+            return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels, oversampling)

     # generator for inputs for tracking of data fitting
     def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
@@ -666,7 +666,7 @@ class Data:
             loc += batch_size

     # generator for inputs for tracking of data fitting
-    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
+    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling):
         size = orig_x.shape[0]
         while 1:
             loc = 0
@@ -683,9 +683,10 @@ class Data:
                     if accent > 0:
                         new_orig_x_additional = orig_x_additional[loc]
                         new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
-                        input_x_stack.append(orig_x[loc])
-                        input_x_other_features_stack.append(new_orig_x_additional)
-                        input_y_stack.append(eye[int(accent)])
+                        for i in range(oversampling[int(accent)]):
+                            input_x_stack.append(orig_x[loc])
+                            input_x_other_features_stack.append(new_orig_x_additional)
+                            input_y_stack.append(eye[int(accent)])
                     accent_loc += 1
                 loc += 1
             if len(input_x_stack) > batch_size:
diff --git a/run_multiple_files.py b/run_multiple_files.py
index 761ad79..677ac02 100644
--- a/run_multiple_files.py
+++ b/run_multiple_files.py
@@ -9,6 +9,6 @@
 #import cnn.accent_classification.letters.v3_0.workbench
 #import cnn.accent_classification.syllables.v2_0.workbench
 #import cnn.accent_classification.syllabled_letters.v2_0.workbench
-import cnn.accent_classification.letters.v3_1.workbench
-import cnn.accent_classification.syllables.v2_1.workbench
-import cnn.accent_classification.syllabled_letters.v2_1.workbench
+#import cnn.accent_classification.letters.v3_1.workbench
+import cnn.accent_classification.syllables.v2_2.workbench
+#import cnn.accent_classification.syllabled_letters.v2_1.workbench
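The prepare_data.py hunks above thread a per-class `oversampling` vector from `generator` through `_generator_instance` into `_syllable_generator`, where each stacked sample of accent class `c` is now appended `oversampling[c]` times. A minimal sketch of how a caller might build such a vector; the label array and the balancing heuristic are hypothetical, only the `generator(..., oversampling=...)` keyword comes from this diff:

```python
import numpy as np

# Hypothetical accent-class labels for a training set; in practice these
# would come from the stress-type targets that Data prepares.
y_accents = np.array([1, 1, 1, 2, 1, 5, 2, 1, 1, 2, 5, 12])

counts = np.bincount(y_accents, minlength=13)         # 13 classes, matching np.ones(13, dtype=int)
counts[counts == 0] = 1                               # unseen classes keep a factor of 1
oversampling = np.maximum(counts.max() // counts, 1)  # repeat rare classes toward the majority

# Each entry feeds range() inside _syllable_generator, so the vector must
# hold integers. Passing it in (call shape as in the diff above):
# train_gen = data.generator('train', 16, oversampling=oversampling)
```

With the all-ones default the generator behaves exactly as before, so existing callers are unaffected.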
diff --git a/sloleks_accetuation2.ipynb b/sloleks_accetuation2.ipynb
index 7ee70b1..cb0e7fd 100644
--- a/sloleks_accetuation2.ipynb
+++ b/sloleks_accetuation2.ipynb
@@ -201,9 +201,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 79,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing data from 0 onward.\n",
+      "Elapsed time: 46.20 minutes\n",
+      "Writing data from 100000 onward.\n",
+      "Elapsed time: 89.81 minutes\n",
+      "Writing data from 200000 onward.\n",
+      "Elapsed time: 134.45 minutes\n"
+     ]
+    },
+    {
+     "ename": "IndexError",
+     "evalue": "index 10 is out of bounds for axis 0 with size 10",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "IndexError                                Traceback (most recent call last)",
+      "<ipython-input> in <module>()\n     19     letter_type_model, syllable_type_model, syllabled_letter_type_model,\n     20     letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,\n---> 21     dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n     22 \n     23 res = ''\n",
+      "~/Developement/accetuation/prepare_data.py in accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, letter_type_model, syllable_type_model, syllabled_letter_type_model, letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n   1635     syllabled_letters_location_co_model,\n   1636     dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,\n-> 1637     syllable_dictionary)\n   1638     #print(predictions)\n   1639     if 'A' not in vowels:\n",
+      "~/Developement/accetuation/prepare_data.py in get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n   1465     letter_location_co_predictions = letter_location_co_model.predict_generator(generator, len(x) / (batch_size))\n   1466 \n-> 1467     letter_location_co_predictions = data.reverse_predictions(letter_location_co_predictions, input_words, vowels)\n   1468 \n   1469     data = Data('s', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)\n",
+      "~/Developement/accetuation/prepare_data.py in reverse_predictions(self, predictions, words, vowels)\n   1503 \n   1504     for k in range(word_len):\n-> 1505         new_predictions[i][k] += predictions[i][word_len - 1 - k]\n   1506 \n   1507     return new_predictions\n",
+      "IndexError: index 10 is out of bounds for axis 0 with size 10"
+     ]
+    }
+   ],
   "source": [
    "#Words proccesed: 650250\n",
    "#Word indeks: 50023\n",
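The run recorded above is the failure worth acting on: the export loop gets past 200000 words and then dies in `reverse_predictions`. Per the traceback, each row of `predictions` has one slot per vowel up to a fixed maximum (10 here), while the offending word reports a `word_len` of 11, so the mirrored index `word_len - 1 - k` overruns the row on its very first iteration (k = 0 reads slot 10 of a size-10 row). A minimal reproduction; the array shapes are inferred from the error message, not taken from the code:

```python
import numpy as np

max_num_vowels = 10                            # capacity of one prediction row
predictions = np.zeros((1, max_num_vowels))    # model output for a single word
new_predictions = np.zeros((1, max_num_vowels))

word_len = 11                                  # a word with one vowel too many slipped through
i = 0
try:
    for k in range(word_len):
        # at k == 0 this reads predictions[0][10], one past the last valid index
        new_predictions[i][k] += predictions[i][word_len - 1 - k]
except IndexError as e:
    print(e)  # index 10 is out of bounds for axis 0 with size 10
```

Either clamping `word_len` to `predictions.shape[1]` or filtering out words with more vowels than `max_num_vowels` before prediction would let the batch export finish; which is correct depends on how `word_len` is computed upstream.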