diff --git a/.gitignore b/.gitignore
index 16a6d2e..b7f581f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,4 @@ new_sloleks.xml
grid_results/
.idea/
cnn/word_accetuation/svm/data/
+data_merge.ipynb
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index dc1ebcc..57f20f2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
[hunk bodies unrecoverable: the XML element content of this IntelliJ workspace file was stripped during extraction, leaving bare -/+ markers and fragments of the IDE find/search history (prepare_data, accentuate_word, get_ensemble_location_predictions, feature_dictionary, _convert_to_multext_east_v4, _syllable_generator, ...). The hunks record editor state and search history only.]
diff --git a/prepare_data.py b/prepare_data.py
index 9132711..74a6a91 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -580,20 +580,20 @@ class Data:
# generator yielding input batches, used to track data fitting
def generator(self, data_type, batch_size, x=None, x_other_features_validate=None, y_validate=None, content_name='SlovarIJS_BESEDE_utf8.lex',
- content_location='../../../data/'):
+ content_location='../../../data/', oversampling=np.ones(13, dtype=int)):
content_path = '{}{}'.format(content_location, content_name)
if data_type == 'train':
- return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path)
+ return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path, oversampling)
elif data_type == 'test':
- return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path)
+ return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path, oversampling)
elif data_type == 'validate':
- return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path)
+ return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path, oversampling)
else:
return self._generator_instance(x, x_other_features_validate, y_validate, batch_size, content_path, oversampling)
# if self._input_type
- def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path):
+ def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path, oversampling):
if self._input_type == 'l':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
@@ -603,14 +603,14 @@ class Data:
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
eye = np.eye(len(syllable_dictionary), dtype=int)
- return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels)
+ return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels, oversampling)
elif self._input_type == 'sl':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
max_syllable = self._get_max_syllable(syllable_dictionary)
syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
- return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels)
+ return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels, oversampling)
# generator yielding input batches, used to track data fitting
def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
@@ -666,7 +666,7 @@ class Data:
loc += batch_size
# generator yielding input batches, used to track data fitting
- def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels):
+ def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling):
size = orig_x.shape[0]
while 1:
loc = 0
@@ -683,9 +683,10 @@ class Data:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
- input_x_stack.append(orig_x[loc])
- input_x_other_features_stack.append(new_orig_x_additional)
- input_y_stack.append(eye[int(accent)])
+ for i in range(int(oversampling[int(accent)])):
+ input_x_stack.append(orig_x[loc])
+ input_x_other_features_stack.append(new_orig_x_additional)
+ input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
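
Note: the prepare_data.py change above threads a per-class `oversampling` vector from `generator` through `_generator_instance` into `_syllable_generator`, where `oversampling[accent]` acts as a repeat count: each sample of accent class `accent` is pushed onto the batch that many times, a simple way to rebalance rare accent classes. A minimal standalone sketch of the same technique (the helper name `oversampled_batches` and its arguments are illustrative, not code from this repository):

import numpy as np

def oversampled_batches(X, labels, batch_size, oversampling):
    # oversampling[k] = how many copies of each class-k sample to emit
    eye = np.eye(len(oversampling), dtype=int)   # one-hot targets, as in _syllable_generator
    while True:                                  # endless, Keras-generator style
        x_stack, y_stack = [], []
        for i in range(X.shape[0]):
            k = int(labels[i])
            for _ in range(int(oversampling[k])):
                x_stack.append(X[i])
                y_stack.append(eye[k])
            while len(x_stack) >= batch_size:
                yield np.array(x_stack[:batch_size]), np.array(y_stack[:batch_size])
                x_stack, y_stack = x_stack[batch_size:], y_stack[batch_size:]

With oversampling=np.ones(13, dtype=int) the old one-copy-per-sample behaviour is reproduced; raising a single entry, e.g. oversampling[5] = 4, emits every class-5 sample four times per pass.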
diff --git a/run_multiple_files.py b/run_multiple_files.py
index 761ad79..677ac02 100644
--- a/run_multiple_files.py
+++ b/run_multiple_files.py
@@ -9,6 +9,6 @@
#import cnn.accent_classification.letters.v3_0.workbench
#import cnn.accent_classification.syllables.v2_0.workbench
#import cnn.accent_classification.syllabled_letters.v2_0.workbench
-import cnn.accent_classification.letters.v3_1.workbench
-import cnn.accent_classification.syllables.v2_1.workbench
-import cnn.accent_classification.syllabled_letters.v2_1.workbench
+#import cnn.accent_classification.letters.v3_1.workbench
+import cnn.accent_classification.syllables.v2_2.workbench
+#import cnn.accent_classification.syllabled_letters.v2_1.workbench
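
Note: run_multiple_files.py selects which experiments run purely by import side effect: each uncommented workbench module trains its model the moment it is imported, so the diff above switches the run from the v3_1/v2_1 workbenches to syllables v2_2 alone. An equivalent, slightly more explicit driver (a sketch using importlib, not code from this repository):

import importlib

# each workbench module executes its experiment at import time
for name in ['cnn.accent_classification.syllables.v2_2.workbench']:
    importlib.import_module(name)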
diff --git a/sloleks_accetuation2.ipynb b/sloleks_accetuation2.ipynb
index 7ee70b1..cb0e7fd 100644
--- a/sloleks_accetuation2.ipynb
+++ b/sloleks_accetuation2.ipynb
@@ -201,9 +201,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 79,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Writing data from 0 onward.\n",
+ "Elapsed time: 46.20 minutes\n",
+ "Writing data from 100000 onward.\n",
+ "Elapsed time: 89.81 minutes\n",
+ "Writing data from 200000 onward.\n",
+ "Elapsed time: 134.45 minutes\n"
+ ]
+ },
+ {
+ "ename": "IndexError",
+ "evalue": "index 10 is out of bounds for axis 0 with size 10",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mletter_type_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msyllable_type_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msyllabled_letter_type_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mletter_type_co_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msyllable_type_co_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msyllabled_letter_type_co_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/Developement/accetuation/prepare_data.py\u001b[0m in \u001b[0;36maccentuate_word\u001b[0;34m(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, letter_type_model, syllable_type_model, syllabled_letter_type_model, letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\u001b[0m\n\u001b[1;32m 1635\u001b[0m \u001b[0msyllabled_letters_location_co_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1636\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_word\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_num_vowels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvowels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccented_vowels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_dictionary\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1637\u001b[0;31m syllable_dictionary)\n\u001b[0m\u001b[1;32m 1638\u001b[0m \u001b[0;31m#print(predictions)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1639\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'A'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvowels\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/Developement/accetuation/prepare_data.py\u001b[0m in \u001b[0;36mget_ensemble_location_predictions\u001b[0;34m(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\u001b[0m\n\u001b[1;32m 1465\u001b[0m \u001b[0mletter_location_co_predictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_location_co_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1467\u001b[0;31m \u001b[0mletter_location_co_predictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse_predictions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mletter_location_co_predictions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_words\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvowels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1469\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m's'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle_all_inputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconvert_multext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreverse_inputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/Developement/accetuation/prepare_data.py\u001b[0m in \u001b[0;36mreverse_predictions\u001b[0;34m(self, predictions, words, vowels)\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1504\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1505\u001b[0;31m \u001b[0mnew_predictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword_len\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1506\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1507\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnew_predictions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mIndexError\u001b[0m: index 10 is out of bounds for axis 0 with size 10"
+ ]
+ }
+ ],
"source": [
"#Words proccesed: 650250\n",
"#Word indeks: 50023\n",