diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index ac93972..c634d16 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -4,6 +4,7 @@
+
@@ -34,8 +35,8 @@
-
-
+
+
@@ -132,6 +133,7 @@
load_extended_inputs
create_and_save_shuffle_vector
h5f
+ generate_inputs
@@ -171,6 +173,7 @@
+
@@ -218,7 +221,6 @@
-
@@ -951,8 +953,8 @@
-
-
+
+
diff --git a/prepare_data.py b/prepare_data.py
index b6ffb52..9104d56 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
import numpy as np
import h5py
import gc
+import math
import StringIO
import copy
@@ -75,7 +76,7 @@ def load_model(file_name):
def read_content():
print('READING CONTENT...')
- with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
+ with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
content = f.readlines()
print('CONTENT READ SUCCESSFULY')
return [x.decode('utf8').split('\t') for x in content]
@@ -159,97 +160,98 @@ def shuffle_inputs(X, y, X_pure):
X_pure = X_pure[s]
return X, y, X_pure
-def generate_inputs():
- dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
-
- print('GENERATING X AND y...')
- X = np.zeros((len(content), max_word*len(dictionary)))
- y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
-
- i = 0
- for el in content:
- j = 0
- for c in list(el[0]):
- index = 0
- for d in dictionary:
- if c == d:
- X[i][index + j * max_word] = 1
- break
- index += 1
- j += 1
- j = 0
- word_accetuations = []
- num_vowels = 0
- for c in list(el[3]):
- index = 0
- if is_vowel(el[3], j, vowels):
- num_vowels += 1
- for d in accetuated_vowels:
- if c == d:
- word_accetuations.append(num_vowels)
- break
- index += 1
- j += 1
- y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
- i += 1
- print('GENERATION SUCCESSFUL!')
- print('SHUFFELING INPUTS...')
- X, y = shuffle_inputs(X, y)
- print('INPUTS SHUFFELED!')
- return X, y
+# def generate_inputs():
+# dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+#
+# print('GENERATING X AND y...')
+# X = np.zeros((len(content), max_word*len(dictionary)))
+# y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
+#
+# i = 0
+# for el in content:
+# j = 0
+# for c in list(el[0]):
+# index = 0
+# for d in dictionary:
+# if c == d:
+# X[i][index + j * max_word] = 1
+# break
+# index += 1
+# j += 1
+# j = 0
+# word_accetuations = []
+# num_vowels = 0
+# for c in list(el[3]):
+# index = 0
+# if is_vowel(el[3], j, vowels):
+# num_vowels += 1
+# for d in accetuated_vowels:
+# if c == d:
+# word_accetuations.append(num_vowels)
+# break
+# index += 1
+# j += 1
+# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
+# i += 1
+# print('GENERATION SUCCESSFUL!')
+# print('SHUFFELING INPUTS...')
+# X, y = shuffle_inputs(X, y)
+# print('INPUTS SHUFFELED!')
+# return X, y
+#
+#
+# def generate_matrix_inputs():
+# dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+#
+# print('GENERATING X AND y...')
+# # X = np.zeros((len(content), max_word*len(dictionary)))
+# y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
+#
+# X = []
+#
+# i = 0
+# for el in content:
+# # j = 0
+# word = []
+# for c in list(el[0]):
+# index = 0
+# character = np.zeros(len(dictionary))
+# for d in dictionary:
+# if c == d:
+# # X[i][index + j * max_word] = 1
+# character[index] = 1
+# break
+# index += 1
+# word.append(character)
+# # j += 1
+# j = 0
+# X.append(word)
+# word_accetuations = []
+# num_vowels = 0
+# for c in list(el[3]):
+# index = 0
+# if is_vowel(el[3], j, vowels):
+# num_vowels += 1
+# for d in accetuated_vowels:
+# if c == d:
+# word_accetuations.append(num_vowels)
+# break
+# index += 1
+# j += 1
+# y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
+# i += 1
+# X = np.array(X)
+# print('GENERATION SUCCESSFUL!')
+# print('SHUFFELING INPUTS...')
+# X, y = shuffle_inputs(X, y)
+# print('INPUTS SHUFFELED!')
+# return X, y
-def generate_matrix_inputs():
+def generate_full_matrix_inputs():
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
-
- print('GENERATING X AND y...')
- # X = np.zeros((len(content), max_word*len(dictionary)))
- y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
- X = []
- i = 0
- for el in content:
- # j = 0
- word = []
- for c in list(el[0]):
- index = 0
- character = np.zeros(len(dictionary))
- for d in dictionary:
- if c == d:
- # X[i][index + j * max_word] = 1
- character[index] = 1
- break
- index += 1
- word.append(character)
- # j += 1
- j = 0
- X.append(word)
- word_accetuations = []
- num_vowels = 0
- for c in list(el[3]):
- index = 0
- if is_vowel(el[3], j, vowels):
- num_vowels += 1
- for d in accetuated_vowels:
- if c == d:
- word_accetuations.append(num_vowels)
- break
- index += 1
- j += 1
- y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
- i += 1
- X = np.array(X)
- print('GENERATION SUCCESSFUL!')
- print('SHUFFELING INPUTS...')
- X, y = shuffle_inputs(X, y)
- print('INPUTS SHUFFELED!')
- return X, y
-
-
-def generate_full_matrix_inputs():
- dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
-
print('GENERATING X AND y...')
# X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
@@ -553,3 +555,54 @@ def generate_input_from_word(word, max_word, dictionary):
index += 1
j += 1
return x
+
+def generate_input_per_vowel_from_word(word, max_word, dictionary, vowels):
+    """Build one input matrix per vowel of `word`.
+
+    The word is one-hot encoded into a (max_word, len(dictionary)) matrix:
+    row j gets a 1 in the column of the first `dictionary` entry equal to
+    the j-th character; a character that matches no entry leaves its row
+    all zeros.
+
+    The same encoded matrix is then emitted once for every position that
+    `is_vowel` accepts, paired with the 0-based ordinal of that vowel
+    within the word.
+
+    Returns:
+        (X, X_pure) as numpy arrays: X stacks the per-vowel copies of the
+        encoded word, X_pure holds the matching vowel ordinals.
+    """
+    encoded_word = np.zeros((max_word, len(dictionary)))
+    for row, char in enumerate(word):
+        for column, symbol in enumerate(dictionary):
+            if char == symbol:
+                encoded_word[row][column] = 1
+                # Only the first matching dictionary entry is marked.
+                break
+
+    # Count the positions is_vowel accepts; each one yields an input row.
+    vowel_count = sum(
+        1 for position in range(len(word))
+        if is_vowel(list(word), position, vowels)
+    )
+    X = [encoded_word] * vowel_count
+    X_pure = list(range(vowel_count))
+    return np.array(X), np.array(X_pure)
+
+def decode_position_from_vowel_to_final_number(y):
+    """Return the 1-based positions of the rows of `y` that are switched on.
+
+    A row counts as switched on when its first element is strictly greater
+    than 0.5; only element 0 of each row is inspected.
+    """
+    return [position for position, row in enumerate(y, 1) if row[0] > 0.5]
+
+
+def split_content(content, ratio):
+    """Randomly split `content` into training and validation entries.
+
+    Entries are grouped by a key: el[1], or el[0] when el[1] is '='.
+    The split is made over the unique keys, so all entries sharing a key
+    land on the same side. floor(len(unique keys) * ratio) keys go to the
+    validation side and the rest to the training side.
+
+    The selection uses np.random.shuffle, so the result varies between
+    calls unless the caller seeds numpy's global random state.
+
+    Returns:
+        (train_content, validate_content), each keeping the original
+        order of `content`.
+    """
+    entry_keys = [el[0] if el[1] == '=' else el[1] for el in content]
+    unique_keys = sorted(set(entry_keys))
+
+    # Give every unique key a random rank; ranks below the cut are validation.
+    ranks = np.arange(len(unique_keys))
+    np.random.shuffle(ranks)
+    cut = math.floor(len(unique_keys) * ratio)
+
+    train_keys = set()
+    validate_keys = set()
+    for key, rank in zip(unique_keys, ranks):
+        if rank >= cut:
+            train_keys.add(key)
+        if rank < cut:
+            validate_keys.add(key)
+
+    train_content = []
+    validate_content = []
+    for el, key in zip(content, entry_keys):
+        if key in train_keys:
+            train_content.append(el)
+        if key in validate_keys:
+            validate_content.append(el)
+    return train_content, validate_content
\ No newline at end of file