365 lines
11 KiB
Python
365 lines
11 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
from __future__ import unicode_literals
|
||
|
# text in Western (Windows 1252)
|
||
|
|
||
|
import numpy as np
|
||
|
import h5py
|
||
|
import gc
|
||
|
|
||
|
def save_inputs(file_name, X, y):
|
||
|
h5f = h5py.File(file_name, 'w')
|
||
|
adict=dict(X=X, y=y)
|
||
|
for k,v in adict.items():
|
||
|
h5f.create_dataset(k,data=v)
|
||
|
h5f.close()
|
||
|
|
||
|
def create_and_save_inputs(file_name):
|
||
|
X, y, X_pure = generate_full_vowel_matrix_inputs()
|
||
|
h5f = h5py.File(file_name, 'w')
|
||
|
adict=dict(X=X, y=y, X_pure=X_pure)
|
||
|
for k,v in adict.items():
|
||
|
h5f.create_dataset(k,data=v)
|
||
|
h5f.close()
|
||
|
|
||
|
def load_inputs(file_name):
|
||
|
h5f = h5py.File(file_name,'r')
|
||
|
X = h5f['X'][:]
|
||
|
y = h5f['y'][:]
|
||
|
|
||
|
h5f.close()
|
||
|
return X, y
|
||
|
|
||
|
def save_model(model, file_name):
|
||
|
h5f = h5py.File(file_name, 'w')
|
||
|
adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2'])
|
||
|
for k,v in adict.items():
|
||
|
h5f.create_dataset(k,data=v)
|
||
|
|
||
|
h5f.close()
|
||
|
|
||
|
def load_model(file_name):
|
||
|
h5f = h5py.File(file_name,'r')
|
||
|
model = {}
|
||
|
W1.set_value(h5f['W1'][:])
|
||
|
b1.set_value(h5f['b1'][:])
|
||
|
W2.set_value(h5f['W2'][:])
|
||
|
b2.set_value(h5f['b2'][:])
|
||
|
h5f.close()
|
||
|
return model
|
||
|
|
||
|
def read_content():
|
||
|
print('READING CONTENT...')
|
||
|
with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f:
|
||
|
content = f.readlines()
|
||
|
print('CONTENT READ SUCCESSFULY')
|
||
|
return [x.decode('utf8').split('\t') for x in content]
|
||
|
|
||
|
|
||
|
def is_vowel(word_list, position, vowels):
|
||
|
if word_list[position] in vowels:
|
||
|
return True
|
||
|
if word_list[position] == u'r' and (position - 1 < 0 or word_list[position - 1] not in vowels) and (position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
def create_dict():
|
||
|
|
||
|
content = read_content()
|
||
|
|
||
|
print('CREATING DICTIONARY...')
|
||
|
|
||
|
# CREATE dictionary AND max_word
|
||
|
accetuated_vowels = [u'à', u'á', u'ä', u'é', u'ë', u'ì', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
|
||
|
default_vowels = [u'a', u'e', u'i', u'o', u'u']
|
||
|
vowels = []
|
||
|
vowels.extend(accetuated_vowels)
|
||
|
vowels.extend(default_vowels)
|
||
|
|
||
|
dictionary = ['']
|
||
|
line = 0
|
||
|
max_word = 0
|
||
|
# ADD 'EMPTY' VOWEL
|
||
|
max_num_vowels = 0
|
||
|
for el in content:
|
||
|
num_vowels = 0
|
||
|
i = 0
|
||
|
try:
|
||
|
if len(el[3]) > max_word:
|
||
|
max_word = len(el[3])
|
||
|
if len(el[0]) > max_word:
|
||
|
max_word = len(el[0])
|
||
|
for c in list(el[3]):
|
||
|
if is_vowel(list(el[3]), i, vowels):
|
||
|
num_vowels += 1
|
||
|
if c not in dictionary:
|
||
|
dictionary.append(c)
|
||
|
i += 1
|
||
|
for c in list(el[0]):
|
||
|
if c not in dictionary:
|
||
|
dictionary.append(c)
|
||
|
if num_vowels > max_num_vowels:
|
||
|
max_num_vowels = num_vowels
|
||
|
except Exception, e:
|
||
|
print line - 1
|
||
|
print el
|
||
|
break
|
||
|
line += 1
|
||
|
dictionary = sorted(dictionary)
|
||
|
max_num_vowels += 1
|
||
|
print('DICTIONARY CREATION SUCCESSFUL!')
|
||
|
return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
|
||
|
|
||
|
|
||
|
# GENERATE X and y
|
||
|
def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
|
||
|
while len(accetuations_list) < 2:
|
||
|
accetuations_list.append(0)
|
||
|
if len(accetuations_list) > 2:
|
||
|
accetuations_list = accetuations_list[:2]
|
||
|
accetuations_list = np.array(accetuations_list)
|
||
|
final_position = accetuations_list[0] + max_num_vowels * accetuations_list[1]
|
||
|
return final_position
|
||
|
|
||
|
def shuffle_inputs(X, y, X_pure):
|
||
|
s = np.arange(X.shape[0])
|
||
|
np.random.shuffle(s)
|
||
|
X = X[s]
|
||
|
y = y[s]
|
||
|
X_pure = X_pure[s]
|
||
|
return X, y, X_pure
|
||
|
|
||
|
def generate_inputs():
|
||
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||
|
|
||
|
print('GENERATING X AND y...')
|
||
|
X = np.zeros((len(content), max_word*len(dictionary)))
|
||
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
||
|
|
||
|
i = 0
|
||
|
for el in content:
|
||
|
j = 0
|
||
|
for c in list(el[0]):
|
||
|
index = 0
|
||
|
for d in dictionary:
|
||
|
if c == d:
|
||
|
X[i][index + j * max_word] = 1
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
j = 0
|
||
|
word_accetuations = []
|
||
|
num_vowels = 0
|
||
|
for c in list(el[3]):
|
||
|
index = 0
|
||
|
if is_vowel(el[3], j, vowels):
|
||
|
num_vowels += 1
|
||
|
for d in accetuated_vowels:
|
||
|
if c == d:
|
||
|
word_accetuations.append(num_vowels)
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||
|
i += 1
|
||
|
print('GENERATION SUCCESSFUL!')
|
||
|
print('SHUFFELING INPUTS...')
|
||
|
X, y = shuffle_inputs(X, y)
|
||
|
print('INPUTS SHUFFELED!')
|
||
|
return X, y
|
||
|
|
||
|
|
||
|
def generate_matrix_inputs():
|
||
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||
|
|
||
|
print('GENERATING X AND y...')
|
||
|
# X = np.zeros((len(content), max_word*len(dictionary)))
|
||
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
||
|
|
||
|
X = []
|
||
|
|
||
|
i = 0
|
||
|
for el in content:
|
||
|
# j = 0
|
||
|
word = []
|
||
|
for c in list(el[0]):
|
||
|
index = 0
|
||
|
character = np.zeros(len(dictionary))
|
||
|
for d in dictionary:
|
||
|
if c == d:
|
||
|
# X[i][index + j * max_word] = 1
|
||
|
character[index] = 1
|
||
|
break
|
||
|
index += 1
|
||
|
word.append(character)
|
||
|
# j += 1
|
||
|
j = 0
|
||
|
X.append(word)
|
||
|
word_accetuations = []
|
||
|
num_vowels = 0
|
||
|
for c in list(el[3]):
|
||
|
index = 0
|
||
|
if is_vowel(el[3], j, vowels):
|
||
|
num_vowels += 1
|
||
|
for d in accetuated_vowels:
|
||
|
if c == d:
|
||
|
word_accetuations.append(num_vowels)
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||
|
i += 1
|
||
|
X = np.array(X)
|
||
|
print('GENERATION SUCCESSFUL!')
|
||
|
print('SHUFFELING INPUTS...')
|
||
|
X, y = shuffle_inputs(X, y)
|
||
|
print('INPUTS SHUFFELED!')
|
||
|
return X, y
|
||
|
|
||
|
|
||
|
def generate_full_matrix_inputs():
|
||
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||
|
|
||
|
print('GENERATING X AND y...')
|
||
|
# X = np.zeros((len(content), max_word*len(dictionary)))
|
||
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
||
|
X = np.zeros((len(content), max_word, len(dictionary)))
|
||
|
|
||
|
i = 0
|
||
|
for el in content:
|
||
|
j = 0
|
||
|
# word = []
|
||
|
for c in list(el[0]):
|
||
|
index = 0
|
||
|
# character = np.zeros(len(dictionary))
|
||
|
for d in dictionary:
|
||
|
if c == d:
|
||
|
X[i][j][index] = 1
|
||
|
# character[index] = 1
|
||
|
break
|
||
|
index += 1
|
||
|
# word.append(character)
|
||
|
j += 1
|
||
|
j = 0
|
||
|
# X.append(word)
|
||
|
word_accetuations = []
|
||
|
num_vowels = 0
|
||
|
for c in list(el[3]):
|
||
|
index = 0
|
||
|
if is_vowel(el[3], j, vowels):
|
||
|
num_vowels += 1
|
||
|
for d in accetuated_vowels:
|
||
|
if c == d:
|
||
|
word_accetuations.append(num_vowels)
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||
|
i += 1
|
||
|
# X = np.array(X)
|
||
|
print('GENERATION SUCCESSFUL!')
|
||
|
print('SHUFFELING INPUTS...')
|
||
|
X, y = shuffle_inputs(X, y)
|
||
|
print('INPUTS SHUFFELED!')
|
||
|
return X, y
|
||
|
|
||
|
def count_vowels(content, vowels):
|
||
|
num_all_vowels = 0
|
||
|
for el in content:
|
||
|
for m in range(len(el[0])):
|
||
|
if is_vowel(list(el[0]), m, vowels):
|
||
|
num_all_vowels += 1
|
||
|
return num_all_vowels
|
||
|
|
||
|
|
||
|
def generate_full_vowel_matrix_inputs():
|
||
|
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
|
||
|
gc.collect()
|
||
|
# print (2018553 * max_word * len(dictionary) / (2**30.0))
|
||
|
print('GENERATING X AND y...')
|
||
|
# X = np.zeros((len(content), max_word*len(dictionary)))
|
||
|
y = np.zeros((len(content), max_num_vowels * max_num_vowels ))
|
||
|
# X = np.zeros((2018553, max_word, len(dictionary)))
|
||
|
X_pure = []
|
||
|
X = []
|
||
|
|
||
|
i = 0
|
||
|
for el in content:
|
||
|
j = 0
|
||
|
# word = []
|
||
|
X_el = np.zeros((max_word, len(dictionary)))
|
||
|
for c in list(el[0]):
|
||
|
index = 0
|
||
|
# character = np.zeros(len(dictionary))
|
||
|
for d in dictionary:
|
||
|
if c == d:
|
||
|
X_el[j][index] = 1
|
||
|
# character[index] = 1
|
||
|
break
|
||
|
index += 1
|
||
|
# word.append(character)
|
||
|
j += 1
|
||
|
# for c in list(el[0]):
|
||
|
vowel_i = 0
|
||
|
for m in range(len(el[0])):
|
||
|
if is_vowel(list(el[0]), m, vowels):
|
||
|
X.append(X_el)
|
||
|
X_pure.append(vowel_i)
|
||
|
vowel_i += 1
|
||
|
j = 0
|
||
|
# X.append(word)
|
||
|
word_accetuations = []
|
||
|
num_vowels = 0
|
||
|
for c in list(el[3]):
|
||
|
index = 0
|
||
|
if is_vowel(el[3], j, vowels):
|
||
|
num_vowels += 1
|
||
|
for d in accetuated_vowels:
|
||
|
if c == d:
|
||
|
word_accetuations.append(num_vowels)
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
|
||
|
i += 1
|
||
|
# print(len(X))
|
||
|
# del X_pure
|
||
|
# del dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
|
||
|
|
||
|
X = np.array(X)
|
||
|
X_pure = np.array(X_pure)
|
||
|
print('GENERATION SUCCESSFUL!')
|
||
|
print('SHUFFELING INPUTS...')
|
||
|
X, y, X_pure = shuffle_inputs(X, y, X_pure)
|
||
|
print('INPUTS SHUFFELED!')
|
||
|
return X, y, X_pure
|
||
|
|
||
|
|
||
|
def decode_position(y, max_num_vowels):
|
||
|
max_el = 0
|
||
|
i = 0
|
||
|
pos = -1
|
||
|
for el in y:
|
||
|
if el > max_el:
|
||
|
max_el = el
|
||
|
pos = i
|
||
|
i += 1
|
||
|
return [pos % max_num_vowels, pos / max_num_vowels]
|
||
|
|
||
|
|
||
|
def decode_position_from_number(y, max_num_vowels):
|
||
|
return [y % max_num_vowels, y / max_num_vowels]
|
||
|
|
||
|
|
||
|
def generate_input_from_word(word, max_word, dictionary):
|
||
|
x = np.zeros(max_word*len(dictionary))
|
||
|
j = 0
|
||
|
for c in list(word):
|
||
|
index = 0
|
||
|
for d in dictionary:
|
||
|
if c == d:
|
||
|
x[index + j * max_word] = 1
|
||
|
break
|
||
|
index += 1
|
||
|
j += 1
|
||
|
return x
|