stress_asignment/prepare_data.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# text in Western (Windows 1252)
import numpy as np
import h5py
import math
import keras.backend as K
import os.path
from os import remove
import codecs
from copy import copy
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.models import load_model
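# Data prepares lexicon entries for stress-assignment models: it reads the
# lexicon, builds letter/syllable dictionaries, encodes words and their
# MULTEXT-East morphosyntactic features as model inputs, and provides batch
# generators plus evaluation and accentuation helpers for the Keras models.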
class Data:
def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True,
additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False,
convert_multext=True, bidirectional_basic_input=False, bidirectional_architectural_input=False):
self._input_type = input_type
self._save_generated_data = save_generated_data
self._allow_shuffle_vector_generation = allow_shuffle_vector_generation
self._shuffle_all_inputs = shuffle_all_inputs
self._additional_letter_attributes = additional_letter_attributes
self._reverse_inputs = reverse_inputs
self._accent_classification = accent_classification
self._number_of_syllables = number_of_syllables
self._convert_multext = convert_multext
self._bidirectional_basic_input = bidirectional_basic_input
self._bidirectional_architectural_input = bidirectional_architectural_input
self.x_train = None
# self.x2_train = None
self.x_other_features_train = None
self.y_train = None
self.x_test = None
# self.x2_test = None
self.x_other_features_test = None
self.y_test = None
self.x_validate = None
# self.x2_validate = None
self.x_other_features_validate = None
self.y_validate = None
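# generate_data either loads previously generated train/test/validate inputs
# from HDF5 files or generates them from the lexicon; test_set merges the test
# split into training (validation becomes the test set), while complete_set
# merges all three splits into the training set.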
def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1,
force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex',
content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector',
inputs_location='../../internal_representations/inputs/', content_location='../../../data/',
test_set=False, complete_set=False):
content_path = '{}{}'.format(content_location, content_name)
train_path = '{}{}.h5'.format(inputs_location, train_inputs_name)
test_path = '{}{}.h5'.format(inputs_location, test_inputs_name)
validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name)
if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path):
print('LOADING DATA...')
self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path)
self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path)
self.x_validate, self.x_other_features_validate, self.y_validate = self._load_inputs(validate_path)
print('LOAD SUCCESSFUL!')
else:
content_shuffle_vector_path = '{}{}.h5'.format(inputs_location, content_shuffle_vector)
shuffle_vector_path = '{}{}'.format(inputs_location, shuffle_vector)
# actual generation of inputs
self._generate_inputs(content_path, content_shuffle_vector_path, shuffle_vector_path, test_and_validation_size, train_path, test_path,
validate_path)
if test_set:
self.x_train = np.concatenate((self.x_train, self.x_test), axis=0)
self.x_other_features_train = np.concatenate((self.x_other_features_train, self.x_other_features_test), axis=0)
self.y_train = np.concatenate((self.y_train, self.y_test), axis=0)
self.x_test = self.x_validate
self.x_other_features_test = self.x_other_features_validate
self.y_test = self.y_validate
if complete_set:
self.x_train = np.concatenate((self.x_train, self.x_test, self.x_validate), axis=0)
self.x_other_features_train = np.concatenate((self.x_other_features_train, self.x_other_features_test, self.x_other_features_validate),
axis=0)
self.y_train = np.concatenate((self.y_train, self.y_test, self.y_validate), axis=0)
self.x_test = self.x_validate
self.x_other_features_test = self.x_other_features_validate
self.y_test = self.y_validate
def _generate_inputs(self, content_location, content_shuffle_vector_location, shuffle_vector_location, test_and_validation_size, train_path,
test_path, validate_path):
print('READING CONTENT...')
content = self._read_content(content_location)
print('CONTENT READ SUCCESSFULLY')
print('CREATING DICTIONARY...')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
if self._input_type == 's' or self._input_type == 'sl':
dictionary = self._create_syllables_dictionary(content, vowels)
print('DICTIONARY CREATION SUCCESSFUL!')
# test_and_validation_size = 0.1
train_content, test_content, validate_content = self._split_content(content, test_and_validation_size, content_shuffle_vector_location)
feature_dictionary = self._create_feature_dictionary()
# Generate X and y
print('GENERATING X AND y...')
self.x_train, self.x_other_features_train, self.y_train = self._generate_x_and_y(dictionary, max_word, max_num_vowels, train_content, vowels,
accented_vowels,
feature_dictionary, shuffle_vector_location + '_train.h5')
self.x_test, self.x_other_features_test, self.y_test = self._generate_x_and_y(dictionary, max_word, max_num_vowels, test_content, vowels,
accented_vowels,
feature_dictionary, shuffle_vector_location + '_test.h5')
self.x_validate, self.x_other_features_validate, self.y_validate = self._generate_x_and_y(dictionary, max_word, max_num_vowels,
validate_content, vowels,
accented_vowels, feature_dictionary,
shuffle_vector_location + '_validate.h5')
print('GENERATION SUCCESSFUL!')
# save inputs
if self._save_generated_data:
self._save_inputs(train_path, self.x_train, self.x_other_features_train, self.y_train)
self._save_inputs(test_path, self.x_test, self.x_other_features_test, self.y_test)
self._save_inputs(validate_path, self.x_validate, self.x_other_features_validate, self.y_validate)
# return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate
# functions for creating X and y from content
@staticmethod
def _read_content(content_path):
# with open(content_path) as f:
with codecs.open(content_path, encoding='utf8') as f:
content = f.readlines()
return [x.split('\t') for x in content]
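# _create_dict scans the lexicon to build the letter dictionary and to find
# the longest word and the maximum number of vowels (syllables) in any word.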
def _create_dict(self, content):
# CREATE dictionary AND max_word
accented_vowels = self._get_accented_vowels()
unaccented_vowels = self._get_unaccented_vowels()
vowels = []
vowels.extend(accented_vowels)
vowels.extend(unaccented_vowels)
dictionary_input = ['']
line = 0
max_word = 0
# ADD 'EMPTY' VOWEL
max_num_vowels = 0
for el in content:
num_vowels = 0
try:
if len(el[3]) > max_word:
max_word = len(el[3])
if len(el[0]) > max_word:
max_word = len(el[0])
for i in range(len(el[3])):
if self._is_vowel(list(el[3]), i, vowels):
num_vowels += 1
for c in list(el[0]):
if c not in dictionary_input:
dictionary_input.append(c)
if num_vowels > max_num_vowels:
max_num_vowels = num_vowels
except Exception:
print(line - 1)
print(el)
break
line += 1
dictionary_input = sorted(dictionary_input)
# max_num_vowels += 1
return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels
# split content so that there is no overfitting
def _split_content(self, content, test_and_validation_ratio, content_shuffle_vector_location):
expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
# print(len(content))
unique_content = sorted(set(expanded_content))
s = self._load_shuffle_vector(content_shuffle_vector_location, len(unique_content))
test_num = math.floor(len(unique_content) * (test_and_validation_ratio * 2))
validation_num = math.floor(test_num * 0.5)
shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= test_num]
shuffled_unique_train_content_set = set(shuffled_unique_train_content)
shuffled_unique_test_content = [unique_content[i] for i in range(len(s)) if test_num > s[i] >= validation_num]
shuffled_unique_test_content_set = set(shuffled_unique_test_content)
shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < validation_num]
shuffled_unique_validate_content_set = set(shuffled_unique_validate_content)
train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set]
test_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_test_content_set]
validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set]
return train_content, test_content, validate_content
@staticmethod
def _create_and_save_shuffle_vector(file_name, length):
shuffle_vector = np.arange(length)
np.random.shuffle(shuffle_vector)
h5f = h5py.File(file_name, 'w')
adict = dict(shuffle_vector=shuffle_vector)
for k, v in adict.items():
h5f.create_dataset(k, data=v)
h5f.close()
return shuffle_vector
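# _x_letter_input one-hot encodes each word letter by letter; when
# additional_letter_attributes is set, six extra flags mark vowels and
# consonant classes, and bidirectional_basic_input stores the word aligned
# from the opposite end in the second half of the matrix.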
def _x_letter_input(self, content, dictionary, max_word, vowels, shuffle_vector_location):
if self._additional_letter_attributes:
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int)
voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
# print('HERE!!!')
else:
# print('HERE!!!')
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_word, len(dictionary)), dtype=int)
else:
x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int)
if self._shuffle_all_inputs:
s = self._load_shuffle_vector(shuffle_vector_location, len(content))
else:
s = None
# i = 0
for i in range(len(content)):
if self._shuffle_all_inputs:
mod_i = s[i]
else:
mod_i = i
word = content[mod_i][0]
if self._reverse_inputs:
word = word[::-1]
j = 0
for c in list(word):
if j >= max_word:
continue
index = 0
if self._bidirectional_basic_input:
j2 = max_word + (len(word) - j - 1)
for d in dictionary:
if c == d:
x[i][j][index] = 1
if self._bidirectional_basic_input:
x[i][j2][index] = 1
break
index += 1
if self._additional_letter_attributes:
if self._is_vowel(word, j, vowels):
x[i][j][len(dictionary)] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary)] = 1
else:
x[i][j][len(dictionary) + 1] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 1] = 1
if c in voiced_consonants:
x[i][j][len(dictionary) + 2] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 2] = 1
else:
x[i][j][len(dictionary) + 3] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 3] = 1
if c in resonant_silent_consonants:
x[i][j][len(dictionary) + 4] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 4] = 1
elif c in nonresonant_silent_consonants:
x[i][j][len(dictionary) + 5] = 1
if self._bidirectional_basic_input:
x[i][j2][len(dictionary) + 5] = 1
j += 1
#i += 1
return x
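# _x_syllable_input encodes each word as a sequence of syllable-dictionary
# indices (unknown syllables map to index 0); with bidirectional_basic_input
# the word aligned from the opposite end is stored in the second half of the row.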
def _x_syllable_input(self, content, dictionary, max_num_vowels, vowels, shuffle_vector_location):
if not self._bidirectional_basic_input:
x = np.zeros((len(content), max_num_vowels), dtype=int)
else:
x = np.zeros((len(content), 2 * max_num_vowels), dtype=int)
if self._shuffle_all_inputs:
s = self._load_shuffle_vector(shuffle_vector_location, len(content))
else:
s = None
for i in range(len(content)):
if self._shuffle_all_inputs:
mod_i = s[i]
else:
mod_i = i
j = 0
syllables = self._create_syllables(content[mod_i][0], vowels)
if self._reverse_inputs:
syllables = syllables[::-1]
for syllable in syllables:
if j >= max_num_vowels:
continue
if syllable in dictionary:
x[i][j] = dictionary.index(syllable)
if self._bidirectional_basic_input:
x[i][max_num_vowels + (len(syllables) - j - 1)] = dictionary.index(syllable)
else:
x[i][j] = 0
j += 1
#i += 1
return x
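# _y_output marks stressed vowels: each output position corresponds to one
# vowel of the word, holding 1 (or the accent-type index when
# accent_classification is on) if the accented form in column 3 carries
# stress on that vowel.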
def _y_output(self, content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location):
y = np.zeros((len(content), max_num_vowels))
i = 0
if self._shuffle_all_inputs:
s = self._load_shuffle_vector(shuffle_vector_location, len(content))
else:
s = None
for i in range(len(content)):
if self._shuffle_all_inputs:
mod_i = s[i]
else:
mod_i = i
el = content[mod_i]
word = el[3]
if self._reverse_inputs:
word = word[::-1]
j = 0
# word_accentuations = []
num_vowels = 0
for c in list(word):
index = 0
for d in accentuated_vowels:
if c == d:
if not self._accent_classification:
y[i][num_vowels] = 1
else:
y[i][num_vowels] = index
# word_accentuations.append(num_vowels)
break
index += 1
if self._is_vowel(word, j, vowels):
num_vowels += 1
j += 1
return y
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
def _generate_x_and_y(self, dictionary, max_word, max_num_vowels, content, vowels, accentuated_vowels, feature_dictionary,
shuffle_vector_location):
if self._input_type == 'l':
x = self._x_letter_input(content, dictionary, max_word, vowels, shuffle_vector_location)
elif self._input_type == 's' or self._input_type == 'sl':
x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels, shuffle_vector_location)
else:
raise ValueError('Unknown input_type. It must be \'l\', \'s\' or \'sl\'.')
y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location)
# print('CREATING OTHER FEATURES...')
x_other_features = self._create_x_features(content, feature_dictionary, vowels, shuffle_vector_location)
# print('OTHER FEATURES CREATED!')
if self._shuffle_all_inputs:
print('SHUFFLING INPUTS...')
#x, x_other_features, y = self._shuffle_inputs(x, x_other_features, y, shuffle_vector_location)
print('INPUTS SHUFFLED!')
return x, x_other_features, y
def _create_syllables_dictionary(self, content, vowels):
dictionary = []
for el in content:
syllables = self._create_syllables(el[0], vowels)
for syllable in syllables:
if syllable not in dictionary:
dictionary.append(syllable)
dictionary.append('')
return sorted(dictionary)
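# _create_syllables splits a word into syllables; consonant clusters between
# vowels are divided by _split_consonants, and a word without any vowel is
# returned unchanged (as a string rather than a list).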
def _create_syllables(self, word, vowels):
word_list = list(word)
consonants = []
syllables = []
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if syllables == []:
consonants.append(word_list[i])
syllables.append(''.join(consonants))
else:
left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower()))
syllables[-1] += ''.join(left_consonants)
right_consonants.append(word_list[i])
syllables.append(''.join(right_consonants))
consonants = []
else:
consonants.append(word_list[i])
if len(syllables) < 1:
return word
syllables[-1] += ''.join(consonants)
return syllables
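# _is_vowel also treats 'r'/'R' as syllabic (vowel-like) when it has no
# neighbouring vowel, as in Slovene words such as 'prst'.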
def _is_vowel(self, word_list, position, vowels):
if word_list[position] in vowels:
return True
if (word_list[position] == u'r' or word_list[position] == u'R') and (position - 1 < 0 or word_list[position - 1] not in vowels) and (
position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
return True
return False
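# _split_consonants decides where a consonant cluster between two vowels is
# divided: candidate split points are ranked by what looks like a
# sonority-style preference (hyphens, doubled letters, voiced vs.
# resonant/non-resonant silent consonants) and the lowest-ranked option wins;
# with no candidates the whole cluster goes to the following syllable.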
def _split_consonants(self, consonants):
voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants()
unresonant_silent_consonants = self._get_nonresonant_silent_consonants()
if len(consonants) == 0:
return [''], ['']
elif len(consonants) == 1:
return [''], consonants
else:
split_options = []
for i in range(len(consonants) - 1):
if consonants[i] == '-' or consonants[i] == '_':
split_options.append([i, -1])
elif consonants[i] == consonants[i + 1]:
split_options.append([i, 0])
elif consonants[i] in voiced_consonants:
if consonants[i + 1] in resonant_silent_consonants or consonants[i + 1] in unresonant_silent_consonants:
split_options.append([i, 2])
elif consonants[i] in resonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 1])
elif consonants[i + 1] in unresonant_silent_consonants:
split_options.append([i, 3])
elif consonants[i] in unresonant_silent_consonants:
if consonants[i + 1] in resonant_silent_consonants:
split_options.append([i, 4])
if split_options == []:
return [''], consonants
else:
split = min(split_options, key=lambda x: x[1])
return consonants[:split[0] + 1], consonants[split[0] + 1:]
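# _create_x_features one-hot encodes the morphosyntactic tag of each entry
# against feature_dictionary (optionally converting older MULTEXT-East tags
# to the v4 scheme first) and can append the vowel count as an extra feature.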
def _create_x_features(self, content, feature_dictionary, vowels, shuffle_vector_location):
x_other_features = []
if self._shuffle_all_inputs:
s = self._load_shuffle_vector(shuffle_vector_location, len(content))
else:
s = None
for index in range(len(content)):
if self._shuffle_all_inputs:
mod_i = s[index]
else:
mod_i = index
el = content[mod_i]
x_el_other_features = []
if self._convert_multext:
converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary))
else:
converted_el = el[2]
for feature in feature_dictionary:
if converted_el[0] == feature[1]:
x_el_other_features.append(1)
for i in range(2, len(feature)):
for j in range(len(feature[i])):
if i - 1 < len(converted_el) and feature[i][j] == converted_el[i - 1]:
x_el_other_features.append(1)
else:
x_el_other_features.append(0)
else:
x_el_other_features.extend([0] * feature[0])
if self._number_of_syllables:
list_of_letters = list(el[0])
num_of_vowels = 0
for i in range(len(list_of_letters)):
if self._is_vowel(list(el[0]), i, vowels):
num_of_vowels += 1
x_el_other_features.append(num_of_vowels)
x_other_features.append(x_el_other_features)
return np.array(x_other_features)
def _shuffle_inputs(self, x, x_other_features, y, shuffle_vector_location):
s = self._load_shuffle_vector(shuffle_vector_location, x.shape[0])
x = x[s]
y = y[s]
x_other_features = x_other_features[s]
return x, x_other_features, y
# functions for saving, loading and shuffling whole arrays to ram
@staticmethod
def _save_inputs(file_name, x, x_other_features, y):
h5f = h5py.File(file_name, 'w')
a_dict = dict(X=x, X_other_features=x_other_features, y=y)
for k, v in a_dict.items():
h5f.create_dataset(k, data=v)
h5f.close()
@staticmethod
def _load_inputs(file_name):
h5f = h5py.File(file_name, 'r')
x = h5f['X'][:]
y = h5f['y'][:]
x_other_features = h5f['X_other_features'][:]
h5f.close()
return x, x_other_features, y
def _load_shuffle_vector(self, file_path, length=0):
if os.path.exists(file_path):
h5f = h5py.File(file_path, 'r')
shuffle_vector = h5f['shuffle_vector'][:]
h5f.close()
else:
if self._allow_shuffle_vector_generation:
shuffle_vector = self._create_and_save_shuffle_vector(file_path, length)
else:
raise ValueError('Shuffle vector on path \'{}\' does not exist! Either generate a new vector (by initializing a new Data object '
'with parameter allow_shuffle_vector_generation=True) or paste one that is already generated!'.format(file_path))
return shuffle_vector
@staticmethod
def _convert_to_multext_east_v4(old_features, feature_dictionary):
new_features = ['-'] * 9
new_features[:len(old_features)] = old_features
if old_features[0] == 'A':
if old_features[1] == 'f' or old_features[1] == 'o':
new_features[1] = 'g'
return new_features[:len(feature_dictionary[0]) - 1]
if old_features[0] == 'C':
return new_features[:len(feature_dictionary[1]) - 1]
if old_features[0] == 'I':
return new_features[:len(feature_dictionary[2]) - 1]
if old_features[0] == 'M':
new_features[2:6] = old_features[1:5]
new_features[1] = old_features[5]
if new_features[2] == 'm':
new_features[2] = '-'
return new_features[:len(feature_dictionary[3]) - 1]
if old_features[0] == 'N':
if len(old_features) >= 7:
new_features[5] = old_features[7]
return new_features[:len(feature_dictionary[4]) - 1]
if old_features[0] == 'P':
if new_features[8] == 'n':
new_features[8] = 'b'
return new_features[:len(feature_dictionary[5]) - 1]
if old_features[0] == 'Q':
return new_features[:len(feature_dictionary[6]) - 1]
if old_features[0] == 'R':
return new_features[:len(feature_dictionary[7]) - 1]
if old_features[0] == 'S':
if len(old_features) == 4:
new_features[1] = old_features[3]
else:
new_features[1] = '-'
return new_features[:len(feature_dictionary[8]) - 1]
if old_features[0] == 'V':
if old_features[1] == 'o' or old_features[1] == 'c':
new_features[1] = 'm'
new_features[3] = old_features[2]
new_features[2] = '-'
if old_features[2] == 'i':
new_features[3] = 'r'
if len(old_features) > 3 and old_features[3] == 'p':
new_features[3] = 'r'
elif len(old_features) > 3 and old_features[3] == 'f':
new_features[3] = 'f'
if len(old_features) >= 9:
new_features[7] = old_features[8]
else:
new_features[7] = '-'
return new_features[:len(feature_dictionary[9]) - 1]
return ''
# generator for inputs for tracking of data fitting
def generator(self, data_type, batch_size, x=None, x_other_features_validate=None, y_validate=None, content_name='SlovarIJS_BESEDE_utf8.lex',
content_location='../../../data/', oversampling=np.ones(13)):
content_path = '{}{}'.format(content_location, content_name)
if data_type == 'train':
return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path, oversampling)
elif data_type == 'test':
return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path, oversampling)
elif data_type == 'validate':
return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path, oversampling)
else:
return self._generator_instance(x, x_other_features_validate, y_validate, batch_size)
# if self._input_type
def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path, oversampling):
if self._input_type == 'l':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels)
elif self._input_type == 's':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
eye = np.eye(len(syllable_dictionary), dtype=int)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels, oversampling)
elif self._input_type == 'sl':
content = self._read_content(content_path)
dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content)
syllable_dictionary = self._create_syllables_dictionary(content, vowels)
max_syllable = self._get_max_syllable(syllable_dictionary)
syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels, oversampling)
# generator for inputs for tracking of data fitting
def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels):
size = orig_x.shape[0]
while 1:
loc = 0
if self._accent_classification:
eye = np.eye(len(accented_vowels), dtype=int)
eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
while loc < size:
while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
yield ([np.array(input_x_stack[:batch_size]),
np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
# print('BBB')
# print(np.array(input_stack))
# yield (np.array(input_stack))
yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(orig_x[loc:size], 2)
yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:size]], orig_y[loc:size])
else:
yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size])
else:
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(orig_x[loc:loc + batch_size], 2)
yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
else:
yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
# generator for inputs for tracking of data fitting
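# translator maps syllable indices either to one-hot rows (an identity matrix)
# or to letter-level encodings of each syllable; oversampling[accent] controls
# how many times an example of a given accent class is repeated when
# accent_classification is on.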
def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling=np.ones(13)):
size = orig_x.shape[0]
while 1:
loc = 0
if self._accent_classification:
eye = np.eye(len(accented_vowels), dtype=int)
eye_input_accent = np.eye(len(orig_y[0]), dtype=int)
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
while loc < size:
while len(input_x_stack) < batch_size and loc < size:
accent_loc = 0
for accent in orig_y[loc]:
if accent > 0:
new_orig_x_additional = orig_x_additional[loc]
new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc]))
for i in range(int(oversampling[int(accent)])):
input_x_stack.append(orig_x[loc])
input_x_other_features_stack.append(new_orig_x_additional)
input_y_stack.append(eye[int(accent)])
accent_loc += 1
loc += 1
if len(input_x_stack) > batch_size:
gen_orig_x = translator[np.array(input_x_stack[:batch_size])]
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(gen_orig_x, 2)
yield ([split_orig_x[0], split_orig_x[1], np.array(input_x_other_features_stack[:batch_size])],
np.array(input_y_stack)[:batch_size])
else:
yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
# yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size])
input_x_stack = input_x_stack[batch_size:]
input_x_other_features_stack = input_x_other_features_stack[batch_size:]
input_y_stack = input_y_stack[batch_size:]
else:
#print('-------------------------------------------------------------------------------------------')
#if dictionary is not None:
# print(self.decode_x(word_encoded, dictionary))
#print(input_x_stack)
#print(input_x_other_features_stack)
#print(input_y_stack)
#print(loc)
if len(input_x_stack) == 0:
continue
gen_orig_x = translator[np.array(input_x_stack)]
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(gen_orig_x, 2)
yield ([split_orig_x[0], split_orig_x[1], np.array(input_x_other_features_stack)],
np.array(input_y_stack))
else:
yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
# yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack))
input_x_stack = []
input_x_other_features_stack = []
input_y_stack = []
else:
while loc < size:
if loc + batch_size >= size:
gen_orig_x = translator[orig_x[loc:size]]
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(gen_orig_x, 2)
yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:size]], orig_y[loc:size])
else:
yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
#yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size])
else:
gen_orig_x = translator[orig_x[loc:loc + batch_size]]
if self._bidirectional_architectural_input:
split_orig_x = np.hsplit(gen_orig_x, 2)
yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
else:
yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
#yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size])
loc += batch_size
def _get_max_syllable(self, syllable_dictionary):
max_len = 0
for el in syllable_dictionary:
if len(el) > max_len:
max_len = len(el)
return max_len
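# _create_syllable_letters_translator builds, for every syllable in the
# dictionary, a flat letter-level one-hot encoding padded to max_syllable,
# with the same six attribute flags used by _x_letter_input.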
def _create_syllable_letters_translator(self, max_syllable, syllable_dictionary, dictionary, vowels, aditional_letter_attributes=True):
if aditional_letter_attributes:
voiced_consonants = self._get_voiced_consonants()
resonant_silent_consonants = self._get_resonant_silent_consonants()
nonresonant_silent_consonants = self._get_nonresonant_silent_consonants()
syllable_letters_translator = []
for syllable in syllable_dictionary:
di_syllable = []
for let in range(max_syllable):
# di_let = []
for a in dictionary:
if let < len(syllable) and a == list(syllable)[let]:
di_syllable.append(1)
else:
di_syllable.append(0)
if aditional_letter_attributes:
if let >= len(syllable):
di_syllable.extend([0, 0, 0, 0, 0, 0])
elif self._is_vowel(list(syllable), let, vowels):
di_syllable.extend([1, 0, 0, 0, 0, 0])
else:
# X[i][j][len(dictionary) + 1] = 1
if list(syllable)[let] in voiced_consonants:
# X[i][j][len(dictionary) + 2] = 1
di_syllable.extend([0, 1, 1, 0, 0, 0])
else:
# X[i][j][len(dictionary) + 3] = 1
if list(syllable)[let] in resonant_silent_consonants:
# X[i][j][len(dictionary) + 4] = 1
di_syllable.extend([0, 1, 0, 1, 1, 0])
elif list(syllable)[let] in nonresonant_silent_consonants:
# X[i][j][len(dictionary) + 5] = 1
di_syllable.extend([0, 1, 0, 1, 0, 1])
else:
di_syllable.extend([0, 0, 0, 0, 0, 0])
# di_syllable.append(di_let)
syllable_letters_translator.append(di_syllable)
syllable_letters_translator = np.array(syllable_letters_translator, dtype=int)
return syllable_letters_translator
@staticmethod
def _get_accented_vowels():
return [u'à', u'á', u'ä', u'é', u'ë', u'ì', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
@staticmethod
def _get_unaccented_vowels():
return [u'a', u'e', u'i', u'o', u'u']
@staticmethod
def _get_voiced_consonants():
return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w']
@staticmethod
def _get_resonant_silent_consonants():
return ['b', 'd', 'z', 'ž', 'g']
@staticmethod
def _get_nonresonant_silent_consonants():
return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c']
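# The feature dictionaries below appear to describe the supported
# MULTEXT-East categories as [one-hot width, POS code, allowed values per
# attribute position]; the first uses Slovene codes, the second the English
# v4 codes.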
@staticmethod
def _create_slovene_feature_dictionary():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/
# new: http://nl.ijs.si/ME/V4/msd/html/
# changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
return [[21,
'P',
['p', 's'],
['n', 'p', 's'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[3, 'V', ['p', 'd']],
[1, 'M'],
[21,
'K',
['b'],
['-', 'g', 'v', 'd'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[17,
'S',
['o'],
['m', 'z', 's'],
['e', 'd', 'm'],
['i', 'r', 'd', 't', 'm', 'o'],
['-', 'n', 'd']],
[40,
'Z',
['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'],
['-', 'p', 'd', 't'],
['-', 'm', 'z', 's'],
['-', 'e', 'd', 'm'],
['-', 'i', 'r', 'd', 't', 'm', 'o'],
['-', 'e', 'd', 'm'],
['-', 'm', 'z', 's'],
['-', 'k', 'z']],
[1, 'L'],
[5, 'R', ['s'], ['n', 'r', 's']],
[7, 'D', ['-', 'r', 'd', 't', 'm', 'o']],
[24,
'G',
['g'],
['-'],
['n', 'm', 'd', 's', 'p', 'g'],
['-', 'p', 'd', 't'],
['-', 'e', 'm', 'd'],
['-', 'm', 'z', 's'],
['-', 'n', 'd']]
]
@staticmethod
def _create_feature_dictionary():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/
# new: http://nl.ijs.si/ME/V4/msd/html/
# changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
return [[21,
'A',
['g', 's'],
['p', 'c', 's'],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
[3, 'C', ['c', 's']],
[1, 'I'],
[21,
'M',
['l'],
['-', 'c', 'o', 's'],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
[17,
'N',
['c'],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
[40,
'P',
['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'],
['-', '1', '2', '3'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 's', 'd', 'p'],
['-', 'm', 'f', 'n'],
['-', 'y', 'b']],
[1, 'Q'],
[5, 'R', ['g'], ['p', 'c', 's']],
[7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']],
[24,
'V',
['m'],
['-'],
['n', 'u', 'p', 'r', 'f', 'c'],
['-', '1', '2', '3'],
['-', 's', 'p', 'd'],
['-', 'm', 'f', 'n'],
['-', 'n', 'y']]
]
# Decoders for inputs and outputs
@staticmethod
def decode_x(word_encoded, dictionary):
word = ''
for el in word_encoded:
i = 0
for num in el:
if num == 1:
word += dictionary[i]
break
i += 1
return word
@staticmethod
def decode_x_other_features(feature_dictionary, x_other_features):
final_word = []
for word in x_other_features:
final_word = []
i = 0
for z in range(len(feature_dictionary)):
for j in range(1, len(feature_dictionary[z])):
if j == 1:
if word[i] == 1:
final_word.append(feature_dictionary[z][1])
i += 1
else:
for k in range(len(feature_dictionary[z][j])):
if word[i] == 1:
final_word.append(feature_dictionary[z][j][k])
i += 1
# print(u''.join(final_word))
return u''.join(final_word)
@staticmethod
def decode_y(y):
i = 0
res = []
for el in y:
if el >= 0.5:
res.append(i)
i += 1
return res
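# test_accuracy thresholds the location predictions, compares them with the
# gold vectors and returns whole-word accuracy (in %) together with a list of
# decoded error cases.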
def test_accuracy(self, predictions, x, x_other_features, y, dictionary, feature_dictionary, vowels, syllable_dictionary=None,
threshold=0.4999955, patterns=None):
errors = []
num_of_pred = len(predictions)
num_of_correct_pred = 0
# wrong_patterns = 0
# wrong_pattern_prediction = 0
for i in range(predictions.shape[0]):
correct_prediction = True
round_predictions = np.zeros(predictions[i].shape)
for j in range(len(y[i])):
if predictions[i][j] < threshold:
round_predictions[j] = 0.0
else:
round_predictions[j] = 1.0
if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0):
correct_prediction = False
# in_pattern = False
# if patterns is not None:
# test_predictions = copy(predictions[i])
# l = self.get_word_length(x[i])
# round_predictions = np.zeros(test_predictions.shape)
# for j in range(len(y[i])):
# if test_predictions[j] < threshold:
# round_predictions[j] = 0.0
# else:
# round_predictions[j] = 1.0
#
# in_pattern = False
# for pattern in patterns[l]:
# if (pattern == round_predictions).all():
# in_pattern = True
# if not in_pattern:
# wrong_patterns += 1
#
# for j in range(len(y[i])):
# if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0):
# correct_prediction = False
#
# if not in_pattern and not correct_prediction:
# wrong_pattern_prediction += 1
# if (np.around(predictions[i]) == y[i]).all():
if correct_prediction:
num_of_correct_pred += 1
else:
if self._input_type == 'l':
decoded_x = self.decode_x(x[i], dictionary)
else:
decoded_x = self.decode_syllable_x(x[i], syllable_dictionary)
if self._bidirectional_basic_input:
decoded_x = decoded_x[:int(len(decoded_x)/2)]
errors.append([i,
decoded_x,
self.decode_x_other_features(feature_dictionary, [x_other_features[i]]),
self.assign_stress_locations(decoded_x, round_predictions, vowels, syllables=self._input_type != 'l'),
self.assign_stress_locations(decoded_x, y[i], vowels, syllables=self._input_type != 'l')
])
# print(wrong_patterns)
# print(wrong_pattern_prediction)
return (num_of_correct_pred / float(num_of_pred)) * 100, errors
# def get_word_length(self, x_el):
# i = 0
# for el in x_el:
# if el == 0:
# return i
# i += 1
# return 10
@staticmethod
def decode_syllable_x(word_encoded, syllable_dictionary):
word = []
for i in range(len(word_encoded)):
word.append(syllable_dictionary[word_encoded[i]])
return ''.join(word[::-1])
def assign_stress_locations(self, word, y, vowels, syllables=False):
if not syllables:
word_list = list(word)
else:
if self._reverse_inputs:
word_list = list(word)[::-1]
else:
word_list = list(word)
vowel_num = 0
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if word_list[i] == 'a' and y[vowel_num] == 1:
word_list[i] = 'á'
elif word_list[i] == 'e' and y[vowel_num] == 1:
word_list[i] = 'é'
elif word_list[i] == 'i' and y[vowel_num] == 1:
word_list[i] = 'í'
elif word_list[i] == 'o' and y[vowel_num] == 1:
word_list[i] = 'ó'
elif word_list[i] == 'u' and y[vowel_num] == 1:
word_list[i] = 'ú'
elif word_list[i] == 'r' and y[vowel_num] == 1:
word_list[i] = 'ŕ'
elif word_list[i] == 'A' and y[vowel_num] == 1:
word_list[i] = 'Á'
elif word_list[i] == 'E' and y[vowel_num] == 1:
word_list[i] = 'É'
elif word_list[i] == 'I' and y[vowel_num] == 1:
word_list[i] = 'Í'
elif word_list[i] == 'O' and y[vowel_num] == 1:
word_list[i] = 'Ó'
elif word_list[i] == 'U' and y[vowel_num] == 1:
word_list[i] = 'Ú'
elif word_list[i] == 'R' and y[vowel_num] == 1:
word_list[i] = 'Ŕ'
vowel_num += 1
if not syllables:
return ''.join(word_list)
else:
return ''.join(word_list[::-1])
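# test_type_accuracy evaluates accent-type classification: for each stressed
# syllable the prediction is restricted to the accent types possible for the
# stressed letter before taking the argmax; it returns per-stressed-syllable
# and per-word accuracy plus decoded errors.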
def test_type_accuracy(self, predictions, x, x_other_features, y, dictionary, feature_dictionary, vowels, accented_vowels,
syllable_dictionary=None):
errors = []
num_of_pred = len(predictions)
num_of_correct_pred = 0
num_of_correct_pred_words = 0
accentuation_index = 0
eye = np.eye(len(accented_vowels), dtype=int)
for i in range(len(y)):
correct_prediction = True
if self._input_type == 'l':
decoded_x = self.decode_x(x[i], dictionary)
else:
decoded_x = self.decode_syllable_x(x[i], syllable_dictionary)
wrong_word = decoded_x
correct_word = decoded_x
for j in range(len(y[i])):
if y[i][j] > 0:
# ERROR AS IT IS CALCULATED
# arounded_predictions = np.around(predictions[accentuation_index]).astype(int)
# MAX ELEMENT ONLY
# arounded_predictions = np.zeros(len(predictions[accentuation_index]))
# arounded_predictions[np.argmax(predictions[accentuation_index]).astype(int)] = 1
# MAX ELEMENT AMONG POSSIBLE ONES
# if i == 313:
# print(decoded_x)
stressed_letter = self.get_accentuated_letter(decoded_x, j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
correct_word = self.assign_word_accentuation_type(correct_word, j, eye[int(y[i][j])], vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
if (eye[int(y[i][j])] == arounded_predictions).all():
num_of_correct_pred += 1
else:
correct_prediction = False
accentuation_index += 1
if correct_prediction:
num_of_correct_pred_words += 1
else:
if self._input_type == 'l':
errors.append([i,
decoded_x[::-1],
self.decode_x_other_features(feature_dictionary, [x_other_features[i]]),
wrong_word[::-1],
correct_word[::-1]
])
else:
errors.append([i,
decoded_x,
self.decode_x_other_features(feature_dictionary, [x_other_features[i]]),
wrong_word,
correct_word
])
print(num_of_pred)
print(len(y))
print(num_of_correct_pred_words)
print(len(errors))
print(num_of_correct_pred_words + len(errors))
return (num_of_correct_pred / float(num_of_pred)) * 100, (num_of_correct_pred_words / float(len(y))) * 100, errors
def get_accentuated_letter(self, word, location, vowels, syllables=False, debug=False):
# print(location)
vowel_index = 0
word_list = list(word)
if not syllables:
word_list = list(word)
else:
word_list = list(word[::-1])
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels):
if location == vowel_index:
return word_list[i]
vowel_index += 1
def assign_word_accentuation_type(self, word, location, y, vowels, accented_vowels, syllables=False, debug=False):
vowel_index = 0
if not syllables:
word_list = list(word)
else:
word_list = list(word[::-1])
for i in range(len(word_list)):
if self._is_vowel(word_list, i, vowels + accented_vowels):
if location == vowel_index:
if len(np.where(y == 1)[0]) == 1:
word_list[i] = accented_vowels[np.where(y == 1)[0][0]]
vowel_index += 1
if not syllables:
return ''.join(word_list)
else:
return ''.join(word_list[::-1])
def assign_stress_types(self, predictions, word, y, vowels, accented_vowels):
words = []
accentuation_index = 0
for i in range(len(y)):
wrong_word = word[i][::-1]
for j in range(len(y[i])):
if y[i][j] > 0:
stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l')
possible_places = np.zeros(len(predictions[accentuation_index]))
if stressed_letter == 'r':
possible_places[0] = 1
elif stressed_letter == 'a':
possible_places[1] = 1
possible_places[2] = 1
elif stressed_letter == 'e':
possible_places[3] = 1
possible_places[4] = 1
possible_places[5] = 1
elif stressed_letter == 'i':
possible_places[6] = 1
possible_places[7] = 1
elif stressed_letter == 'o':
possible_places[8] = 1
possible_places[9] = 1
possible_places[10] = 1
elif stressed_letter == 'u':
possible_places[11] = 1
possible_places[12] = 1
possible_predictions = predictions[accentuation_index] * possible_places
arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int)
arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1
if np.max(possible_predictions) != 0:
wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels,
syllables=self._input_type != 'l', debug=i == 313)
accentuation_index += 1
words.append(wrong_word[::-1])
return words
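# load_location_models and load_type_models rebuild the three CNN
# architectures (letter, syllable and syllabled-letter inputs) and load
# trained weights from the given paths; they reference an actual_accuracy
# metric that is expected to be defined elsewhere.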
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
nn_output_dim = 10
conv_input_shape = (23, 36)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
# x = Dense(1024, input_dim=(516 + 256), activation='relu')(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_location_model.load_weights(letters_path)
##############################################################
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
conv_input_shape = (10, 5168)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
#####################################################
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# syllabled letters
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letters_location_model.load_weights(syllabled_letters_path)
return letter_location_model, syllable_location_model, syllabled_letters_location_model
@staticmethod
def load_type_models(letters_path, syllables_path, syllabled_letters_path):
nn_output_dim = 13
# letters
conv_input_shape = (23, 36)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
# letters
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
# syllabled letters
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_type_model.load_weights(letters_path)
conv_input_shape = (10, 5168)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_type_model.load_weights(syllables_path)
# syllabled letters
conv_input_shape = (10, 252)
othr_input = (150,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllabled_letter_type_model.load_weights(syllabled_letters_path)
return letter_type_model, syllable_type_model, syllabled_letter_type_model
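# get_ensemble_location_predictions runs all six location models (three input
# representations, each in reversed and in correct letter order) on the same
# words and averages their outputs; correct-order predictions are re-aligned
# with reverse_predictions before averaging.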
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
data = Data('l', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('s', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size))
data = Data('sl', shuffle_all_inputs=False, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
############## CORRECT ORDER INPUT ##############
data = Data('l', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_co_predictions = letter_location_co_model.predict_generator(generator, len(x) / (batch_size))
letter_location_co_predictions = data.reverse_predictions(letter_location_co_predictions, input_words, vowels)
data = Data('s', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_co_predictions = syllable_location_co_model.predict_generator(generator, len(x) / (batch_size))
syllable_location_co_predictions = data.reverse_predictions(syllable_location_co_predictions, input_words, vowels)
data = Data('sl', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_co_predictions = syllabled_letters_location_co_model.predict_generator(generator, len(x) / (batch_size))
syllabled_letters_location_co_predictions = data.reverse_predictions(syllabled_letters_location_co_predictions, input_words, vowels)
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions,
letter_location_co_predictions, syllable_location_co_predictions, syllabled_letters_location_co_predictions]), axis=0)
def count_syllables(self, word, vowels):
j = 0
num_vowels = 0
for j in range(len(word)):
if self._is_vowel(word, j, vowels):
num_vowels += 1
return num_vowels
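# reverse_predictions mirrors each prediction vector within the word's
# syllable count (capped at 10 output positions) so that correct-order and
# reversed-order model outputs line up.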
def reverse_predictions(self, predictions, words, vowels):
new_predictions = np.zeros(predictions.shape, dtype='float32')
for i in range(len(predictions)):
word_len = self.count_syllables(words[i][0], vowels)
if word_len > 10:
word_len = 10
for k in range(word_len):
new_predictions[i][k] += predictions[i][word_len - 1 - k]
return new_predictions
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
accentuation_length = (y_array > 0).sum()
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size))
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
############## CORRECT ORDER INPUT ##############
location_y = data.reverse_predictions(location_y, input_words, vowels)
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_co_predictions = letter_type_co_model.predict_generator(generator, accentuation_length / (batch_size))
data.reorder_correct_direction_inputs(letter_type_co_predictions, location_y)
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_co_predictions = syllable_type_co_model.predict_generator(generator, accentuation_length / (batch_size))
data.reorder_correct_direction_inputs(syllable_type_co_predictions, location_y)
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_co_predictions = syllabled_letter_type_co_model.predict_generator(generator, accentuation_length / batch_size)
data.reorder_correct_direction_inputs(syllabled_letter_type_co_predictions, location_y)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions,
letter_type_co_predictions, syllable_type_co_predictions, syllabled_letter_type_co_predictions]), axis=0)
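    # A minimal usage sketch (not part of the original pipeline): the six type models
    # and the dictionaries are assumed to be loaded elsewhere, e.g. with
    # keras.models.load_model, and `location_y` to be the rounded location predictions.
    #
    #   type_predictions = data.get_ensemble_type_predictions(
    #       input_words, location_y,
    #       letter_type_model, syllable_type_model, syllabled_letter_type_model,
    #       letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
    #       dictionary, max_word, max_num_vowels, vowels, accented_vowels,
    #       feature_dictionary, syllable_dictionary)
    #
    # The result is the element-wise mean of the six models' outputs, one row per
    # accented syllable.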
def reorder_correct_direction_inputs(self, predictions, y):
pred_i = 0
for i in range(len(y)):
num_accented_syllables = 0
for el in y[i]:
if el > 0:
num_accented_syllables += 1
if num_accented_syllables > 1:
min_i = pred_i
max_i = pred_i + num_accented_syllables - 1
while (max_i > min_i):
min_pred = copy(predictions[min_i])
max_pred = copy(predictions[max_i])
predictions[min_i] = max_pred
predictions[max_i] = min_pred
min_i += 1
max_i -= 1
pred_i += num_accented_syllables
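    # Illustration of the reordering above (hypothetical values): prediction rows for
    # words with more than one accented syllable are reversed in place, so outputs
    # produced from reversed-word inputs line up with correct-order stress locations.
    #
    #   preds = np.array([[0.1], [0.9], [0.5]])  # word A contributes two rows, word B one
    #   y = [[1, 2, 0], [1, 0, 0]]               # word A has two accented syllables, word B one
    #   data.reorder_correct_direction_inputs(preds, y)
    #   # preds is now [[0.9], [0.1], [0.5]]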
    def assign_location_stress(self, word, locations, vowels):
        # put an acute mark on the vowel whose 0-based index among the word's vowels
        # equals one of the predicted stress locations
        acute = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'r': 'ŕ',
                 'A': 'Á', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 'R': 'Ŕ'}
        word_list = list(word)
        for loc in locations:
            vowel_num = 0
            for i in range(len(word_list)):
                if self._is_vowel(word_list, i, vowels):
                    if vowel_num == loc and word_list[i] in acute:
                        word_list[i] = acute[word_list[i]]
                    vowel_num += 1
        return ''.join(word_list)
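    # Hedged usage sketch: `locations` holds 0-based vowel indices, so with a vowel
    # list containing the plain vowels the second vowel of the word gets the acute mark:
    #
    #   data.assign_location_stress('malina', [1], vowels)
    #   # -> 'malína'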
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model,
syllabled_letters_location_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
#print(predictions)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(input_words))]
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
only_words = [el[0] for el in input_words]
accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels)
return location_accented_words, accented_words
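    # Hedged sketch of the top-level call; all twelve models and the dictionaries are
    # assumed to be loaded beforehand (e.g. via keras.models.load_model):
    #
    #   location_words, accented_words = data.accentuate_word(
    #       input_words,
    #       letter_location_model, syllable_location_model, syllabled_letters_location_model,
    #       letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
    #       letter_type_model, syllable_type_model, syllabled_letter_type_model,
    #       letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
    #       dictionary, max_word, max_num_vowels, vowels, accented_vowels,
    #       feature_dictionary, syllable_dictionary)
    #
    # location_words carry only the stress position (acute marks); accented_words also
    # carry the predicted stress type.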
def tag_words(self, reldi_location, original_location):
        # generate a version of the text with one word per line
with open(original_location) as f:
original_text = f.readlines()
original_text = ''.join(original_text)
# print(original_text)
        text_with_whitespaces = original_text.replace(',', ' ,').replace('.', ' .').replace('\n', ' ') \
            .replace('"', ' " ').replace(':', ' :').replace('ć', 'č').replace('–', '-')  # normalize dashes
# print('-------------------------------------------------')
text_with_whitespaces = '\n'.join(text_with_whitespaces.split())
text_with_whitespaces += '\n\n'
# print(text_with_whitespaces)
with open('.words_with_whitespaces', "w") as text_file:
text_file.write(text_with_whitespaces)
# generates text with PoS tags
import subprocess
myinput = open('.words_with_whitespaces', 'r')
myoutput = open('.word_tags', 'w')
# print(myinput.readlines())
        python3_command = reldi_location + "/tagger.py sl"  # run the reldi-tagger for Slovene as a subprocess
process = subprocess.run(python3_command.split(), stdin=myinput, stdout=myoutput)
        # keep only the tokens relevant for accentuation (skip punctuation and tokens containing digits)
pointless_words = ['.', ',', '\"', ':', '-']
with open('.word_tags', "r") as text_file:
tagged_input_words = []
            for x in text_file.readlines()[:-1]:
                split_line = x[:-1].split('\t')
                if split_line[0] not in pointless_words and not any(char.isdigit() for char in split_line[0]):
                    tagged_input_words.append([split_line[0].lower(), '', split_line[1], split_line[0].lower()])
remove(".words_with_whitespaces")
remove(".word_tags")
return tagged_input_words, original_text
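    # Hedged illustration: each returned entry has the shape
    # [token_lowercase, '', MSD_tag, token_lowercase], built from the tab-separated
    # "token<TAB>tag" lines written by the reldi tagger. The path and output below
    # are hypothetical:
    #
    #   tagged_words, original_text = data.tag_words('/path/to/reldi', 'input.txt')
    #   # tagged_words[0] could look like ['hiša', '', 'Ncfsn', 'hiša']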
def create_connected_text_locations(self, tagged_input_words, original_text, predictions, vowels):
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
accented_words = [self.assign_location_stress(tagged_input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
range(len(tagged_input_words))]
# print(accented_words[:20])
# print(tagged_input_words[:20])
        words_and_accentuation_loc = [[tagged_input_words[i][0], self.decode_y(predictions[i])] for i in range(len(tagged_input_words))]
original_text_list = list(original_text)
original_text_lowercase = original_text.lower()
end_pos = 0
        for word in words_and_accentuation_loc:
posit = original_text_lowercase.find(word[0], end_pos)
if posit != -1:
start_pos = posit
end_pos = start_pos + len(word[0])
original_text_list[start_pos:end_pos] = list(
self.assign_location_stress(''.join(original_text_list[start_pos:end_pos][::-1]), word[1], vowels)[::-1])
return ''.join(original_text_list)
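    # Hedged usage sketch, assuming `tagged_words` and `original_text` come from
    # tag_words() and `predictions` from get_ensemble_location_predictions():
    #
    #   stressed_text = data.create_connected_text_locations(tagged_words, original_text,
    #                                                        predictions, vowels)
    #   # each recognised word in the running text receives its acute stress mark,
    #   # while punctuation, spacing and casing of the original text are preserved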
def create_connected_text_accented(self, tagged_input_words, original_text, type_predictions, location_y, vowels, accented_vowels):
input_words = [el[0] for el in tagged_input_words]
words = self.assign_stress_types(type_predictions, input_words, location_y, vowels, accented_vowels)
# print(original_text)
original_text_list = list(original_text)
original_text_lowercase = original_text.lower()
end_pos = 0
for i in range(len(words)):
posit = original_text_lowercase.find(input_words[i], end_pos)
if posit != -1:
start_pos = posit
end_pos = start_pos + len(words[i])
orig_word = original_text_list[start_pos:end_pos]
new_word = list(words[i])
for j in range(len(orig_word)):
if orig_word[j].isupper():
new_word[j] = new_word[j].upper()
original_text_list[start_pos:end_pos] = new_word
return ''.join(original_text_list)
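    # Hedged sketch of the companion call that writes full stress types back into the
    # text; `location_y` is the rounded location ensemble output and `type_predictions`
    # comes from get_ensemble_type_predictions():
    #
    #   accented_text = data.create_connected_text_accented(tagged_words, original_text,
    #                                                       type_predictions, location_y,
    #                                                       vowels, accented_vowels)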
# def count_vowels(content, vowels):
# num_all_vowels = 0
# for el in content:
# for m in range(len(el[0])):
# if is_vowel(list(el[0]), m, vowels):
# num_all_vowels += 1
# return num_all_vowels
# metric: fraction of words whose whole stress vector is predicted correctly
# test with:
# print(mean_pred(y_validate[pos], predictions[pos]).eval())
# print(mean_pred(np.array([[ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
# [ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]),
# np.array([[ 0., 0.51, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.],
# [ 0., 0.92, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.]])).eval())
def actual_accuracy(y_true, y_pred):
return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0))
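# A rough numpy analogue of the metric above (illustrative only): a word counts as
# correct only if every rounded position matches the target, e.g.
#
#   y_true = np.array([[0., 1., 0.], [1., 0., 0.]])
#   y_pred = np.array([[0.1, 0.9, 0.2], [0.4, 0.6, 0.1]])
#   np.mean(np.all(np.round(y_true) == np.round(y_pred), axis=-1))  # -> 0.5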
def convert_to_correct_stress(w):
w = w.replace('ì', 'ê')
w = w.replace('à', 'ŕ')
w = w.replace('ä', 'à')
w = w.replace('ë', 'è')
w = w.replace('ě', 'ê')
w = w.replace('î', 'ì')
w = w.replace('ö', 'ò')
w = w.replace('ü', 'ù')
return w
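# Hedged note on the mapping above: the inputs appear to be substitute characters from
# the lexicon encoding, replaced here by standard Slovene stress diacritics, e.g.
# (illustrative only):
#
#   convert_to_correct_stress('ä')  # -> 'à'
#   convert_to_correct_stress('ě')  # -> 'ê'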