# -*- coding: utf-8 -*- from __future__ import unicode_literals # text in Western (Windows 1252) import numpy as np import h5py import math import keras.backend as K import os.path from os import remove import codecs from copy import copy from keras import optimizers from keras.models import Model from keras.layers import Dense, Dropout, Input from keras.layers.merge import concatenate from keras.layers.convolutional import Conv1D from keras.layers.convolutional import MaxPooling1D from keras.layers import Flatten from keras.models import load_model class Data: def __init__(self, input_type, allow_shuffle_vector_generation=False, save_generated_data=True, shuffle_all_inputs=True, additional_letter_attributes=True, reverse_inputs=True, accent_classification=False, number_of_syllables=False, convert_multext=True, bidirectional_basic_input=False, bidirectional_architectural_input=False): self._input_type = input_type self._save_generated_data = save_generated_data self._allow_shuffle_vector_generation = allow_shuffle_vector_generation self._shuffle_all_inputs = shuffle_all_inputs self._additional_letter_attributes = additional_letter_attributes self._reverse_inputs = reverse_inputs self._accent_classification = accent_classification self._number_of_syllables = number_of_syllables self._convert_multext = convert_multext self._bidirectional_basic_input = bidirectional_basic_input self._bidirectional_architectural_input = bidirectional_architectural_input self.x_train = None # self.x2_train = None self.x_other_features_train = None self.y_train = None self.x_test = None # self.x2_test = None self.x_other_features_test = None self.y_test = None self.x_validate = None # self.x2_validate = None self.x_other_features_validate = None self.y_validate = None def generate_data(self, train_inputs_name, test_inputs_name, validate_inputs_name, test_and_validation_size=0.1, force_override=False, content_name='SlovarIJS_BESEDE_utf8.lex', content_shuffle_vector='content_shuffle_vector', shuffle_vector='shuffle_vector', inputs_location='../../internal_representations/inputs/', content_location='../../../data/', test_set=False, complete_set=False): content_path = '{}{}'.format(content_location, content_name) train_path = '{}{}.h5'.format(inputs_location, train_inputs_name) test_path = '{}{}.h5'.format(inputs_location, test_inputs_name) validate_path = '{}{}.h5'.format(inputs_location, validate_inputs_name) if not force_override and os.path.exists(train_path) and os.path.exists(test_path) and os.path.exists(validate_path): print('LOADING DATA...') self.x_train, self.x_other_features_train, self.y_train = self._load_inputs(train_path) self.x_test, self.x_other_features_test, self.y_test = self._load_inputs(test_path) self.x_validate, self.x_other_features_validate, self.y_validate = self._load_inputs(validate_path) print('LOAD SUCCESSFUL!') else: content_shuffle_vector_path = '{}{}.h5'.format(inputs_location, content_shuffle_vector) shuffle_vector_path = '{}{}'.format(inputs_location, shuffle_vector) # actual generation of inputs self._generate_inputs(content_path, content_shuffle_vector_path, shuffle_vector_path, test_and_validation_size, train_path, test_path, validate_path) if test_set: self.x_train = np.concatenate((self.x_train, self.x_test), axis=0) self.x_other_features_train = np.concatenate((self.x_other_features_train, self.x_other_features_test), axis=0) self.y_train = np.concatenate((self.y_train, self.y_test), axis=0) self.x_test = self.x_validate self.x_other_features_test = self.x_other_features_validate self.y_test = self.y_validate if complete_set: self.x_train = np.concatenate((self.x_train, self.x_test, self.x_validate), axis=0) self.x_other_features_train = np.concatenate((self.x_other_features_train, self.x_other_features_test, self.x_other_features_validate), axis=0) self.y_train = np.concatenate((self.y_train, self.y_test, self.y_validate), axis=0) self.x_test = self.x_validate self.x_other_features_test = self.x_other_features_validate self.y_test = self.y_validate def _generate_inputs(self, content_location, content_shuffle_vector_location, shuffle_vector_location, test_and_validation_size, train_path, test_path, validate_path): print('READING CONTENT...') content = self._read_content(content_location) print('CONTENT READ SUCCESSFULLY') print('CREATING DICTIONARY...') dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) if self._input_type == 's' or self._input_type == 'sl': dictionary = self._create_syllables_dictionary(content, vowels) print('DICTIONARY CREATION SUCCESSFUL!') # test_and_validation_size = 0.1 train_content, test_content, validate_content = self._split_content(content, test_and_validation_size, content_shuffle_vector_location) feature_dictionary = self._create_feature_dictionary() # Generate X and y print('GENERATING X AND y...') self.x_train, self.x_other_features_train, self.y_train = self._generate_x_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accented_vowels, feature_dictionary, shuffle_vector_location + '_train.h5') self.x_test, self.x_other_features_test, self.y_test = self._generate_x_and_y(dictionary, max_word, max_num_vowels, test_content, vowels, accented_vowels, feature_dictionary, shuffle_vector_location + '_test.h5') self.x_validate, self.x_other_features_validate, self.y_validate = self._generate_x_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accented_vowels, feature_dictionary, shuffle_vector_location + '_validate.h5') print('GENERATION SUCCESSFUL!') # save inputs if self._save_generated_data: self._save_inputs(train_path, self.x_train, self.x_other_features_train, self.y_train) self._save_inputs(test_path, self.x_test, self.x_other_features_test, self.y_test) self._save_inputs(validate_path, self.x_validate, self.x_other_features_validate, self.y_validate) # return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate # functions for creating X and y from content @staticmethod def _read_content(content_path): # with open(content_path) as f: with codecs.open(content_path, encoding='utf8') as f: content = f.readlines() return [x.split('\t') for x in content] def _create_dict(self, content): # CREATE dictionary AND max_word accented_vowels = self._get_accented_vowels() unaccented_vowels = self._get_unaccented_vowels() vowels = [] vowels.extend(accented_vowels) vowels.extend(unaccented_vowels) dictionary_input = [''] line = 0 max_word = 0 # ADD 'EMPTY' VOWEL max_num_vowels = 0 for el in content: num_vowels = 0 try: if len(el[3]) > max_word: max_word = len(el[3]) if len(el[0]) > max_word: max_word = len(el[0]) for i in range(len(el[3])): if self._is_vowel(list(el[3]), i, vowels): num_vowels += 1 for c in list(el[0]): if c not in dictionary_input: dictionary_input.append(c) if num_vowels > max_num_vowels: max_num_vowels = num_vowels except Exception: print(line - 1) print(el) break line += 1 dictionary_input = sorted(dictionary_input) # max_num_vowels += 1 return dictionary_input, max_word, max_num_vowels, vowels, accented_vowels # split content so that there is no overfitting def _split_content(self, content, test_and_validation_ratio, content_shuffle_vector_location): expanded_content = [el[1] if el[1] != '=' else el[0] for el in content] # print(len(content)) unique_content = sorted(set(expanded_content)) s = self._load_shuffle_vector(content_shuffle_vector_location, len(unique_content)) test_num = math.floor(len(unique_content) * (test_and_validation_ratio * 2)) validation_num = math.floor(test_num * 0.5) shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= test_num] shuffled_unique_train_content_set = set(shuffled_unique_train_content) shuffled_unique_test_content = [unique_content[i] for i in range(len(s)) if test_num > s[i] >= validation_num] shuffled_unique_test_content_set = set(shuffled_unique_test_content) shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < validation_num] shuffled_unique_validate_content_set = set(shuffled_unique_validate_content) train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set] test_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_test_content_set] validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set] return train_content, test_content, validate_content @staticmethod def _create_and_save_shuffle_vector(file_name, length): shuffle_vector = np.arange(length) np.random.shuffle(shuffle_vector) h5f = h5py.File(file_name, 'w') adict = dict(shuffle_vector=shuffle_vector) for k, v in adict.items(): h5f.create_dataset(k, data=v) h5f.close() return shuffle_vector def _x_letter_input(self, content, dictionary, max_word, vowels, shuffle_vector_location): if self._additional_letter_attributes: if not self._bidirectional_basic_input: x = np.zeros((len(content), max_word, len(dictionary) + 6), dtype=int) else: x = np.zeros((len(content), 2 * max_word, len(dictionary) + 6), dtype=int) voiced_consonants = self._get_voiced_consonants() resonant_silent_consonants = self._get_resonant_silent_consonants() nonresonant_silent_consonants = self._get_nonresonant_silent_consonants() # print('HERE!!!') else: # print('HERE!!!') if not self._bidirectional_basic_input: x = np.zeros((len(content), max_word, len(dictionary)), dtype=int) else: x = np.zeros((len(content), 2 * max_word, len(dictionary)), dtype=int) if self._shuffle_all_inputs: s = self._load_shuffle_vector(shuffle_vector_location, len(content)) else: s = None # i = 0 for i in range(len(content)): if self._shuffle_all_inputs: mod_i = s[i] else: mod_i = i word = content[mod_i][0] if self._reverse_inputs: word = word[::-1] j = 0 for c in list(word): if j >= max_word: continue index = 0 if self._bidirectional_basic_input: j2 = max_word + (len(word) - j - 1) for d in dictionary: if c == d: x[i][j][index] = 1 if self._bidirectional_basic_input: x[i][j2][index] = 1 break index += 1 if self._additional_letter_attributes: if self._is_vowel(word, j, vowels): x[i][j][len(dictionary)] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary)] = 1 else: x[i][j][len(dictionary) + 1] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary) + 1] = 1 if c in voiced_consonants: x[i][j][len(dictionary) + 2] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary) + 2] = 1 else: x[i][j][len(dictionary) + 3] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary) + 3] = 1 if c in resonant_silent_consonants: x[i][j][len(dictionary) + 4] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary) + 4] = 1 elif c in nonresonant_silent_consonants: x[i][j][len(dictionary) + 5] = 1 if self._bidirectional_basic_input: x[i][j2][len(dictionary) + 5] = 1 j += 1 #i += 1 return x def _x_syllable_input(self, content, dictionary, max_num_vowels, vowels, shuffle_vector_location): if not self._bidirectional_basic_input: x = np.zeros((len(content), max_num_vowels), dtype=int) else: x = np.zeros((len(content), 2 * max_num_vowels), dtype=int) if self._shuffle_all_inputs: s = self._load_shuffle_vector(shuffle_vector_location, len(content)) else: s = None for i in range(len(content)): if self._shuffle_all_inputs: mod_i = s[i] else: mod_i = i j = 0 syllables = self._create_syllables(content[mod_i][0], vowels) if self._reverse_inputs: syllables = syllables[::-1] for syllable in syllables: if j >= max_num_vowels: continue if syllable in dictionary: x[i][j] = dictionary.index(syllable) if self._bidirectional_basic_input: x[i][max_num_vowels + (len(syllables) - j - 1)] = dictionary.index(syllable) else: x[i][j] = 0 j += 1 #i += 1 return x def _y_output(self, content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location): y = np.zeros((len(content), max_num_vowels)) i = 0 if self._shuffle_all_inputs: s = self._load_shuffle_vector(shuffle_vector_location, len(content)) else: s = None for i in range(len(content)): if self._shuffle_all_inputs: mod_i = s[i] else: mod_i = i el = content[mod_i] word = el[3] if self._reverse_inputs: word = word[::-1] j = 0 # word_accentuations = [] num_vowels = 0 for c in list(word): index = 0 for d in accentuated_vowels: if c == d: if not self._accent_classification: y[i][num_vowels] = 1 else: y[i][num_vowels] = index # word_accentuations.append(num_vowels) break index += 1 if self._is_vowel(word, j, vowels): num_vowels += 1 j += 1 return y # Generate each y as an array of 11 numbers (with possible values between 0 and 1) def _generate_x_and_y(self, dictionary, max_word, max_num_vowels, content, vowels, accentuated_vowels, feature_dictionary, shuffle_vector_location): if self._input_type == 'l': x = self._x_letter_input(content, dictionary, max_word, vowels, shuffle_vector_location) elif self._input_type == 's' or self._input_type == 'sl': x = self._x_syllable_input(content, dictionary, max_num_vowels, vowels, shuffle_vector_location) else: raise ValueError('No input_type provided. It could be \'l\', \'s\' or \'sl\'.') y = self._y_output(content, max_num_vowels, vowels, accentuated_vowels, shuffle_vector_location) # print('CREATING OTHER FEATURES...') x_other_features = self._create_x_features(content, feature_dictionary, vowels, shuffle_vector_location) # print('OTHER FEATURES CREATED!') if self._shuffle_all_inputs: print('SHUFFELING INPUTS...') #x, x_other_features, y = self._shuffle_inputs(x, x_other_features, y, shuffle_vector_location) print('INPUTS SHUFFELED!') return x, x_other_features, y def _create_syllables_dictionary(self, content, vowels): dictionary = [] for el in content: syllables = self._create_syllables(el[0], vowels) for syllable in syllables: if syllable not in dictionary: dictionary.append(syllable) dictionary.append('') return sorted(dictionary) def _create_syllables(self, word, vowels): word_list = list(word) consonants = [] syllables = [] for i in range(len(word_list)): if self._is_vowel(word_list, i, vowels): if syllables == []: consonants.append(word_list[i]) syllables.append(''.join(consonants)) else: left_consonants, right_consonants = self._split_consonants(list(''.join(consonants).lower())) syllables[-1] += ''.join(left_consonants) right_consonants.append(word_list[i]) syllables.append(''.join(right_consonants)) consonants = [] else: consonants.append(word_list[i]) if len(syllables) < 1: return word syllables[-1] += ''.join(consonants) return syllables def _is_vowel(self, word_list, position, vowels): if word_list[position] in vowels: return True if (word_list[position] == u'r' or word_list[position] == u'R') and (position - 1 < 0 or word_list[position - 1] not in vowels) and ( position + 1 >= len(word_list) or word_list[position + 1] not in vowels): return True return False def _split_consonants(self, consonants): voiced_consonants = self._get_voiced_consonants() resonant_silent_consonants = self._get_resonant_silent_consonants() unresonant_silent_consonants = self._get_nonresonant_silent_consonants() if len(consonants) == 0: return [''], [''] elif len(consonants) == 1: return [''], consonants else: split_options = [] for i in range(len(consonants) - 1): if consonants[i] == '-' or consonants[i] == '_': split_options.append([i, -1]) elif consonants[i] == consonants[i + 1]: split_options.append([i, 0]) elif consonants[i] in voiced_consonants: if consonants[i + 1] in resonant_silent_consonants or consonants[i + 1] in unresonant_silent_consonants: split_options.append([i, 2]) elif consonants[i] in resonant_silent_consonants: if consonants[i + 1] in resonant_silent_consonants: split_options.append([i, 1]) elif consonants[i + 1] in unresonant_silent_consonants: split_options.append([i, 3]) elif consonants[i] in unresonant_silent_consonants: if consonants[i + 1] in resonant_silent_consonants: split_options.append([i, 4]) if split_options == []: return [''], consonants else: split = min(split_options, key=lambda x: x[1]) return consonants[:split[0] + 1], consonants[split[0] + 1:] def _create_x_features(self, content, feature_dictionary, vowels, shuffle_vector_location): content = content x_other_features = [] if self._shuffle_all_inputs: s = self._load_shuffle_vector(shuffle_vector_location, len(content)) else: s = None for index in range(len(content)): if self._shuffle_all_inputs: mod_i = s[index] else: mod_i = index el = content[mod_i] x_el_other_features = [] if self._convert_multext: converted_el = ''.join(self._convert_to_multext_east_v4(list(el[2]), feature_dictionary)) else: converted_el = el[2] for feature in feature_dictionary: if converted_el[0] == feature[1]: x_el_other_features.append(1) for i in range(2, len(feature)): for j in range(len(feature[i])): if i - 1 < len(converted_el) and feature[i][j] == converted_el[i - 1]: x_el_other_features.append(1) else: x_el_other_features.append(0) else: x_el_other_features.extend([0] * feature[0]) if self._number_of_syllables: list_of_letters = list(el[0]) num_of_vowels = 0 for i in range(len(list_of_letters)): if self._is_vowel(list(el[0]), i, vowels): num_of_vowels += 1 x_el_other_features.append(num_of_vowels) x_other_features.append(x_el_other_features) return np.array(x_other_features) def _shuffle_inputs(self, x, x_other_features, y, shuffle_vector_location): s = self._load_shuffle_vector(shuffle_vector_location, x.shape[0]) x = x[s] y = y[s] x_other_features = x_other_features[s] return x, x_other_features, y # functions for saving, loading and shuffling whole arrays to ram @staticmethod def _save_inputs(file_name, x, x_other_features, y): h5f = h5py.File(file_name, 'w') a_dict = dict(X=x, X_other_features=x_other_features, y=y) for k, v in a_dict.items(): h5f.create_dataset(k, data=v) h5f.close() @staticmethod def _load_inputs(file_name): h5f = h5py.File(file_name, 'r') x = h5f['X'][:] y = h5f['y'][:] x_other_features = h5f['X_other_features'][:] h5f.close() return x, x_other_features, y def _load_shuffle_vector(self, file_path, length=0): if os.path.exists(file_path): h5f = h5py.File(file_path, 'r') shuffle_vector = h5f['shuffle_vector'][:] h5f.close() else: if self._allow_shuffle_vector_generation: shuffle_vector = self._create_and_save_shuffle_vector(file_path, length) else: raise ValueError('Shuffle vector on path: \'{}\' does not exist! Either generate new vector (with initializing new Data object with ' 'parameter allow_shuffle_vector_generation=True or paste one that is already generated!'.format(file_path)) return shuffle_vector @staticmethod def _convert_to_multext_east_v4(old_features, feature_dictionary): new_features = ['-'] * 9 new_features[:len(old_features)] = old_features if old_features[0] == 'A': if old_features[1] == 'f' or old_features[1] == 'o': new_features[1] = 'g' return new_features[:len(feature_dictionary[0]) - 1] if old_features[0] == 'C': return new_features[:len(feature_dictionary[1]) - 1] if old_features[0] == 'I': return new_features[:len(feature_dictionary[2]) - 1] if old_features[0] == 'M': new_features[2:6] = old_features[1:5] new_features[1] = old_features[5] if new_features[2] == 'm': new_features[2] = '-' return new_features[:len(feature_dictionary[3]) - 1] if old_features[0] == 'N': if len(old_features) >= 7: new_features[5] = old_features[7] return new_features[:len(feature_dictionary[4]) - 1] if old_features[0] == 'P': if new_features[8] == 'n': new_features[8] = 'b' return new_features[:len(feature_dictionary[5]) - 1] if old_features[0] == 'Q': return new_features[:len(feature_dictionary[6]) - 1] if old_features[0] == 'R': return new_features[:len(feature_dictionary[7]) - 1] if old_features[0] == 'S': if len(old_features) == 4: new_features[1] = old_features[3] else: new_features[1] = '-' return new_features[:len(feature_dictionary[8]) - 1] if old_features[0] == 'V': if old_features[1] == 'o' or old_features[1] == 'c': new_features[1] = 'm' new_features[3] = old_features[2] new_features[2] = '-' if old_features[2] == 'i': new_features[3] = 'r' if len(old_features) > 3 and old_features[3] == 'p': new_features[3] = 'r' elif len(old_features) > 3 and old_features[3] == 'f': new_features[3] = 'f' if len(old_features) >= 9: new_features[7] = old_features[8] else: new_features[7] = '-' return new_features[:len(feature_dictionary[9]) - 1] return '' # generator for inputs for tracking of data fitting def generator(self, data_type, batch_size, x=None, x_other_features_validate=None, y_validate=None, content_name='SlovarIJS_BESEDE_utf8.lex', content_location='../../../data/', oversampling=np.ones(13)): content_path = '{}{}'.format(content_location, content_name) if data_type == 'train': return self._generator_instance(self.x_train, self.x_other_features_train, self.y_train, batch_size, content_path, oversampling) elif data_type == 'test': return self._generator_instance(self.x_test, self.x_other_features_test, self.y_test, batch_size, content_path, oversampling) elif data_type == 'validate': return self._generator_instance(self.x_validate, self.x_other_features_validate, self.y_validate, batch_size, content_path, oversampling) else: return self._generator_instance(x, x_other_features_validate, y_validate, batch_size) # if self._input_type def _generator_instance(self, orig_x, orig_x_additional, orig_y, batch_size, content_path, oversampling): if self._input_type == 'l': content = self._read_content(content_path) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) return self._letter_generator(orig_x, orig_x_additional, orig_y, batch_size, accented_vowels) elif self._input_type == 's': content = self._read_content(content_path) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) syllable_dictionary = self._create_syllables_dictionary(content, vowels) eye = np.eye(len(syllable_dictionary), dtype=int) return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, eye, accented_vowels, oversampling) elif self._input_type == 'sl': content = self._read_content(content_path) dictionary, max_word, max_num_vowels, vowels, accented_vowels = self._create_dict(content) syllable_dictionary = self._create_syllables_dictionary(content, vowels) max_syllable = self._get_max_syllable(syllable_dictionary) syllable_letters_translator = self._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) return self._syllable_generator(orig_x, orig_x_additional, orig_y, batch_size, syllable_letters_translator, accented_vowels, oversampling) # generator for inputs for tracking of data fitting def _letter_generator(self, orig_x, orig_x_additional, orig_y, batch_size, accented_vowels): size = orig_x.shape[0] while 1: loc = 0 if self._accent_classification: eye = np.eye(len(accented_vowels), dtype=int) eye_input_accent = np.eye(len(orig_y[0]), dtype=int) input_x_stack = [] input_x_other_features_stack = [] input_y_stack = [] while loc < size: while len(input_x_stack) < batch_size and loc < size: accent_loc = 0 for accent in orig_y[loc]: if accent > 0: new_orig_x_additional = orig_x_additional[loc] new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc])) input_x_stack.append(orig_x[loc]) input_x_other_features_stack.append(new_orig_x_additional) input_y_stack.append(eye[int(accent)]) accent_loc += 1 loc += 1 if len(input_x_stack) > batch_size: yield ([np.array(input_x_stack[:batch_size]), np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size]) input_x_stack = input_x_stack[batch_size:] input_x_other_features_stack = input_x_other_features_stack[batch_size:] input_y_stack = input_y_stack[batch_size:] else: # print('BBB') # print(np.array(input_stack)) # yield (np.array(input_stack)) yield ([np.array(input_x_stack), np.array(input_x_other_features_stack)], np.array(input_y_stack)) input_x_stack = [] input_x_other_features_stack = [] input_y_stack = [] else: while loc < size: if loc + batch_size >= size: if self._bidirectional_architectural_input: split_orig_x = np.hsplit(orig_x[loc:size], 2) yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:size]], orig_y[loc:size]) else: yield ([orig_x[loc:size], orig_x_additional[loc:size]], orig_y[loc:size]) else: if self._bidirectional_architectural_input: split_orig_x = np.hsplit(orig_x[loc:loc + batch_size], 2) yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) else: yield ([orig_x[loc:loc + batch_size], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) loc += batch_size # generator for inputs for tracking of data fitting def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling=np.ones(13)): size = orig_x.shape[0] while 1: loc = 0 if self._accent_classification: eye = np.eye(len(accented_vowels), dtype=int) eye_input_accent = np.eye(len(orig_y[0]), dtype=int) input_x_stack = [] input_x_other_features_stack = [] input_y_stack = [] while loc < size: while len(input_x_stack) < batch_size and loc < size: accent_loc = 0 for accent in orig_y[loc]: if accent > 0: new_orig_x_additional = orig_x_additional[loc] new_orig_x_additional = np.concatenate((new_orig_x_additional, eye_input_accent[accent_loc])) for i in range(int(oversampling[int(accent)])): input_x_stack.append(orig_x[loc]) input_x_other_features_stack.append(new_orig_x_additional) input_y_stack.append(eye[int(accent)]) accent_loc += 1 loc += 1 if len(input_x_stack) > batch_size: gen_orig_x = translator[np.array(input_x_stack[:batch_size])] if self._bidirectional_architectural_input: split_orig_x = np.hsplit(gen_orig_x, 2) yield ([split_orig_x[0], split_orig_x[1], np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size]) else: yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size]) # yield ([gen_orig_x, np.array(input_x_other_features_stack[:batch_size])], np.array(input_y_stack)[:batch_size]) input_x_stack = input_x_stack[batch_size:] input_x_other_features_stack = input_x_other_features_stack[batch_size:] input_y_stack = input_y_stack[batch_size:] else: #print('-------------------------------------------------------------------------------------------') #if dictionary is not None: # print(self.decode_x(word_encoded, dictionary)) #print(input_x_stack) #print(input_x_other_features_stack) #print(input_y_stack) #print(loc) if len(input_x_stack) == 0: continue gen_orig_x = translator[np.array(input_x_stack)] if self._bidirectional_architectural_input: split_orig_x = np.hsplit(gen_orig_x, 2) yield ([split_orig_x[0], split_orig_x[1], np.array(input_x_other_features_stack)], np.array(input_y_stack)) else: yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack)) # yield ([gen_orig_x, np.array(input_x_other_features_stack)], np.array(input_y_stack)) input_x_stack = [] input_x_other_features_stack = [] input_y_stack = [] else: while loc < size: if loc + batch_size >= size: gen_orig_x = translator[orig_x[loc:size]] if self._bidirectional_architectural_input: split_orig_x = np.hsplit(gen_orig_x, 2) yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:size]], orig_y[loc:size]) else: yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size]) #yield ([gen_orig_x, orig_x_additional[loc:size]], orig_y[loc:size]) else: gen_orig_x = translator[orig_x[loc:loc + batch_size]] if self._bidirectional_architectural_input: split_orig_x = np.hsplit(gen_orig_x, 2) yield ([split_orig_x[0], split_orig_x[1], orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) else: yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) #yield ([gen_orig_x, orig_x_additional[loc:loc + batch_size]], orig_y[loc:loc + batch_size]) loc += batch_size def _get_max_syllable(self, syllable_dictionary): max_len = 0 for el in syllable_dictionary: if len(el) > max_len: max_len = len(el) return max_len def _create_syllable_letters_translator(self, max_syllable, syllable_dictionary, dictionary, vowels, aditional_letter_attributes=True): if aditional_letter_attributes: voiced_consonants = self._get_voiced_consonants() resonant_silent_consonants = self._get_resonant_silent_consonants() nonresonant_silent_consonants = self._get_nonresonant_silent_consonants() syllable_letters_translator = [] for syllable in syllable_dictionary: di_syllable = [] for let in range(max_syllable): # di_let = [] for a in dictionary: if let < len(syllable) and a == list(syllable)[let]: di_syllable.append(1) else: di_syllable.append(0) if aditional_letter_attributes: if let >= len(syllable): di_syllable.extend([0, 0, 0, 0, 0, 0]) elif self._is_vowel(list(syllable), let, vowels): di_syllable.extend([1, 0, 0, 0, 0, 0]) else: # X[i][j][len(dictionary) + 1] = 1 if list(syllable)[let] in voiced_consonants: # X[i][j][len(dictionary) + 2] = 1 di_syllable.extend([0, 1, 1, 0, 0, 0]) else: # X[i][j][len(dictionary) + 3] = 1 if list(syllable)[let] in resonant_silent_consonants: # X[i][j][len(dictionary) + 4] = 1 di_syllable.extend([0, 1, 0, 1, 1, 0]) elif list(syllable)[let] in nonresonant_silent_consonants: # X[i][j][len(dictionary) + 5] = 1 di_syllable.extend([0, 1, 0, 1, 0, 1]) else: di_syllable.extend([0, 0, 0, 0, 0, 0]) # di_syllable.append(di_let) syllable_letters_translator.append(di_syllable) syllable_letters_translator = np.array(syllable_letters_translator, dtype=int) return syllable_letters_translator @staticmethod def _get_accented_vowels(): return [u'à', u'á', u'ä', u'é', u'ë', u'ì', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü'] @staticmethod def _get_unaccented_vowels(): return [u'a', u'e', u'i', u'o', u'u'] @staticmethod def _get_voiced_consonants(): return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w'] @staticmethod def _get_resonant_silent_consonants(): return ['b', 'd', 'z', 'ž', 'g'] @staticmethod def _get_nonresonant_silent_consonants(): return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c'] @staticmethod def _create_slovene_feature_dictionary(): # old: http://nl.ijs.si/ME/Vault/V3/msd/html/ # new: http://nl.ijs.si/ME/V4/msd/html/ # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html return [[21, 'P', ['p', 's'], ['n', 'p', 's'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [3, 'V', ['p', 'd']], [1, 'M'], [21, 'K', ['b'], ['-', 'g', 'v', 'd'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [17, 'S', ['o'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [40, 'Z', ['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'], ['-', 'p', 'd', 't'], ['-', 'm', 'z', 's'], ['-', 'e', 'd', 'm'], ['-', 'i', 'r', 'd', 't', 'm', 'o'], ['-', 'e', 'd', 'm'], ['-', 'm', 'z', 's'], ['-', 'k', 'z']], [1, 'L'], [5, 'R', ['s'], ['n', 'r', 's']], [7, 'D', ['-', 'r', 'd', 't', 'm', 'o']], [24, 'G', ['g'], ['-'], ['n', 'm', 'd', 's', 'p', 'g'], ['-', 'p', 'd', 't'], ['-', 'e', 'm', 'd'], ['-', 'm', 'z', 's'], ['-', 'n', 'd']] ] @staticmethod def _create_feature_dictionary(): # old: http://nl.ijs.si/ME/Vault/V3/msd/html/ # new: http://nl.ijs.si/ME/V4/msd/html/ # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html return [[21, 'A', ['g', 's'], ['p', 'c', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [3, 'C', ['c', 's']], [1, 'I'], [21, 'M', ['l'], ['-', 'c', 'o', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [17, 'N', ['c'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [40, 'P', ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], ['-', '1', '2', '3'], ['-', 'm', 'f', 'n'], ['-', 's', 'd', 'p'], ['-', 'n', 'g', 'd', 'a', 'l', 'i'], ['-', 's', 'd', 'p'], ['-', 'm', 'f', 'n'], ['-', 'y', 'b']], [1, 'Q'], [5, 'R', ['g'], ['p', 'c', 's']], [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], [24, 'V', ['m'], ['-'], ['n', 'u', 'p', 'r', 'f', 'c'], ['-', '1', '2', '3'], ['-', 's', 'p', 'd'], ['-', 'm', 'f', 'n'], ['-', 'n', 'y']] ] # Decoders for inputs and outputs @staticmethod def decode_x(word_encoded, dictionary): word = '' for el in word_encoded: i = 0 for num in el: if num == 1: word += dictionary[i] break i += 1 return word @staticmethod def decode_x_other_features(feature_dictionary, x_other_features): final_word = [] for word in x_other_features: final_word = [] i = 0 for z in range(len(feature_dictionary)): for j in range(1, len(feature_dictionary[z])): if j == 1: if word[i] == 1: final_word.append(feature_dictionary[z][1]) i += 1 else: for k in range(len(feature_dictionary[z][j])): if word[i] == 1: final_word.append(feature_dictionary[z][j][k]) i += 1 # print(u''.join(final_word)) return u''.join(final_word) @staticmethod def decode_y(y): i = 0 res = [] for el in y: if el >= 0.5: res.append(i) i += 1 return res def test_accuracy(self, predictions, x, x_other_features, y, dictionary, feature_dictionary, vowels, syllable_dictionary=None, threshold=0.4999955, patterns=None): errors = [] num_of_pred = len(predictions) num_of_correct_pred = 0 # wrong_patterns = 0 # wrong_pattern_prediction = 0 for i in range(predictions.shape[0]): correct_prediction = True round_predictions = np.zeros(predictions[i].shape) for j in range(len(y[i])): if predictions[i][j] < threshold: round_predictions[j] = 0.0 else: round_predictions[j] = 1.0 if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0): correct_prediction = False # in_pattern = False # if patterns is not None: # test_predictions = copy(predictions[i]) # l = self.get_word_length(x[i]) # round_predictions = np.zeros(test_predictions.shape) # for j in range(len(y[i])): # if test_predictions[j] < threshold: # round_predictions[j] = 0.0 # else: # round_predictions[j] = 1.0 # # in_pattern = False # for pattern in patterns[l]: # if (pattern == round_predictions).all(): # in_pattern = True # if not in_pattern: # wrong_patterns += 1 # # for j in range(len(y[i])): # if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0): # correct_prediction = False # # if not in_pattern and not correct_prediction: # wrong_pattern_prediction += 1 # if (np.around(predictions[i]) == y[i]).all(): if correct_prediction: num_of_correct_pred += 1 else: if self._input_type == 'l': decoded_x = self.decode_x(x[i], dictionary) else: decoded_x = self.decode_syllable_x(x[i], syllable_dictionary) if self._bidirectional_basic_input: decoded_x = decoded_x[:int(len(decoded_x)/2)] errors.append([i, decoded_x, self.decode_x_other_features(feature_dictionary, [x_other_features[i]]), self.assign_stress_locations(decoded_x, round_predictions, vowels, syllables=self._input_type != 'l'), self.assign_stress_locations(decoded_x, y[i], vowels, syllables=self._input_type != 'l') ]) # print(wrong_patterns) # print(wrong_pattern_prediction) return (num_of_correct_pred / float(num_of_pred)) * 100, errors # def get_word_length(self, x_el): # i = 0 # for el in x_el: # if el == 0: # return i # i += 1 # return 10 @staticmethod def decode_syllable_x(word_encoded, syllable_dictionary): word = [] for i in range(len(word_encoded)): word.append(syllable_dictionary[word_encoded[i]]) return ''.join(word[::-1]) def assign_stress_locations(self, word, y, vowels, syllables=False): if not syllables: word_list = list(word) else: if self._reverse_inputs: word_list = list(word)[::-1] else: word_list = list(word) vowel_num = 0 for i in range(len(word_list)): if self._is_vowel(word_list, i, vowels): if word_list[i] == 'a' and y[vowel_num] == 1: word_list[i] = 'á' elif word_list[i] == 'e' and y[vowel_num] == 1: word_list[i] = 'é' elif word_list[i] == 'i' and y[vowel_num] == 1: word_list[i] = 'í' elif word_list[i] == 'o' and y[vowel_num] == 1: word_list[i] = 'ó' elif word_list[i] == 'u' and y[vowel_num] == 1: word_list[i] = 'ú' elif word_list[i] == 'r' and y[vowel_num] == 1: word_list[i] = 'ŕ' elif word_list[i] == 'A' and y[vowel_num] == 1: word_list[i] = 'Á' elif word_list[i] == 'E' and y[vowel_num] == 1: word_list[i] = 'É' elif word_list[i] == 'I' and y[vowel_num] == 1: word_list[i] = 'Í' elif word_list[i] == 'O' and y[vowel_num] == 1: word_list[i] = 'Ó' elif word_list[i] == 'U' and y[vowel_num] == 1: word_list[i] = 'Ú' elif word_list[i] == 'R' and y[vowel_num] == 1: word_list[i] = 'Ŕ' vowel_num += 1 if not syllables: return ''.join(word_list) else: return ''.join(word_list[::-1]) def test_type_accuracy(self, predictions, x, x_other_features, y, dictionary, feature_dictionary, vowels, accented_vowels, syllable_dictionary=None): errors = [] num_of_pred = len(predictions) num_of_correct_pred = 0 num_of_correct_pred_words = 0 accentuation_index = 0 eye = np.eye(len(accented_vowels), dtype=int) for i in range(len(y)): correct_prediction = True if self._input_type == 'l': decoded_x = self.decode_x(x[i], dictionary) else: decoded_x = self.decode_syllable_x(x[i], syllable_dictionary) wrong_word = decoded_x correct_word = decoded_x for j in range(len(y[i])): if y[i][j] > 0: # ERROR AS IT IS CALCULATED # arounded_predictions = np.around(predictions[accentuation_index]).astype(int) # MAX ELEMENT ONLY # arounded_predictions = np.zeros(len(predictions[accentuation_index])) # arounded_predictions[np.argmax(predictions[accentuation_index]).astype(int)] = 1 # MAX ELEMENT AMONGT POSSIBLE ONES # if i == 313: # print(decoded_x) stressed_letter = self.get_accentuated_letter(decoded_x, j, vowels, syllables=self._input_type != 'l') possible_places = np.zeros(len(predictions[accentuation_index])) if stressed_letter == 'r': possible_places[0] = 1 elif stressed_letter == 'a': possible_places[1] = 1 possible_places[2] = 1 elif stressed_letter == 'e': possible_places[3] = 1 possible_places[4] = 1 possible_places[5] = 1 elif stressed_letter == 'i': possible_places[6] = 1 possible_places[7] = 1 elif stressed_letter == 'o': possible_places[8] = 1 possible_places[9] = 1 possible_places[10] = 1 elif stressed_letter == 'u': possible_places[11] = 1 possible_places[12] = 1 possible_predictions = predictions[accentuation_index] * possible_places arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int) arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1 wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels, syllables=self._input_type != 'l', debug=i == 313) correct_word = self.assign_word_accentuation_type(correct_word, j, eye[int(y[i][j])], vowels, accented_vowels, syllables=self._input_type != 'l', debug=i == 313) if (eye[int(y[i][j])] == arounded_predictions).all(): num_of_correct_pred += 1 else: correct_prediction = False accentuation_index += 1 if correct_prediction: num_of_correct_pred_words += 1 else: if self._input_type == 'l': errors.append([i, decoded_x[::-1], self.decode_x_other_features(feature_dictionary, [x_other_features[i]]), wrong_word[::-1], correct_word[::-1] ]) else: errors.append([i, decoded_x, self.decode_x_other_features(feature_dictionary, [x_other_features[i]]), wrong_word, correct_word ]) print(num_of_pred) print(len(y)) print(num_of_correct_pred_words) print(len(errors)) print(num_of_correct_pred_words + len(errors)) return (num_of_correct_pred / float(num_of_pred)) * 100, (num_of_correct_pred_words / float(len(y))) * 100, errors def get_accentuated_letter(self, word, location, vowels, syllables=False, debug=False): # print(location) vowel_index = 0 word_list = list(word) if not syllables: word_list = list(word) else: word_list = list(word[::-1]) for i in range(len(word_list)): if self._is_vowel(word_list, i, vowels): if location == vowel_index: return word_list[i] vowel_index += 1 def assign_word_accentuation_type(self, word, location, y, vowels, accented_vowels, syllables=False, debug=False): vowel_index = 0 if not syllables: word_list = list(word) else: word_list = list(word[::-1]) for i in range(len(word_list)): if self._is_vowel(word_list, i, vowels + accented_vowels): if location == vowel_index: if len(np.where(y == 1)[0]) == 1: word_list[i] = accented_vowels[np.where(y == 1)[0][0]] vowel_index += 1 if not syllables: return ''.join(word_list) else: return ''.join(word_list[::-1]) def assign_stress_types(self, predictions, word, y, vowels, accented_vowels): words = [] accentuation_index = 0 for i in range(len(y)): wrong_word = word[i][::-1] for j in range(len(y[i])): if y[i][j] > 0: stressed_letter = self.get_accentuated_letter(word[i][::-1], j, vowels, syllables=self._input_type != 'l') possible_places = np.zeros(len(predictions[accentuation_index])) if stressed_letter == 'r': possible_places[0] = 1 elif stressed_letter == 'a': possible_places[1] = 1 possible_places[2] = 1 elif stressed_letter == 'e': possible_places[3] = 1 possible_places[4] = 1 possible_places[5] = 1 elif stressed_letter == 'i': possible_places[6] = 1 possible_places[7] = 1 elif stressed_letter == 'o': possible_places[8] = 1 possible_places[9] = 1 possible_places[10] = 1 elif stressed_letter == 'u': possible_places[11] = 1 possible_places[12] = 1 possible_predictions = predictions[accentuation_index] * possible_places arounded_predictions = np.zeros(len(predictions[accentuation_index]), dtype=int) arounded_predictions[np.argmax(possible_predictions).astype(int)] = 1 if np.max(possible_predictions) != 0: wrong_word = self.assign_word_accentuation_type(wrong_word, j, arounded_predictions, vowels, accented_vowels, syllables=self._input_type != 'l', debug=i == 313) accentuation_index += 1 words.append(wrong_word[::-1]) return words @staticmethod def load_location_models(letters_path, syllables_path, syllabled_letters_path): ############################ LOCATION ######################## nn_output_dim = 10 conv_input_shape = (23, 36) othr_input = (140,) conv_input = Input(shape=conv_input_shape, name='conv_input') x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input) x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv) x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) # x = Dense(1024, input_dim=(516 + 256), activation='relu')(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) letter_location_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) letter_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) letter_location_model.load_weights(letters_path) ############################################################## # num_examples = len(data.x_train) # training set size nn_output_dim = 10 conv_input_shape = (10, 5168) othr_input = (140,) conv_input = Input(shape=conv_input_shape, name='conv_input') # syllabled letters x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input) x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) syllable_location_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) syllable_location_model.load_weights(syllables_path) ##################################################### conv_input_shape = (10, 252) othr_input = (140,) conv_input = Input(shape=conv_input_shape, name='conv_input') # syllabled letters x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input) x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) syllabled_letters_location_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) syllabled_letters_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) syllabled_letters_location_model.load_weights(syllabled_letters_path) return letter_location_model, syllable_location_model, syllabled_letters_location_model @staticmethod def load_type_models(letters_path, syllables_path, syllabled_letters_path): nn_output_dim = 13 # letters conv_input_shape = (23, 36) othr_input = (150,) conv_input = Input(shape=conv_input_shape, name='conv_input') # letters x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input) x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv) # syllabled letters x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) letter_type_model.load_weights(letters_path) conv_input_shape = (10, 5168) othr_input = (150,) conv_input = Input(shape=conv_input_shape, name='conv_input') x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input) x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) syllable_type_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) syllable_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) syllable_type_model.load_weights(syllables_path) # syllabled letters conv_input_shape = (10, 252) othr_input = (150,) conv_input = Input(shape=conv_input_shape, name='conv_input') x_conv = Conv1D(200, (2), padding='same', activation='relu')(conv_input) x_conv = MaxPooling1D(pool_size=2)(x_conv) x_conv = Flatten()(x_conv) othr_input = Input(shape=othr_input, name='othr_input') x = concatenate([x_conv, othr_input]) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(256, activation='relu')(x) x = Dropout(0.3)(x) x = Dense(nn_output_dim, activation='sigmoid')(x) syllabled_letter_type_model = Model(inputs=[conv_input, othr_input], outputs=x) opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) syllabled_letter_type_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ]) syllabled_letter_type_model.load_weights(syllabled_letters_path) return letter_type_model, syllable_type_model, syllabled_letter_type_model @staticmethod def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary): batch_size = 16 # print(tagged_input_words[pos]) data = Data('l', shuffle_all_inputs=False, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels) letter_location_predictions = letter_location_model.predict_generator(generator, len(x) / (batch_size)) data = Data('s', shuffle_all_inputs=False, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') eye = np.eye(len(syllable_dictionary), dtype=int) generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels) syllable_location_predictions = syllable_location_model.predict_generator(generator, len(x) / (batch_size)) data = Data('sl', shuffle_all_inputs=False, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') max_syllable = data._get_max_syllable(syllable_dictionary) syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels) syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size)) ############## CORRECT ORDER INPUT ############## data = Data('l', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels) letter_location_co_predictions = letter_location_co_model.predict_generator(generator, len(x) / (batch_size)) letter_location_co_predictions = data.reverse_predictions(letter_location_co_predictions, input_words, vowels) data = Data('s', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') eye = np.eye(len(syllable_dictionary), dtype=int) generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels) syllable_location_co_predictions = syllable_location_co_model.predict_generator(generator, len(x) / (batch_size)) syllable_location_co_predictions = data.reverse_predictions(syllable_location_co_predictions, input_words, vowels) data = Data('sl', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') max_syllable = data._get_max_syllable(syllable_dictionary) syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels) syllabled_letters_location_co_predictions = syllabled_letters_location_co_model.predict_generator(generator, len(x) / (batch_size)) syllabled_letters_location_co_predictions = data.reverse_predictions(syllabled_letters_location_co_predictions, input_words, vowels) return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions, letter_location_co_predictions, syllable_location_co_predictions, syllabled_letters_location_co_predictions]), axis=0) def count_syllables(self, word, vowels): j = 0 num_vowels = 0 for j in range(len(word)): if self._is_vowel(word, j, vowels): num_vowels += 1 return num_vowels def reverse_predictions(self, predictions, words, vowels): new_predictions = np.zeros(predictions.shape, dtype='float32') for i in range(len(predictions)): word_len = self.count_syllables(words[i][0], vowels) if word_len > 10: word_len = 10 for k in range(word_len): new_predictions[i][k] += predictions[i][word_len - 1 - k] return new_predictions @staticmethod def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model, letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary): batch_size = 16 y_array = np.asarray(location_y) accentuation_length = (y_array > 0).sum() data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels) letter_type_predictions = letter_type_model.predict_generator(generator, accentuation_length / (batch_size)) data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') eye = np.eye(len(syllable_dictionary), dtype=int) generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels) syllable_type_predictions = syllable_type_model.predict_generator(generator, accentuation_length / (batch_size)) data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') max_syllable = data._get_max_syllable(syllable_dictionary) syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels) syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size) ############## CORRECT ORDER INPUT ############## location_y = data.reverse_predictions(location_y, input_words, vowels) data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels) letter_type_co_predictions = letter_type_co_model.predict_generator(generator, accentuation_length / (batch_size)) data.reorder_correct_direction_inputs(letter_type_co_predictions, location_y) data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') eye = np.eye(len(syllable_dictionary), dtype=int) generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels) syllable_type_co_predictions = syllable_type_co_model.predict_generator(generator, accentuation_length / (batch_size)) data.reorder_correct_direction_inputs(syllable_type_co_predictions, location_y) data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False) x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels, feature_dictionary, 'who cares') max_syllable = data._get_max_syllable(syllable_dictionary) syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels) generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels) syllabled_letter_type_co_predictions = syllabled_letter_type_co_model.predict_generator(generator, accentuation_length / batch_size) data.reorder_correct_direction_inputs(syllabled_letter_type_co_predictions, location_y) return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions, letter_type_co_predictions, syllable_type_co_predictions, syllabled_letter_type_co_predictions]), axis=0) def reorder_correct_direction_inputs(self, predictions, y): pred_i = 0 for i in range(len(y)): num_accented_syllables = 0 for el in y[i]: if el > 0: num_accented_syllables += 1 if num_accented_syllables > 1: min_i = pred_i max_i = pred_i + num_accented_syllables - 1 while (max_i > min_i): min_pred = copy(predictions[min_i]) max_pred = copy(predictions[max_i]) predictions[min_i] = max_pred predictions[max_i] = min_pred min_i += 1 max_i -= 1 pred_i += num_accented_syllables def assign_location_stress(self, word, locations, vowels): # word = list(word) word_list = list(word) for loc in locations: vowel_num = 0 # if loc == 0: # return word for i in range(len(word_list)): if self._is_vowel(word_list, i, vowels): if word_list[i] == 'a' and vowel_num == loc: word_list[i] = 'á' elif word_list[i] == 'e' and vowel_num == loc: word_list[i] = 'é' elif word_list[i] == 'i' and vowel_num == loc: word_list[i] = 'í' elif word_list[i] == 'o' and vowel_num == loc: word_list[i] = 'ó' elif word_list[i] == 'u' and vowel_num == loc: word_list[i] = 'ú' elif word_list[i] == 'r' and vowel_num == loc: word_list[i] = 'ŕ' elif word_list[i] == 'A' and vowel_num == loc: word_list[i] = 'Á' elif word_list[i] == 'E' and vowel_num == loc: word_list[i] = 'É' elif word_list[i] == 'I' and vowel_num == loc: word_list[i] = 'Í' elif word_list[i] == 'O' and vowel_num == loc: word_list[i] = 'Ó' elif word_list[i] == 'U' and vowel_num == loc: word_list[i] = 'Ú' elif word_list[i] == 'R' and vowel_num == loc: word_list[i] = 'Ŕ' vowel_num += 1 # print(word_list) return ''.join(word_list) def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, letter_type_model, syllable_type_model, syllabled_letter_type_model, letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary): predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary) #print(predictions) if 'A' not in vowels: vowels.extend(['A', 'E', 'I', 'O', 'U']) location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in range(len(input_words))] location_y = np.around(predictions) type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model, letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary) only_words = [el[0] for el in input_words] accented_words = self.assign_stress_types(type_predictions, only_words, location_y, vowels, accented_vowels) return location_accented_words, accented_words def tag_words(self, reldi_location, original_location): # generates text with every word in new line with open(original_location) as f: original_text = f.readlines() original_text = ''.join(original_text) # print(original_text) text_with_whitespaces = original_text.replace(',', ' ,').replace('.', ' .').replace('\n', ' ').replace("\"", " \" ").replace(":", " :").replace( "ć", "č").replace('–', '-') # print('-------------------------------------------------') text_with_whitespaces = '\n'.join(text_with_whitespaces.split()) text_with_whitespaces += '\n\n' # print(text_with_whitespaces) with open('.words_with_whitespaces', "w") as text_file: text_file.write(text_with_whitespaces) # generates text with PoS tags import subprocess myinput = open('.words_with_whitespaces', 'r') myoutput = open('.word_tags', 'w') # print(myinput.readlines()) python3_command = reldi_location + "/tagger.py sl" # launch your python2 script using bash process = subprocess.run(python3_command.split(), stdin=myinput, stdout=myoutput) # generates interesting words pointless_words = ['.', ',', '\"', ':', '-'] with open('.word_tags', "r") as text_file: tagged_input_words = [] for x in text_file.readlines()[:-1]: splited_line = x[:-1].split('\t') if splited_line[0] not in pointless_words and not any(char.isdigit() for char in splited_line[0]): tagged_input_words.append([splited_line[0].lower(), '', splited_line[1], splited_line[0].lower()]) remove(".words_with_whitespaces") remove(".word_tags") return tagged_input_words, original_text def create_connected_text_locations(self, tagged_input_words, original_text, predictions, vowels): if 'A' not in vowels: vowels.extend(['A', 'E', 'I', 'O', 'U']) accented_words = [self.assign_location_stress(tagged_input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in range(len(tagged_input_words))] # print(accented_words[:20]) # print(tagged_input_words[:20]) words_and_accetuation_loc = [[tagged_input_words[i][0], self.decode_y(predictions[i])] for i in range(len(tagged_input_words))] original_text_list = list(original_text) original_text_lowercase = original_text.lower() end_pos = 0 for word in words_and_accetuation_loc: posit = original_text_lowercase.find(word[0], end_pos) if posit != -1: start_pos = posit end_pos = start_pos + len(word[0]) original_text_list[start_pos:end_pos] = list( self.assign_location_stress(''.join(original_text_list[start_pos:end_pos][::-1]), word[1], vowels)[::-1]) return ''.join(original_text_list) def create_connected_text_accented(self, tagged_input_words, original_text, type_predictions, location_y, vowels, accented_vowels): input_words = [el[0] for el in tagged_input_words] words = self.assign_stress_types(type_predictions, input_words, location_y, vowels, accented_vowels) # print(original_text) original_text_list = list(original_text) original_text_lowercase = original_text.lower() end_pos = 0 for i in range(len(words)): posit = original_text_lowercase.find(input_words[i], end_pos) if posit != -1: start_pos = posit end_pos = start_pos + len(words[i]) orig_word = original_text_list[start_pos:end_pos] new_word = list(words[i]) for j in range(len(orig_word)): if orig_word[j].isupper(): new_word[j] = new_word[j].upper() original_text_list[start_pos:end_pos] = new_word return ''.join(original_text_list) # def count_vowels(content, vowels): # num_all_vowels = 0 # for el in content: # for m in range(len(el[0])): # if is_vowel(list(el[0]), m, vowels): # num_all_vowels += 1 # return num_all_vowels # metric for calculation of correct results # test with: # print(mean_pred(y_validate[pos], predictions[pos]).eval()) # print(mean_pred(np.array([[ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.], # [ 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]), # np.array([[ 0., 0.51, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.], # [ 0., 0.92, 0., 0.51, 0., 0., 0., 0., 0., 0., 0.]])).eval()) def actual_accuracy(y_true, y_pred): return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0)) def convert_to_correct_stress(w): w = w.replace('ì', 'ê') w = w.replace('à', 'ŕ') w = w.replace('ä', 'à') w = w.replace('ë', 'è') w = w.replace('ě', 'ê') w = w.replace('î', 'ì') w = w.replace('ö', 'ò') w = w.replace('ü', 'ù') return w