[MAJOR UPDATE] Switched the additional features to MULTEXT-East version 4, removed unnecessary input letters (unused vowels), and split the former validation data into separate test and validation sets
@ -7,6 +7,7 @@ import h5py
import gc
import math
import keras.backend as K
import os.path
# functions for saving, loading and shuffling whole arrays to ram
@ -34,9 +35,15 @@ def load_inputs(file_name, other_features=False):
return X, y
def shuffle_inputs(X, y, X_pure=[]):
s = np.arange(X.shape[0])
def shuffle_inputs(X, y, shuffle_vector_location, X_pure=[]):
if os.path.exists(shuffle_vector_location):
s = load_shuffle_vector(shuffle_vector_location)
s = np.arange(X.shape[0])
create_and_save_shuffle_vector(shuffle_vector_location, s)
# s = np.arange(X.shape[0])
# np.random.shuffle(s)
X = X[s]
y = y[s]
if X_pure != []:
@ -57,7 +64,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
def load_extended_inputs(file_name, obtain_range):
h5f = h5py.File(file_name,'r')
h5f = h5py.File(file_name, 'r')
X = h5f['X'][obtain_range[0]:obtain_range[1]]
y = h5f['y'][obtain_range[0]:obtain_range[1]]
X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]]
@ -69,16 +76,17 @@ def load_extended_inputs(file_name, obtain_range):
# functions for creating and loading shuffle vector
def create_and_save_shuffle_vector(file_name, shuffle_vector):
    """Store ``shuffle_vector`` in the HDF5 file ``file_name``.

    The file is opened in ``'w'`` mode, so an existing file at that path is
    replaced.  ``file_name`` is used verbatim; no suffix is appended.  The
    vector is written as a single dataset named ``'shuffle_vector'``, which is
    the name ``load_shuffle_vector`` reads back.

    :param file_name: path of the HDF5 file to create.
    :param shuffle_vector: array-like permutation to persist.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # dataset creation fails.
    with h5py.File(file_name, 'w') as h5f:
        h5f.create_dataset('shuffle_vector', data=shuffle_vector)
def load_shuffle_vector(file_name):
    """Return the complete shuffle vector stored in the HDF5 file ``file_name``.

    Reads the dataset named ``'shuffle_vector'`` in full (an earlier revision
    read only three hard-coded indices).

    :param file_name: path of an HDF5 file containing a ``'shuffle_vector'``
        dataset.
    :return: the dataset contents, read into memory.
    """
    with h5py.File(file_name, 'r') as h5f:
        # [:] reads the whole dataset into memory, so the returned array stays
        # usable after the file is closed on leaving the ``with`` block.
        return h5f['shuffle_vector'][:]
@ -138,7 +146,8 @@ def create_dict():
dictionary = ['']
dictionary_output = ['']
dictionary_input = ['']
line = 0
max_word = 0
@ -154,12 +163,12 @@ def create_dict():
for c in list(el[3]):
if is_vowel(list(el[3]), i, vowels):
num_vowels += 1
if c not in dictionary:
if c not in dictionary_output:
i += 1
for c in list(el[0]):
if c not in dictionary:
if c not in dictionary_input:
if num_vowels > max_num_vowels:
max_num_vowels = num_vowels
except Exception:
@ -167,10 +176,10 @@ def create_dict():
line += 1
dictionary = sorted(dictionary)
dictionary_input = sorted(dictionary_input)
max_num_vowels += 1
return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
return dictionary_input, max_word, max_num_vowels, content, vowels, accetuated_vowels
# GENERATE X and y
@ -272,7 +281,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
# return X, y
def generate_full_matrix_inputs(content_shuffle_vector_location, shuffle_vector_location):
    """Build the train, test and validation inputs from the dictionary content.

    The content returned by ``create_dict`` is divided by ``split_content``
    with a test-and-validation ratio of 0.2, and every part is encoded by
    ``generate_X_and_y`` using the feature dictionary from
    ``create_feature_dictionary``.

    :param content_shuffle_vector_location: path handed to ``split_content``
        for its shuffle vector.
    :param shuffle_vector_location: path prefix; ``'_train.h5'``,
        ``'_test.h5'`` and ``'_validate.h5'`` are appended to form the
        shuffle-vector path passed to ``generate_X_and_y`` for each part.
    :return: a 9-tuple ``(X_train, X_other_features_train, y_train,
        X_test, X_other_features_test, y_test,
        X_validate, X_other_features_validate, y_validate)``.
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    train_content, test_content, validate_content = split_content(
        content, 0.2, content_shuffle_vector_location)
    feature_dictionary = create_feature_dictionary()

    # Generate X and y
    print('GENERATING X AND y...')
    parts = (
        (train_content, '_train.h5'),
        (test_content, '_test.h5'),
        (validate_content, '_validate.h5'),
    )
    results = []
    for part_content, suffix in parts:
        X, X_other_features, y = generate_X_and_y(
            dictionary, max_word, max_num_vowels, part_content, vowels,
            accetuated_vowels, feature_dictionary, shuffle_vector_location + suffix)
        results.extend((X, X_other_features, y))
    return tuple(results)
# generate full matrix, with old features
def old_generate_full_matrix_inputs():
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
train_content, validate_content = split_content(content, 0.2)
feature_dictionary = create_feature_dictionary(content)
@ -286,7 +310,7 @@ def generate_full_matrix_inputs():
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary):
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location):
y = np.zeros((len(content), max_num_vowels))
X = np.zeros((len(content), max_word, len(dictionary)))
@ -328,7 +352,7 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
i += 1
X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features)
X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
return X, X_other_features, y
@ -559,6 +583,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
# Decoders for inputs and outputs
def decode_X_features(feature_dictionary, X_other_features):
final_word = []
for word in X_other_features:
final_word = []
i = 0
@ -574,6 +599,7 @@ def decode_X_features(feature_dictionary, X_other_features):
i += 1
return u''.join(final_word)
def decode_position(y, max_num_vowels):
@ -650,7 +676,37 @@ def decode_position_from_vowel_to_final_number(y):
# split content so that there is no overfitting
def split_content(content, test_and_validation_ratio, content_shuffle_vector_location, validation_ratio=0.5):
    """Split ``content`` into train, test and validation parts by grouping key.

    Each entry is keyed by ``el[1]``, or by ``el[0]`` when ``el[1]`` is
    ``'='``.  All entries that share a key land in the same part, so the same
    key never appears in more than one part.

    The assignment of keys to parts is driven by a permutation that is loaded
    from ``content_shuffle_vector_location`` when that file exists, and
    otherwise generated, shuffled and saved there, so that repeated runs
    reproduce the same split.

    :param content: sequence of entries indexable at positions 0 and 1.
    :param test_and_validation_ratio: fraction of the unique keys reserved for
        test and validation together.
    :param content_shuffle_vector_location: path of the stored permutation.
    :param validation_ratio: fraction of the reserved keys that goes to
        validation; the rest goes to test.
    :return: ``(train_content, test_content, validate_content)``, each in the
        original ``content`` order.
    :raises ValueError: if the stored permutation does not have one entry per
        unique key (for example, it was created for a different data set).
    """
    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
    unique_content = sorted(set(expanded_content))

    if os.path.exists(content_shuffle_vector_location):
        s = load_shuffle_vector(content_shuffle_vector_location)
    else:
        s = np.arange(len(unique_content))
        # Without the shuffle the split would simply follow the sorted key order.
        np.random.shuffle(s)
        create_and_save_shuffle_vector(content_shuffle_vector_location, s)

    if len(s) != len(unique_content):
        raise ValueError(
            'shuffle vector at %r has %d entries but the content has %d unique keys'
            % (content_shuffle_vector_location, len(s), len(unique_content)))

    split_num = math.floor(len(unique_content) * test_and_validation_ratio)
    validation_num = math.floor(split_num * validation_ratio)

    # Rank thresholds: [0, validation_num) -> validate,
    # [validation_num, split_num) -> test, [split_num, ...) -> train.
    part_of_key = {}
    for key, rank in zip(unique_content, s):
        if rank >= split_num:
            part_of_key[key] = 0
        elif rank >= validation_num:
            part_of_key[key] = 1
        else:
            part_of_key[key] = 2

    train_content, test_content, validate_content = [], [], []
    parts = (train_content, test_content, validate_content)
    for el, key in zip(content, expanded_content):
        parts[part_of_key[key]].append(el)

    return train_content, test_content, validate_content
# split content so that there is no overfitting, without splitting the held-out data into separate validation and test sets
def old_split_content(content, ratio):
expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
# print(len(content))
unique_content = sorted(set(expanded_content))
@ -671,8 +727,8 @@ def split_content(content, ratio):
return train_content, validate_content
# create feature dictionary
def create_feature_dictionary(content):
# X features that use MULTEXT-East v3 as their encoding
def create_old_feature_dictionary(content):
additional_data = [el[2] for el in content]
possible_variants = sorted(set(additional_data))
categories = sorted(set([el[0] for el in possible_variants]))
@ -690,7 +746,8 @@ def create_feature_dictionary(content):
return feature_dictionary
def create_X_features(content, feature_dictionary):
# X features that use MULTEXT-East v3 as their encoding
def create_old_X_features(content, feature_dictionary):
content = content
X_other_features = []
for el in content:
@ -707,4 +764,213 @@ def create_X_features(content, feature_dictionary):
X_el_other_features.extend([0] * feature[0])
return np.array(X_other_features)
return np.array(X_other_features)
# Convert an old-style tag, given as a list of characters, into the layout
# expected by feature_dictionary. The result starts as nine '-' placeholders
# with the old characters copied over the front; the first character then
# selects a category-specific rewrite, and each branch returns the result cut
# to len(feature_dictionary[k]) - 1 characters for that category's index k.
# A first character not handled below yields ''.
# NOTE(review): indentation and some short lines (e.g. `else:`) are missing in
# this copy, so the exact nesting of the statements below must be confirmed
# against the repository before relying on these notes.
def convert_to_MULTEXT_east_v4(old_features, feature_dictionary):
new_features = ['-'] * 9
new_features[:len(old_features)] = old_features
# 'A': a second character of 'f' or 'o' is replaced by 'g'.
if old_features[0] == 'A':
if old_features[1] == 'f' or old_features[1] == 'o':
new_features[1] = 'g'
return new_features[:len(feature_dictionary[0]) - 1]
# 'C' and 'I': only padding and truncation.
if old_features[0] == 'C':
return new_features[:len(feature_dictionary[1]) - 1]
if old_features[0] == 'I':
return new_features[:len(feature_dictionary[2]) - 1]
# 'M': old positions 1-4 move to 2-5 and old position 5 moves to position 1;
# an 'm' at position 2 is blanked to '-'.
# NOTE(review): old_features[5] is read unconditionally here — presumably
# these tags always have at least six characters; confirm.
if old_features[0] == 'M':
new_features[2:6] = old_features[1:5]
new_features[1] = old_features[5]
if new_features[2] == 'm':
new_features[2] = '-'
return new_features[:len(feature_dictionary[3]) - 1]
# 'N': for tags longer than five characters, position 5 takes old position 7.
# NOTE(review): the guard is len > 5 but index 7 is read — confirm tag lengths.
if old_features[0] == 'N':
if len(old_features) > 5:
new_features[5] = old_features[7]
return new_features[:len(feature_dictionary[4]) - 1]
# 'P': an 'n' at position 8 becomes 'b'.
if old_features[0] == 'P':
if new_features[8] == 'n':
new_features[8] = 'b'
return new_features[:len(feature_dictionary[5]) - 1]
# 'Q' and 'R': only padding and truncation.
if old_features[0] == 'Q':
return new_features[:len(feature_dictionary[6]) - 1]
if old_features[0] == 'R':
return new_features[:len(feature_dictionary[7]) - 1]
# 'S': a four-character tag moves old position 3 to position 1.
# NOTE(review): the '-' assignment that follows presumably belongs to a
# missing `else:` branch — confirm.
if old_features[0] == 'S':
if len(old_features) == 4:
new_features[1] = old_features[3]
new_features[1] = '-'
return new_features[:len(feature_dictionary[8]) - 1]
# 'V': 'o'/'c' at position 1 become 'm'; old position 2 moves to position 3
# and position 2 is blanked; several values then rewrite position 3.
# NOTE(review): the final '-' assignment to position 7 presumably belongs to a
# missing `else:` branch of the length check — confirm.
if old_features[0] == 'V':
if old_features[1] == 'o' or old_features[1] == 'c':
new_features[1] = 'm'
new_features[3] = old_features[2]
new_features[2] = '-'
if old_features[2] == 'i':
new_features[3] = 'r'
if len(old_features) > 3 and old_features[3] == 'p':
new_features[3] = 'r'
elif len(old_features) > 3 and old_features[3] == 'f':
new_features[3] = 'f'
if len(old_features) >= 9:
new_features[7] = old_features[8]
new_features[7] = '-'
return new_features[:len(feature_dictionary[9]) - 1]
return ''
# Encode the tag stored in el[2] of every content entry against
# feature_dictionary and return the collected rows as a NumPy array.
# Each tag is first rewritten by convert_to_MULTEXT_east_v4. The category whose
# letter (feature[1]) matches the tag's first character is compared value by
# value (feature[2:]) with the tag's characters; feature[0] zeros are added via
# extend.
# NOTE(review): the statements that append to X_el_other_features after the
# comparison, the branch that selects the zero-fill, and the append of each row
# to X_other_features are not visible in this copy — confirm against the
# repository.
def create_X_features(content, feature_dictionary):
content = content
X_other_features = []
for el in content:
X_el_other_features = []
converted_el = ''.join(convert_to_MULTEXT_east_v4(list(el[2]), feature_dictionary))
# converted_el = el[2]
for feature in feature_dictionary:
if converted_el[0] == feature[1]:
# Value lists start at feature[2]; list i is compared with tag position i-1.
for i in range(2, len(feature)):
for j in range(len(feature[i])):
if i-1 < len(converted_el) and feature[i][j] == converted_el[i-1]:
X_el_other_features.extend([0] * feature[0])
return np.array(X_other_features)
# Return the hard-coded feature table used to encode tags.
# Judging by the fully visible entries and by create_X_features, each entry is
# [width, category_letter, value_list, ...]; in those entries the leading
# number equals one plus the total count of listed values
# (e.g. [5, 'R', ['g'], ['p', 'c', 's']]).
# NOTE(review): several lines of this literal (category letters and entry
# openers) are missing in this copy, so the brackets do not balance here —
# restore from the repository before editing any values.
def create_feature_dictionary():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/
# new: http://nl.ijs.si/ME/V4/msd/html/
# changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
return [[21,
['g', 's'],
['p', 'c', 's'],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
[3, 'C', ['c', 's']],
[1, 'I'],
['-', 'c', 'o', 's'],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
['m', 'f', 'n'],
['s', 'd', 'p'],
['n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'],
['-', '1', '2', '3'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 's', 'd', 'p'],
['-', 'm', 'f', 'n'],
['-', 'y', 'b']],
[1, 'Q'],
[5, 'R', ['g'], ['p', 'c', 's']],
[7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']],
['n', 'u', 'p', 'r', 'f', 'c'],
['-', '1', '2', '3'],
['-', 's', 'p', 'd'],
['-', 'm', 'f', 'n'],
['-', 'n', 'y']]
# Return a larger hard-coded feature table with the same entry layout as
# create_feature_dictionary: [width, category_letter, value_list, ...].
# Compared with that table, the visible value lists here carry a leading '-'
# and some extra values (e.g. 'R' lists ['-', 'g', 'r'] instead of ['g']).
# In the fully visible entries the leading number equals one plus the total
# count of listed values (e.g. [8, 'R', ['-', 'g', 'r'], ['-', 'p', 'c', 's']]).
# NOTE(review): several lines of this literal (category letters and entry
# openers) are missing in this copy, so the brackets do not balance here —
# restore from the repository before editing any values.
def complete_feature_dict():
# old: http://nl.ijs.si/ME/Vault/V3/msd/html/
# new: http://nl.ijs.si/ME/V4/msd/html/
# changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
return [[27,
['-', 'g', 's', 'p'],
['-', 'p', 'c', 's'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
[4, 'C', ['-', 'c', 's']],
[1, 'I'],
['-', 'd', 'r', 'l'],
['-', 'c', 'o', 'p', 's'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
['-', 'c', 'p'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 'n', 'y']],
['-', 'p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'],
['-', '1', '2', '3'],
['-', 'm', 'f', 'n'],
['-', 's', 'd', 'p'],
['-', 'n', 'g', 'd', 'a', 'l', 'i'],
['-', 's', 'd', 'p'],
['-', 'm', 'f', 'n'],
['-', 'y', 'b']],
[1, 'Q'],
[8, 'R', ['-', 'g', 'r'], ['-', 'p', 'c', 's']],
[8, 'S', ['-', 'n', 'g', 'd', 'a', 'l', 'i']],
['-', 'm', 'a'],
['-', 'e', 'p', 'b'],
['-', 'n', 'u', 'p', 'r', 'f', 'c', 'm'],
['-', '1', '2', '3'],
['-', 's', 'p', 'd'],
['-', 'm', 'f', 'n'],
['-', 'n', 'y']]
def check_feature_letter_usage(X_other_features, feature_dictionary):
    """Print, for every decoded feature letter, its column total.

    Column totals are the sums of ``X_other_features`` along axis 0.  The
    letters are obtained by decoding a single all-ones row of length 164 with
    ``decode_X_features``; letter ``k`` is printed next to column total ``k``.

    :param X_other_features: 2-D array-like of encoded feature rows.
    :param feature_dictionary: table handed to ``decode_X_features``.
    """
    column_totals = np.sum(X_other_features, axis=0)
    every_slot_set = [1] * 164
    decoded_letters = list(decode_X_features(feature_dictionary, [every_slot_set]))
    for position, letter in enumerate(decoded_letters):
        print(letter + ': ' + str(column_totals[position]))
# Encode all of `content` against the table from complete_feature_dict() and
# sum the encoded rows column-wise (axis 0).
# NOTE(review): case_numbers is computed but not used or returned in the lines
# visible here — the rest of the function is presumably missing from this copy;
# confirm against the repository.
def dict_occurances_in_dataset_rate(content):
feature_dictionary = complete_feature_dict()
# Leftover single-entry debugging aids (kept disabled):
# case = 3107
# print(content[case])
# print(feature_dictionary)
# X_other_features = create_X_features([content[case]], feature_dictionary)
X_other_features = create_X_features(content, feature_dictionary)
# print(X_other_features)
# print(decode_X_features(feature_dictionary, X_other_features))
X_other_features = np.array(X_other_features)
case_numbers = np.sum(X_other_features, axis=0)
Normal file
Normal file
@ -0,0 +1,101 @@
import sys
sys.path.insert(0, '../../../')
from prepare_data import *
# Load the word list and the derived constants once at import time.
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
# create_feature_dictionary() takes no arguments (its table is hard-coded), so
# passing `content` raises TypeError.
# NOTE(review): if the content-derived table was intended here, call
# create_old_feature_dictionary(content) instead — confirm.
feature_dictionary = create_feature_dictionary()
def read_hyphenation_pattern(path='../../../hyphenation'):
    """Read the hyphenation pattern file and return its lines.

    :param path: location of the pattern file; defaults to the path the script
        has always used, relative to the working directory.
    :return: list with one string per line, without the line terminator.
    """
    with open(path) as f:
        # Strip only the newline: slicing off the last character would eat a
        # real pattern character when the final line has no trailing newline.
        return [line.rstrip('\n') for line in f]
def find_hyphenation_patterns_in_text(text, pattern):
    """Return every index at which ``pattern`` starts in ``text``.

    Overlapping occurrences are included, because the search resumes one
    character after each match rather than after the whole match.

    :param text: string to search.
    :param pattern: substring to look for.
    :return: list of start indices in ascending order; empty if none.
    """
    res = []
    index = 0
    while index < len(text):
        index = text.find(pattern, index)
        if index == -1:
            # No further occurrence: stop instead of restarting from -1.
            break
        res.append(index)
        index += 1  # advance by one so overlapping matches are found too
    return res
def create_hyphenation_dictionary(hyphenation_pattern):
    """Turn raw pattern strings into ``[letters, [[position, digit], ...]]`` entries.

    Every digit in a pattern is removed from the text and recorded together
    with the number of non-digit characters that precede it, e.g.
    ``'2z1z2'`` becomes ``['zz', [[0, 2], [1, 1], [2, 2]]]``.

    :param hyphenation_pattern: iterable of pattern strings.
    :return: list with one ``[letters, anomalies_indices]`` entry per pattern.
    """
    dictionary = []
    for el in hyphenation_pattern:
        letters = []
        anomalies_indices = []
        for let in el:
            if let.isdigit():
                # Position is counted in non-digit characters seen so far.
                anomalies_indices.append([len(letters), int(let)])
            else:
                letters.append(let)
        dictionary.append([''.join(letters), anomalies_indices])
    return dictionary
def split_hyphenated_word(split, word):
    """Cut ``word`` into parts at the positions marked odd in ``split``.

    ``word`` is expected with one boundary character on each side (the caller
    wraps it in ``'.'``); those two characters are dropped.  ``split`` holds
    one value per position of the wrapped word; its first two and last two
    values are dropped so that the remaining value ``k`` describes the gap
    after inner letter ``k``.  A part is closed after a letter when that gap's
    value is odd, or when the letter index equals the number of remaining gap
    values (the last letter for a matching ``split``).

    :param split: list of integer levels for the wrapped word.
    :param word: the wrapped word.
    :return: list of the resulting parts, in order.
    """
    gaps = split[2:-2]
    letters = list(word)[1:-1]
    res = []
    hyphenate = ''
    for loc, let in enumerate(letters):
        hyphenate += let
        if loc == len(gaps) or gaps[loc] % 2 == 1:
            # Store the finished part before starting the next one.
            res.append(hyphenate)
            hyphenate = ''
    return res
def hyphenate_word(word, hyphenation_dictionary):
    """Split ``word`` into parts using the pattern table.

    ``'è'`` is first replaced by ``'č'`` and the word is wrapped in ``'.'``
    on both sides.  For every occurrence of every pattern's letters in the
    wrapped word, each ``[position, digit]`` pair of that pattern raises the
    level at ``occurrence + position`` to ``digit`` if it is currently lower.
    The resulting levels are handed to ``split_hyphenated_word``.

    :param word: the word to hyphenate.
    :param hyphenation_dictionary: entries of the form
        ``[letters, [[position, digit], ...]]``.
    :return: whatever ``split_hyphenated_word`` returns for the wrapped word.
    """
    padded = '.' + word.replace('è', 'č') + '.'
    levels = [0] * (len(padded) + 1)
    for entry in hyphenation_dictionary:
        for start in find_hyphenation_patterns_in_text(padded, entry[0]):
            for mark in entry[1]:
                slot = start + mark[0]
                # Keep the highest level seen for each slot.
                if levels[slot] < mark[1]:
                    levels[slot] = mark[1]
    return split_hyphenated_word(levels, padded)
# Build the pattern table once from the hyphenation file.
hyphenation_pattern = read_hyphenation_pattern()
# Example of an entry's shape; note that create_hyphenation_dictionary stores
# the pairs as [position, digit] lists rather than dicts:
# ['zz', [{0:2},{1:1},{2:2}]]
hyphenation_dictionary = create_hyphenation_dictionary(hyphenation_pattern)
# Single-word trial run; the name is reassigned inside the loop below.
separated_word = hyphenate_word('izziv', hyphenation_dictionary)
# Hyphenate el[0] of every content entry and keep [word, parts] pairs.
all_words = []
i = 0
for el in content:
separated_word = hyphenate_word(el[0], hyphenation_dictionary)
all_words.append([el[0], separated_word])
# NOTE(review): the body of this every-10000th-entry check is not visible in
# this copy — presumably progress output; confirm against the repository.
if i % 10000 == 0:
i += 1
errors = []
errors2 = []
for word in all_words:
for hyphenated_part in word[1]:
num_vowels = 0
for let in list(hyphenated_part):
if let in vowels:
num_vowels += 1
if num_vowels == 0:
for let in list(hyphenated_part):
if let == 'r':
num_vowels += 1
if num_vowels != 1:
