stress_asignment/prepare_data.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# text in Western (Windows 1252)

import numpy as np
import h5py
import gc

def save_inputs(file_name, X, y):
    """Write X and y to an HDF5 file as the datasets 'X' and 'y'.

    The file is opened in 'w' mode, so an existing file is truncated.

    Args:
        file_name: path of the HDF5 file to create.
        X: array-like stored under the key 'X'.
        y: array-like stored under the key 'y'.
    """
    # The context manager closes the file even if a dataset write fails.
    with h5py.File(file_name, 'w') as h5f:
        h5f.create_dataset('X', data=X)
        h5f.create_dataset('y', data=y)

def create_and_save_inputs(file_name):
    """Generate the per-vowel inputs and store them in an HDF5 file.

    Calls generate_full_vowel_matrix_inputs() and writes its three results
    as the datasets 'X', 'y' and 'X_pure'. The file is opened in 'w' mode,
    so an existing file is truncated.

    Args:
        file_name: path of the HDF5 file to create.
    """
    X, y, X_pure = generate_full_vowel_matrix_inputs()
    # The context manager closes the file even if a dataset write fails.
    with h5py.File(file_name, 'w') as h5f:
        h5f.create_dataset('X', data=X)
        h5f.create_dataset('y', data=y)
        h5f.create_dataset('X_pure', data=X_pure)

def load_inputs(file_name):
    """Read the datasets 'X' and 'y' from an HDF5 file.

    Args:
        file_name: path of an HDF5 file containing 'X' and 'y' datasets.

    Returns:
        Tuple (X, y); both datasets are read completely into memory before
        the file is closed.
    """
    # The context manager closes the file even if a dataset is missing.
    with h5py.File(file_name, 'r') as h5f:
        X = h5f['X'][:]
        y = h5f['y'][:]
    return X, y

def save_model(model, file_name):
    """Store the model parameters W1, b1, W2 and b2 in an HDF5 file.

    The file is opened in 'w' mode, so an existing file is truncated.

    Args:
        model: mapping that provides the keys 'W1', 'b1', 'W2' and 'b2'.
        file_name: path of the HDF5 file to create.

    Raises:
        KeyError: if one of the four keys is missing from model.
    """
    # The context manager closes the file even if a lookup or write fails.
    with h5py.File(file_name, 'w') as h5f:
        for key in ('W1', 'b1', 'W2', 'b2'):
            h5f.create_dataset(key, data=model[key])

def load_model(file_name):
    """Read the model parameters W1, b1, W2 and b2 from an HDF5 file.

    The previous body called set_value() on the names W1, b1, W2 and b2,
    which are not defined in this module, and returned an empty dict. The
    parameters are now returned in a dict with the same keys that
    save_model() writes.

    Args:
        file_name: path of an HDF5 file holding the four datasets.

    Returns:
        Dict mapping 'W1', 'b1', 'W2' and 'b2' to arrays read fully into
        memory.
    """
    model = {}
    # The context manager closes the file even if a dataset is missing.
    with h5py.File(file_name, 'r') as h5f:
        for key in ('W1', 'b1', 'W2', 'b2'):
            model[key] = h5f[key][:]
    return model

def read_content():
    """Load the lexicon file and split every line into tab-separated fields.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of fields obtained by decoding the line as UTF-8 and splitting it
        on tab characters. Line endings are not stripped.
    """
    print('READING CONTENT...')
    with open('../../data/SlovarIJS_BESEDE_utf8.lex') as lexicon:
        raw_lines = lexicon.readlines()
    print('CONTENT READ SUCCESSFULY')

    rows = []
    for raw_line in raw_lines:
        rows.append(raw_line.decode('utf8').split('\t'))
    return rows


def is_vowel(word_list, position, vowels):
    """Tell whether the character at position counts as a vowel.

    A character counts as a vowel when it is listed in vowels, or when it
    is an 'r' that has no vowel directly before it and no vowel directly
    after it (word boundaries count as "no vowel").

    Args:
        word_list: indexable sequence of characters (list or string).
        position: index of the character to test.
        vowels: collection of characters treated as vowels.

    Returns:
        True if the character counts as a vowel, otherwise False.
    """
    current = word_list[position]
    if current in vowels:
        return True
    if current != u'r':
        return False

    vowel_before = position >= 1 and word_list[position - 1] in vowels
    vowel_after = (position + 1 < len(word_list)
                   and word_list[position + 1] in vowels)
    return not vowel_before and not vowel_after

def create_dict():
    """Read the lexicon and derive the alphabet and size limits used for encoding.

    Every entry returned by read_content() is scanned: field 0 and field 3
    both contribute their characters to the alphabet and their lengths to
    max_word, and field 3 is used to count vowels (see is_vowel).

    If an entry has fewer than four fields, the scan stops at that entry
    after printing diagnostics; entries after it are not scanned, but the
    returned content still contains every line.

    Returns:
        Tuple (dictionary, max_word, max_num_vowels, content, vowels,
        accetuated_vowels) where
        - dictionary is the sorted list of distinct characters seen,
          including the empty string,
        - max_word is the longest length of field 0 or field 3,
        - max_num_vowels is the largest vowel count of a field 3, plus one,
        - content is the list returned by read_content(),
        - vowels is the accentuated vowels followed by the plain vowels,
        - accetuated_vowels is the list of accentuated vowels.
    """
    content = read_content()

    print('CREATING DICTIONARY...')

    accetuated_vowels = [u'à', u'á', u'ä', u'é', u'ë', u'ì', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
    default_vowels = [u'a', u'e', u'i', u'o', u'u']
    vowels = accetuated_vowels + default_vowels

    # A set gives O(1) membership; the empty string stands for "no character".
    symbols = {''}
    max_word = 0
    max_num_vowels = 0
    for line, el in enumerate(content):
        try:
            accented, plain = el[3], el[0]
        except IndexError:
            # Malformed entry: report it (same output as before) and stop.
            print(line - 1)
            print(el)
            break

        max_word = max(max_word, len(accented), len(plain))
        num_vowels = sum(1 for pos in range(len(accented))
                         if is_vowel(accented, pos, vowels))
        max_num_vowels = max(max_num_vowels, num_vowels)
        symbols.update(accented)
        symbols.update(plain)

    dictionary = sorted(symbols)
    # One extra slot so that position 0 can mean "no accentuated vowel".
    max_num_vowels += 1
    print('DICTIONARY CREATION SUCCESSFUL!')
    return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels


# GENERATE X and y
def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
    """Fold up to two accent positions into a single class index.

    Only the first two positions are used; missing ones count as 0. The
    result is first + max_num_vowels * second.

    Args:
        accetuations_list: list of accent positions. A list shorter than
            two elements is padded with zeros in place.
        word_list: unused.
        max_num_vowels: multiplier applied to the second position.

    Returns:
        The combined index as a NumPy integer scalar.
    """
    missing = 2 - len(accetuations_list)
    if missing > 0:
        # Pad the caller's list itself, as callers may observe the padding.
        accetuations_list.extend([0] * missing)
    first_two = np.array(accetuations_list[:2])
    return first_two[0] + max_num_vowels * first_two[1]

def shuffle_inputs(X, y, X_pure=None):
    """Shuffle the rows of the given arrays with one shared random permutation.

    The permutation covers X.shape[0] indices and is drawn from NumPy's
    global random state; the same permutation indexes every array, so rows
    that belonged together before the call still belong together.

    Args:
        X: array whose first dimension determines the permutation length.
        y: array indexed with the same permutation.
        X_pure: optional third array indexed with the same permutation.
            It defaults to None so that callers with only X and y can use
            this helper too.

    Returns:
        (X, y) when X_pure is None, otherwise (X, y, X_pure).
    """
    s = np.arange(X.shape[0])
    np.random.shuffle(s)
    if X_pure is None:
        return X[s], y[s]
    return X[s], y[s], X_pure[s]

def generate_inputs():
    """Build flat one-hot inputs and one-hot stress labels for every lexicon entry.

    For each entry of the lexicon, field 0 is encoded into one flat row of
    X and the accentuated vowels of field 3 determine the label: the 1-based
    vowel ordinals of the accentuated characters are combined by
    generate_presentable_y() into a single index, and that cell of the
    entry's y row is set to 1. Rows are shuffled before returning.

    Returns:
        Tuple (X, y) where X has shape (entries, max_word * len(dictionary))
        and y has shape (entries, max_num_vowels ** 2).
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    # create_dict() returns a sorted, duplicate-free alphabet, so a dict
    # lookup yields the same index as scanning the list.
    char_index = {ch: k for k, ch in enumerate(dictionary)}

    print('GENERATING X AND y...')
    X = np.zeros((len(content), max_word*len(dictionary)))
    y = np.zeros((len(content), max_num_vowels * max_num_vowels))

    for i, el in enumerate(content):
        for j, c in enumerate(el[0]):
            index = char_index.get(c)
            if index is not None:
                # NOTE(review): the row holds max_word * len(dictionary)
                # cells, yet the per-position stride is max_word rather than
                # len(dictionary). Kept unchanged because
                # generate_input_from_word() uses the same formula - confirm
                # the intended layout.
                X[i][index + j * max_word] = 1

        accented = el[3]
        word_accetuations = []
        num_vowels = 0
        for pos, c in enumerate(accented):
            if is_vowel(accented, pos, vowels):
                num_vowels += 1
            if c in accetuated_vowels:
                # Record which vowel (1-based) carries the accent.
                word_accetuations.append(num_vowels)
        y[i][generate_presentable_y(word_accetuations, list(accented), max_num_vowels)] = 1

    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    # One shared permutation keeps the rows of X and y aligned.
    order = np.random.permutation(X.shape[0])
    X = X[order]
    y = y[order]
    print('INPUTS SHUFFELED!')
    return X, y


def generate_matrix_inputs():
    """Build per-word lists of one-hot character vectors and one-hot stress labels.

    For each entry of the lexicon, every character of field 0 becomes a
    vector of length len(dictionary) with a single 1 (all zeros if the
    character is not in the dictionary); words are not padded. The label
    row is built from the accentuated vowels of field 3 via
    generate_presentable_y(). Rows are shuffled before returning.

    Returns:
        Tuple (X, y) where X is np.array() of the per-word vector lists and
        y has shape (entries, max_num_vowels ** 2).
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    # create_dict() returns a sorted, duplicate-free alphabet, so a dict
    # lookup yields the same index as scanning the list.
    char_index = {ch: k for k, ch in enumerate(dictionary)}

    print('GENERATING X AND y...')
    y = np.zeros((len(content), max_num_vowels * max_num_vowels))
    X = []

    for i, el in enumerate(content):
        word = []
        for c in el[0]:
            character = np.zeros(len(dictionary))
            index = char_index.get(c)
            if index is not None:
                character[index] = 1
            word.append(character)
        X.append(word)

        accented = el[3]
        word_accetuations = []
        num_vowels = 0
        for pos, c in enumerate(accented):
            if is_vowel(accented, pos, vowels):
                num_vowels += 1
            if c in accetuated_vowels:
                # Record which vowel (1-based) carries the accent.
                word_accetuations.append(num_vowels)
        y[i][generate_presentable_y(word_accetuations, list(accented), max_num_vowels)] = 1

    # NOTE(review): words differ in length, so this relies on NumPy accepting
    # ragged input; recent NumPy releases reject that unless dtype=object is
    # given - confirm against the NumPy version in use.
    X = np.array(X)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    # One shared permutation keeps the rows of X and y aligned.
    order = np.random.permutation(X.shape[0])
    X = X[order]
    y = y[order]
    print('INPUTS SHUFFELED!')
    return X, y


def generate_full_matrix_inputs():
    """Build padded one-hot character matrices and one-hot stress labels.

    For each entry of the lexicon, field 0 is encoded into a
    (max_word, len(dictionary)) matrix with one 1 per character; positions
    past the end of the word stay all-zero. The label row is built from the
    accentuated vowels of field 3 via generate_presentable_y(). Rows are
    shuffled before returning.

    Returns:
        Tuple (X, y) where X has shape (entries, max_word, len(dictionary))
        and y has shape (entries, max_num_vowels ** 2).
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    # create_dict() returns a sorted, duplicate-free alphabet, so a dict
    # lookup yields the same index as scanning the list.
    char_index = {ch: k for k, ch in enumerate(dictionary)}

    print('GENERATING X AND y...')
    y = np.zeros((len(content), max_num_vowels * max_num_vowels))
    X = np.zeros((len(content), max_word, len(dictionary)))

    for i, el in enumerate(content):
        for j, c in enumerate(el[0]):
            index = char_index.get(c)
            if index is not None:
                X[i][j][index] = 1

        accented = el[3]
        word_accetuations = []
        num_vowels = 0
        for pos, c in enumerate(accented):
            if is_vowel(accented, pos, vowels):
                num_vowels += 1
            if c in accetuated_vowels:
                # Record which vowel (1-based) carries the accent.
                word_accetuations.append(num_vowels)
        y[i][generate_presentable_y(word_accetuations, list(accented), max_num_vowels)] = 1

    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    # One shared permutation keeps the rows of X and y aligned.
    order = np.random.permutation(X.shape[0])
    X = X[order]
    y = y[order]
    print('INPUTS SHUFFELED!')
    return X, y

def count_vowels(content, vowels):
    """Count the vowels in field 0 of every entry.

    Args:
        content: iterable of entries; only element 0 of each entry is read.
        vowels: collection of characters treated as vowels by is_vowel().

    Returns:
        Total number of positions, over all entries, for which is_vowel()
        is true.
    """
    total = 0
    for entry in content:
        letters = list(entry[0])
        total += sum(1 for pos in range(len(letters))
                     if is_vowel(letters, pos, vowels))
    return total


def generate_full_vowel_matrix_inputs():
    """Build one padded one-hot word matrix per vowel, plus labels and vowel ordinals.

    For each entry of the lexicon, field 0 is encoded into a
    (max_word, len(dictionary)) one-hot matrix. That matrix is appended to X
    once for every vowel found in field 0, and the 0-based ordinal of that
    vowel within the word is appended to X_pure. The label row of y is built
    per entry from the accentuated vowels of field 3 via
    generate_presentable_y(). All three arrays are passed through
    shuffle_inputs() before returning.

    NOTE(review): y gets one row per lexicon entry while X and X_pure get
    one row per vowel, and shuffle_inputs() indexes all three with the same
    permutation of X.shape[0]. The row counts therefore differ whenever the
    total number of vowels differs from the number of entries - confirm how
    y is meant to line up with X.

    Returns:
        Tuple (X, y, X_pure) as produced by shuffle_inputs().
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    gc.collect()
    # create_dict() returns a sorted, duplicate-free alphabet, so a dict
    # lookup yields the same index as scanning the list.
    char_index = {ch: k for k, ch in enumerate(dictionary)}

    print('GENERATING X AND y...')
    y = np.zeros((len(content), max_num_vowels * max_num_vowels))
    X_pure = []
    X = []

    for i, el in enumerate(content):
        plain = el[0]
        X_el = np.zeros((max_word, len(dictionary)))
        for j, c in enumerate(plain):
            index = char_index.get(c)
            if index is not None:
                X_el[j][index] = 1

        # One sample per vowel of the word; X_pure records which vowel it is.
        vowel_i = 0
        for m in range(len(plain)):
            if is_vowel(plain, m, vowels):
                X.append(X_el)
                X_pure.append(vowel_i)
                vowel_i += 1

        accented = el[3]
        word_accetuations = []
        num_vowels = 0
        for pos, c in enumerate(accented):
            if is_vowel(accented, pos, vowels):
                num_vowels += 1
            if c in accetuated_vowels:
                # Record which vowel (1-based) carries the accent.
                word_accetuations.append(num_vowels)
        y[i][generate_presentable_y(word_accetuations, list(accented), max_num_vowels)] = 1

    X = np.array(X)
    X_pure = np.array(X_pure)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y, X_pure = shuffle_inputs(X, y, X_pure)
    print('INPUTS SHUFFELED!')
    return X, y, X_pure


def decode_position(y, max_num_vowels):
    """Decode a score vector into the pair of accent positions it encodes.

    The index of the first largest element of y that is greater than 0 is
    split into [index % max_num_vowels, index // max_num_vowels], which
    inverts the first + max_num_vowels * second encoding. If no element is
    greater than 0, the index -1 is decoded instead.

    Args:
        y: iterable of scores.
        max_num_vowels: base used to split the index.

    Returns:
        Two-element list [first_position, second_position] of integers.
    """
    best_value = 0
    best_index = -1
    for index, value in enumerate(y):
        if value > best_value:
            best_value = value
            best_index = index
    # Floor division keeps the second position an integer even where "/"
    # means true division (Python 3 or "from __future__ import division").
    return [best_index % max_num_vowels, best_index // max_num_vowels]


def decode_position_from_number(y, max_num_vowels):
    """Split a class index into the pair of accent positions it encodes.

    Inverts the first + max_num_vowels * second encoding.

    Args:
        y: class index; assumed to be an integer (TODO confirm with callers).
        max_num_vowels: base used to split the index.

    Returns:
        Two-element list [y % max_num_vowels, y // max_num_vowels].
    """
    # Floor division keeps the second position an integer even where "/"
    # means true division (Python 3 or "from __future__ import division").
    return [y % max_num_vowels, y // max_num_vowels]


def generate_input_from_word(word, max_word, dictionary):
    """Encode a single word as a flat vector of zeros and ones.

    For the character at position j whose first occurrence in dictionary is
    at index k, the cell k + j * max_word is set to 1. Characters that are
    not in dictionary leave the vector untouched.

    NOTE(review): the stride between positions is max_word, not
    len(dictionary); this mirrors generate_inputs() - confirm the intended
    layout before changing either.

    Args:
        word: the word to encode.
        max_word: stride between consecutive character positions.
        dictionary: list of known characters.

    Returns:
        NumPy array of length max_word * len(dictionary).
    """
    encoded = np.zeros(max_word * len(dictionary))
    for slot, symbol in enumerate(word):
        if symbol in dictionary:
            encoded[dictionary.index(symbol) + slot * max_word] = 1
    return encoded