diff --git a/.gitignore b/.gitignore index 72364f9..8fc9c79 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,7 @@ ENV/ # Rope project settings .ropeproject + +# Custom +data/ +character_based_ffnn/internal_representations/inputs/ diff --git a/.idea/accetuation.iml b/.idea/accetuation.iml new file mode 100644 index 0000000..e914aa8 --- /dev/null +++ b/.idea/accetuation.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..97626ba --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..cc28c80 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,22 @@ + + + + + $USER_HOME$/.subversion + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..f3d2052 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..2f69d75 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,952 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly_zeros + rand + u + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1486720239842 + + + 1492074623429 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
# -*- coding: utf-8 -*-
# Data preparation helpers: read the tab-separated lexicon, build the character
# dictionary, one-hot encode words, derive accent-position targets and persist
# arrays to HDF5.  Written to run unchanged on Python 2.7 and Python 3.
from __future__ import print_function, unicode_literals
# text in Western (Windows 1252)

import gc
import io

import numpy as np

try:
    import h5py
except ImportError:  # keeps the pure-numpy helpers importable without HDF5 support
    h5py = None

# Lexicon file read by read_content() when no explicit path is given.
LEXICON_PATH = '../../data/SlovarIJS_BESEDE_utf8.lex'

# Dataset names written by save_model() and read back by load_model().
_MODEL_KEYS = ('W1', 'b1', 'W2', 'b2')


def _require_h5py():
    """Return the ``h5py`` module, raising ImportError if it is not installed."""
    if h5py is None:
        raise ImportError('h5py is required for reading and writing HDF5 files')
    return h5py


def _write_datasets(file_name, datasets):
    """Create (or truncate) *file_name* and store every ``name: array`` pair in it."""
    # The context manager closes the file even when create_dataset() raises.
    with _require_h5py().File(file_name, 'w') as h5f:
        for name, data in datasets.items():
            h5f.create_dataset(name, data=data)


def save_inputs(file_name, X, y):
    """Store *X* and *y* in the HDF5 file *file_name* under the keys 'X' and 'y'."""
    _write_datasets(file_name, {'X': X, 'y': y})


def create_and_save_inputs(file_name):
    """Generate the per-vowel inputs and store them under 'X', 'y' and 'X_pure'."""
    X, y, X_pure = generate_full_vowel_matrix_inputs()
    _write_datasets(file_name, {'X': X, 'y': y, 'X_pure': X_pure})


def load_inputs(file_name):
    """Return the ``(X, y)`` arrays stored in the HDF5 file *file_name*."""
    with _require_h5py().File(file_name, 'r') as h5f:
        return h5f['X'][:], h5f['y'][:]


def save_model(model, file_name):
    """Store ``model['W1']``, ``['b1']``, ``['W2']`` and ``['b2']`` in *file_name*.

    Raises KeyError (before the file is touched) if one of the entries is missing.
    """
    _write_datasets(file_name, {key: model[key] for key in _MODEL_KEYS})


def load_model(file_name):
    """Return a dict with the 'W1', 'b1', 'W2' and 'b2' arrays stored in *file_name*.

    The previous implementation called ``set_value`` on module-level names that
    were never defined (NameError) and returned an empty dict; this version is
    the inverse of save_model().
    """
    with _require_h5py().File(file_name, 'r') as h5f:
        return {key: h5f[key][:] for key in _MODEL_KEYS}


def read_content(path=LEXICON_PATH):
    """Read the lexicon at *path* and return one list of tab-separated fields per line.

    The file is decoded as UTF-8.  Lines are split on '\\n' only and keep their
    line terminator, so the last field of every record still ends with it.
    """
    print('READING CONTENT...')
    with io.open(path, encoding='utf8', newline='\n') as f:
        rows = [line.split('\t') for line in f]
    print('CONTENT READ SUCCESSFULY')
    return rows


def is_vowel(word_list, position, vowels):
    """Return True if ``word_list[position]`` counts as a vowel.

    A character counts when it is listed in *vowels*, or when it is an 'r'
    that has no vowel directly before it and none directly after it (word
    boundaries count as "no vowel").
    """
    current = word_list[position]
    if current in vowels:
        return True
    if current != 'r':
        return False
    vowel_before = position > 0 and word_list[position - 1] in vowels
    vowel_after = position + 1 < len(word_list) and word_list[position + 1] in vowels
    return not vowel_before and not vowel_after


def create_dict(content=None):
    """Scan the lexicon and collect the statistics the encoders need.

    *content* is a list of records as returned by read_content(); when omitted
    the lexicon is read from disk.  Field 0 of a record is the plain word and
    field 3 the accentuated form.

    Returns ``(dictionary, max_word, max_num_vowels, content, vowels,
    accetuated_vowels)`` where *dictionary* is the sorted list of all characters
    seen (plus the empty string), *max_word* the longest field-0/field-3 length
    and *max_num_vowels* the largest vowel count of an accentuated form plus one.
    """
    if content is None:
        content = read_content()

    print('CREATING DICTIONARY...')

    accetuated_vowels = [u'à', u'á', u'ä', u'é', u'ë', u'ì', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
    default_vowels = [u'a', u'e', u'i', u'o', u'u']
    vowels = accetuated_vowels + default_vowels

    # A set makes the "already known?" test O(1); sorting at the end yields the
    # same list the former append-if-missing loop produced.
    seen_chars = {''}
    max_word = 0
    max_num_vowels = 0
    for line, el in enumerate(content):
        try:
            accented, plain = el[3], el[0]
        except IndexError:
            # Malformed record: report it and stop scanning, as before.
            # NOTE(review): the index printed is one less than the failing
            # record's position -- kept as-is, confirm whether that is intended.
            print(line - 1)
            print(el)
            break
        accented_chars = list(accented)
        max_word = max(max_word, len(accented), len(plain))
        num_vowels = sum(1 for pos in range(len(accented_chars))
                         if is_vowel(accented_chars, pos, vowels))
        max_num_vowels = max(max_num_vowels, num_vowels)
        seen_chars.update(accented_chars)
        seen_chars.update(plain)
    dictionary = sorted(seen_chars)
    # Vowel ordinals are 1-based; value 0 is reserved for "no accent".
    max_num_vowels += 1
    print('DICTIONARY CREATION SUCCESSFUL!')
    return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels


# GENERATE X and y
def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
    """Fold the first two accent ordinals into a single class index.

    Missing ordinals count as 0 and ordinals beyond the second are ignored; the
    result is ``first + max_num_vowels * second``.  *word_list* is unused and
    only kept for signature compatibility.  The input list is no longer
    modified.
    """
    first, second = (list(accetuations_list) + [0, 0])[:2]
    return first + max_num_vowels * second


def shuffle_inputs(X, y, X_pure=None):
    """Apply one random permutation of ``range(X.shape[0])`` to all given arrays.

    Returns ``(X, y)`` when *X_pure* is omitted and ``(X, y, X_pure)`` otherwise,
    so both the two- and three-array call sites in this module work.
    """
    order = np.arange(X.shape[0])
    np.random.shuffle(order)
    if X_pure is None:
        return X[order], y[order]
    return X[order], y[order], X_pure[order]


def _char_lookup(dictionary):
    """Map every character of *dictionary* to the index of its first occurrence."""
    lookup = {}
    for idx, char in enumerate(dictionary):
        lookup.setdefault(char, idx)
    return lookup


def _one_hot_rows(word, num_rows, lookup, width):
    """Return a ``(num_rows, width)`` matrix whose row j one-hot encodes ``word[j]``.

    Characters missing from *lookup* leave their row all zero.
    """
    rows = np.zeros((num_rows, width))
    for j, char in enumerate(word):
        idx = lookup.get(char)
        if idx is not None:
            rows[j][idx] = 1
    return rows


def _accent_targets(content, vowels, accetuated_vowels, max_num_vowels):
    """Build one one-hot target row per record from its accentuated form (field 3)."""
    y = np.zeros((len(content), max_num_vowels * max_num_vowels))
    for row, el in enumerate(content):
        chars = list(el[3])
        accent_ordinals = []
        vowels_seen = 0
        for pos, char in enumerate(chars):
            if is_vowel(chars, pos, vowels):
                vowels_seen += 1
            if char in accetuated_vowels:
                # Remember which vowel (1-based) carries the accent.
                accent_ordinals.append(vowels_seen)
        y[row][generate_presentable_y(accent_ordinals, chars, max_num_vowels)] = 1
    return y


def generate_inputs(content=None):
    """Return shuffled ``(X, y)`` with every word flattened to one one-hot vector.

    Row layout of X: slot ``j * len(dictionary) + index`` is 1 when character j
    of the word is ``dictionary[index]``.  (The former stride ``max_word`` made
    different (position, character) pairs share a slot.)
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict(content)

    print('GENERATING X AND y...')
    lookup = _char_lookup(dictionary)
    width = len(dictionary)
    X = np.zeros((len(content), max_word * width))
    for row, el in enumerate(content):
        X[row] = _one_hot_rows(el[0], max_word, lookup, width).ravel()
    y = _accent_targets(content, vowels, accetuated_vowels, max_num_vowels)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y


def generate_matrix_inputs(content=None):
    """Return shuffled ``(X, y)`` where X holds one unpadded matrix per word.

    X is a 1-D object array; element i has shape ``(len(word_i), len(dictionary))``.
    It is built explicitly because ``np.array`` on ragged nested lists is not
    reliable across numpy versions.
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict(content)

    print('GENERATING X AND y...')
    lookup = _char_lookup(dictionary)
    width = len(dictionary)
    X = np.empty(len(content), dtype=object)
    for row, el in enumerate(content):
        X[row] = _one_hot_rows(el[0], len(el[0]), lookup, width)
    y = _accent_targets(content, vowels, accetuated_vowels, max_num_vowels)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y


def generate_full_matrix_inputs(content=None):
    """Return shuffled ``(X, y)`` with X of shape ``(words, max_word, len(dictionary))``."""
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict(content)

    print('GENERATING X AND y...')
    lookup = _char_lookup(dictionary)
    width = len(dictionary)
    X = np.zeros((len(content), max_word, width))
    for row, el in enumerate(content):
        X[row] = _one_hot_rows(el[0], max_word, lookup, width)
    y = _accent_targets(content, vowels, accetuated_vowels, max_num_vowels)
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y = shuffle_inputs(X, y)
    print('INPUTS SHUFFELED!')
    return X, y


def count_vowels(content, vowels):
    """Return the total number of vowels (per is_vowel) over field 0 of all records."""
    total = 0
    for el in content:
        chars = list(el[0])
        total += sum(1 for pos in range(len(chars)) if is_vowel(chars, pos, vowels))
    return total


def generate_full_vowel_matrix_inputs(content=None):
    """Return shuffled ``(X, y, X_pure)`` with one sample per vowel of every word.

    For each vowel of a word, X gets the word's padded one-hot matrix and X_pure
    the 0-based ordinal of that vowel within the word.

    NOTE(review): the targets used to be built per word and were then indexed
    with the per-vowel permutation, which raises IndexError as soon as there
    are more vowels than words.  Each sample now receives the target row of the
    word it came from so that X, y and X_pure stay aligned -- confirm that this
    is the intended label for a (word, vowel) sample.
    NOTE(review): the flattened source does not show whether the vowel counter
    advanced per vowel or per character; per vowel is assumed here.
    """
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict(content)
    gc.collect()
    print('GENERATING X AND y...')
    lookup = _char_lookup(dictionary)
    width = len(dictionary)
    word_targets = _accent_targets(content, vowels, accetuated_vowels, max_num_vowels)

    X = []
    X_pure = []
    owners = []  # index of the record each sample was generated from
    for row, el in enumerate(content):
        plain_chars = list(el[0])
        X_el = _one_hot_rows(plain_chars, max_word, lookup, width)
        vowel_i = 0
        for pos in range(len(plain_chars)):
            if is_vowel(plain_chars, pos, vowels):
                X.append(X_el)
                X_pure.append(vowel_i)
                owners.append(row)
                vowel_i += 1

    X = np.array(X)
    X_pure = np.array(X_pure)
    y = word_targets[np.array(owners, dtype=int)]
    print('GENERATION SUCCESSFUL!')
    print('SHUFFELING INPUTS...')
    X, y, X_pure = shuffle_inputs(X, y, X_pure)
    print('INPUTS SHUFFELED!')
    return X, y, X_pure


def decode_position(y, max_num_vowels):
    """Split the index of the largest positive entry of *y* into two ordinals.

    Returns ``[index % max_num_vowels, index // max_num_vowels]``; the first
    maximum wins on ties.  When no entry is greater than 0 the index is -1.
    """
    best_value = 0
    pos = -1
    for idx, value in enumerate(y):
        if value > best_value:
            best_value = value
            pos = idx
    # Floor division keeps the Python 2 integer result on Python 3 as well.
    return [pos % max_num_vowels, pos // max_num_vowels]


def decode_position_from_number(y, max_num_vowels):
    """Return ``[y % max_num_vowels, y // max_num_vowels]`` for a class index *y*."""
    return [y % max_num_vowels, y // max_num_vowels]


def generate_input_from_word(word, max_word, dictionary):
    """One-hot encode *word* into a flat vector of ``max_word * len(dictionary)`` slots.

    Slot ``j * len(dictionary) + index`` is 1 when character j of the word is
    ``dictionary[index]``; characters missing from *dictionary* leave their
    slots at 0.  Raises ValueError when the word has more than *max_word*
    characters, because such a word cannot be represented.
    """
    if len(word) > max_word:
        raise ValueError('word has %d characters but max_word is %d' % (len(word), max_word))
    return _one_hot_rows(word, max_word, _char_lookup(dictionary), len(dictionary)).ravel()
# ---------------------------------------------------------------------------
# theano_tutorial/logistic_regression_loop.py
#
# Logistic regression on random data, trained once per Python-level call of a
# compiled update function; additionally builds (but does not run) a variant
# in which the whole training loop lives inside theano.scan.
# ---------------------------------------------------------------------------
import numpy
import theano
import theano.tensor as T
rng = numpy.random

N = 400                                   # training sample size
feats = 784                               # number of input variables

# generate a dataset: D = (input_values, target_class)
D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
training_steps = 10000

# Declare Theano symbolic variables
x = T.dmatrix("x")
y = T.dvector("y")

# initialize the weight vector w randomly
#
# this and the following bias variable b
# are shared so they keep their values
# between training iterations (updates)
w = theano.shared(rng.randn(feats), name="w")

# initialize the bias term
b = theano.shared(0., name="b")

print("Initial model:")
print(w.get_value())
print(b.get_value())

# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))          # Probability that target = 1
prediction = p_1 > 0.5                           # The prediction thresholded
xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)  # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum()       # The cost to minimize
gw, gb = T.grad(cost, [w, b])                    # Gradient of the cost w.r.t.
                                                 # weight vector w and bias b


def set_value_at_position(x, y, prediction, xent, w, b):
    """Return ``(w, b)`` after one gradient-descent step (learning rate 0.1).

    The step minimises the same regularised cross-entropy cost as the
    module-level graph, evaluated on the inputs *x* and targets *y*.
    *prediction* and *xent* are accepted only to keep the signature stable;
    their values are not read (the original body overwrote both immediately).
    """
    step_p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))
    step_xent = -y * T.log(step_p_1) - (1 - y) * T.log(1 - step_p_1)
    step_cost = step_xent.mean() + 0.01 * (w ** 2).sum()
    step_gw, step_gb = T.grad(step_cost, [w, b])
    return w - 0.1 * step_gw, b - 0.1 * step_gb


def _scan_step(w_prev, b_prev, x_all, y_all):
    """Adapt scan's argument order (recurrent state first, then non-sequences)."""
    return set_value_at_position(x_all, y_all, None, None, w_prev, b_prev)


# The recurrent state of the loop is the model (w, b); the data set is passed
# unchanged to every step.  The previous call used `prediction`/`xent` as the
# state while the step returned (w, b) -- mismatching dtype and rank -- and
# iterated over the 400 data rows for 10000 steps.
# NOTE(review): written against the theano.scan documentation but not executed
# here -- run once before relying on it.
result, updates = theano.scan(fn=_scan_step,
                              outputs_info=[w, b],
                              non_sequences=[x, y],
                              n_steps=training_steps)

# Returns the weights and bias after the last scan step; the shared variables
# w and b themselves are left untouched by this function.
calculate_scan = theano.function(inputs=[x, y],
                                 outputs=[result[0][-1], result[1][-1]],
                                 updates=updates)


# Compile
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)


# Train
for i in range(training_steps):
    pred, err = train(D[0], D[1])

print("Final model:")
print(w.get_value())
print(b.get_value())
print("target values for D:")
print(D[1])
print("prediction on D:")
print(predict(D[0]))


# ---------------------------------------------------------------------------
# theano_tutorial/test.py
#
# Scratch script walking through basic Theano features: symbolic algebra,
# multiple outputs, default inputs, shared state, function copies and random
# streams.  Every section prints its result.
# ---------------------------------------------------------------------------
import numpy as np
import theano
import theano.tensor as T
from theano import function

# ALGEBRA
x = T.dmatrix('x')
y = T.dmatrix('y')
z = x + y
f = function([x, y], z)

# print(f(2, 3))
# print(numpy.allclose(f(16.3, 12.1), 28.4))
print(f([[1, 2], [3, 4]], [[10, 20], [30, 40]]))

# exercise
a = T.vector()                          # declare variable
b = T.vector()                          # declare variable
out = a ** 2 + b ** 2 + 2 * a * b       # build symbolic expression
f = function([a, b], out)               # compile function
print(f([1, 2], [4, 5]))

###################################################
# OTHER EXAMPLES

# logistic function
x = T.dmatrix('x')
logistic_eq = 1 / (1 + T.exp(-x))
logistic = function([x], logistic_eq)
print(logistic([[0, 1], [-1, -2]]))


# multiple things calculation
a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(diff)
diff_squared = diff**2
f = function([a, b], [diff, abs_diff, diff_squared])
print(f([[1, 1], [1, 1]], [[0, 1], [2, 3]]))


# default value
# (the former `c = T.matrix('c')` was rebound on the next line and never used)
c = a + b
f = function([a, theano.In(b, value=[[1, 1], [1, 1]])], c)
print(f([[1, 1], [1, 1]]))


# accumulator
state = theano.shared([[0, 0], [0, 0]])
print("accumulator")
print(state.get_value())


state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32))
print(type(np.matrix('0 0; 0 0', dtype=np.int64)))
print(type(np.matrix('0 1; 2 3', dtype=np.int64)))
inc = T.imatrix('inc')
expression = state+inc
print(type(expression))
accumulator = function([inc], state, updates=[(state, state+inc)])


accumulator(np.matrix('1 2; 3 4', dtype=np.int32))
print(state.get_value())
accumulator(np.matrix('1 1; 1 1', dtype=np.int32))
print(state.get_value())

# function copy
print("function copy")
new_state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32))
new_accumulator = accumulator.copy(swap={state: new_state})
new_accumulator(np.matrix('1 2; 3 4', dtype=np.int32))
print(new_state.get_value())
print(state.get_value())

# random numbers
# POSSIBLE THAT THIS DOES NOT WORK ON GPU
print("random numbers")
srng = T.shared_randomstreams.RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)    # Not updating rv_n.rng
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

print(f())
print(f())

print(g())
print(g())

print("sharing streams between functions")
state_after_v0 = rv_u.rng.get_value().get_state()
# nearly_zeros()       # this affects rv_u's generator
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
# With nearly_zeros() disabled above, the generator is rewound to the state it
# had right before v1 was drawn, so the first draw after the reset repeats v1.
# NOTE(review): the earlier comments (v2 != v1, v3 == v1) only hold when
# nearly_zeros() is called before v1 -- confirm by running.
v2 = f()             # expected: v2 == v1
v3 = f()             # expected: v3 != v1

print(v1)
print(v2)
print(v3)