From 524ceeb4b66bfa5dae6da9e21600e66bedac50c9 Mon Sep 17 00:00:00 2001
From: Luka
Date: Sat, 28 Apr 2018 10:52:50 +0200
Subject: [PATCH] Added tab2xml conversion

---
 sloleks_accentuation2_tab2xml.py            | 137 ++++++++++++++++++++
 theano_tutorial/logistic_regression.py      |  61 ---------
 theano_tutorial/logistic_regression_loop.py |  82 ------------
 theano_tutorial/test.py                     | 105 ---------------
 theano_tutorial/theanoTest.py               |   0
 theano_tutorial/tutorial_conditions.py      |  34 -----
 theano_tutorial/tutorial_derivates.py       |  94 --------------
 theano_tutorial/tutorial_loop.py            | 100 --------------
 8 files changed, 137 insertions(+), 476 deletions(-)
 create mode 100644 sloleks_accentuation2_tab2xml.py
 delete mode 100644 theano_tutorial/logistic_regression.py
 delete mode 100644 theano_tutorial/logistic_regression_loop.py
 delete mode 100644 theano_tutorial/test.py
 delete mode 100644 theano_tutorial/theanoTest.py
 delete mode 100644 theano_tutorial/tutorial_conditions.py
 delete mode 100644 theano_tutorial/tutorial_derivates.py
 delete mode 100644 theano_tutorial/tutorial_loop.py

diff --git a/sloleks_accentuation2_tab2xml.py b/sloleks_accentuation2_tab2xml.py
new file mode 100644
index 0000000..24e7c49
--- /dev/null
+++ b/sloleks_accentuation2_tab2xml.py
@@ -0,0 +1,137 @@
+# Words processed: 650250
+# Word index: 50023
+# Word number: 50023
+
+from lxml import etree
+import time
+from prepare_data import *
+
+# def xml_words_generator(xml_path):
+#     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
+#         words = []
+#         for child in element:
+#             if child.tag == 'WordForm':
+#                 msd = None
+#                 word = None
+#                 for wf in child:
+#                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+#                         msd = wf.attrib['val']
+#                     elif wf.tag == 'FormRepresentation':
+#                         for form_rep in wf:
+#                             if form_rep.attrib['att'] == 'zapis_oblike':
+#                                 word = form_rep.attrib['val']
+#                 # if msd is not None and word is not None:
+#                 #     pass
+#                 # else:
+#                 #     print('NOOOOO')
+#                 words.append([word, '', msd, word])
+#         yield words
+#
+#
+# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
+word_glob_num = 0
+word_limit = 1000
+iter_num = 1000
+word_index = 0
+
+# iter_index = 0
+# words = []
+#
+# lexical_entries_load_number = 0
+# lexical_entries_save_number = 0
+#
+# # INSIDE
+# # word_glob_num = 1500686
+# word_glob_num = 1550705
+#
+# # word_limit = 1500686
+# word_limit = 1550705
+#
+# iter_index = 31
+
+# done_lexical_entries = 33522
+data = Data('s', shuffle_all_inputs=False)
+accentuated_content = data._read_content('data/new_sloleks/new_sloleks.tab')
+
+start_timer = time.time()
+
+print('Copy initialization complete')
+with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
+    # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
+        # if word_glob_num >= word_limit:
+        #     myfile2.close()
+        #     myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+        #     iter_index += 1
+        #     print("Words processed: " + str(word_glob_num))
+        #
+        #     print("Word index: " + str(word_index))
+        #     print("Word number: " + str(len(words)))
+        #
+        #     # print("lexical_entries_load_number: " + str(lexical_entries_load_number))
+        #     # print("lexical_entries_save_number: " + str(lexical_entries_save_number))
+        #
+        #     end_timer = time.time()
+        #     print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+        lemma = ''
+        accentuated_word_location = ''
+        accentuated_word = ''
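+        # The annotations come from data/new_sloleks/new_sloleks.tab, whose rows
+        # are assumed (inferred from the indices used below) to hold: [0] word
+        # form, [1] lemma, [2] MSD tag, [4] accent locations, [5] accented word
+        # with a trailing newline. The list is treated as a circular buffer:
+        # matched rows are deleted, and each lookup backs up 500 entries from
+        # the last match, on the assumption that the tab file roughly follows
+        # the entry order of Sloleks_v1.2.xml.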
+        for child in element:
+            if child.tag == 'Lemma':
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':
+                        lemma = wf.attrib['val']
+            if child.tag == 'WordForm':
+                msd = None
+                word = None
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+                        msd = wf.attrib['val']
+                    elif wf.tag == 'FormRepresentation':
+                        for form_rep in wf:
+                            if form_rep.attrib['att'] == 'zapis_oblike':
+                                word = form_rep.attrib['val']
+                # if msd is not None and word is not None:
+                #     pass
+                # else:
+                #     print('NOOOOO')
+
+                # back up 500 entries from the last match to tolerate local
+                # reordering; word_index_sp marks the stop position of a full
+                # pass (that sentinel entry itself is never compared)
+                word_index = (word_index - 500) % len(accentuated_content)
+                word_index_sp = (word_index - 1) % len(accentuated_content)
+                while word_index != word_index_sp:
+                    if word == accentuated_content[word_index][0] and msd == accentuated_content[word_index][2] and \
+                            lemma == accentuated_content[word_index][1]:
+                        accentuated_word_location = accentuated_content[word_index][4]
+                        # drop the trailing newline kept from the tab file
+                        accentuated_word = accentuated_content[word_index][5][:-1]
+                        del accentuated_content[word_index]
+                        break
+                    word_index = (word_index + 1) % len(accentuated_content)
+
+                if word_index == word_index_sp:
+                    # no match after a full pass: clear the values so stale
+                    # data from the previous WordForm is not written out
+                    accentuated_word_location = ''
+                    accentuated_word = ''
+                    print('ERROR IN ' + str(word) + ' : ' + lemma + ' : ' + str(msd))
+                    # print('ERROR IN ' + word + ' : ' + accentuated_content[word_index][0] + ' OR ' + msd + ' : '
+                    #       + accentuated_content[word_index][2])
+                    # words.append([word, '', msd, word])
+
+                # both feat annotations are appended to wf, i.e. the last
+                # child of this WordForm visited above
+                new_element = etree.Element('feat')
+                new_element.attrib['att'] = 'naglasna_mesta_besede'
+                new_element.attrib['val'] = accentuated_word_location
+                wf.append(new_element)
+
+                new_element = etree.Element('feat')
+                new_element.attrib['att'] = 'naglašena_beseda'
+                new_element.attrib['val'] = accentuated_word
+                wf.append(new_element)
+                word_glob_num += 1
+                # word_index += 1
+
+        # print(etree.tostring(element, encoding="UTF-8"))
+        # myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        if word_glob_num > word_limit:
+            # NOTE: stops after the current batch of iter_num words; the entry
+            # that crosses the limit is not written
+            print('Processed ' + str(word_glob_num) + ' words')
+            end_timer = time.time()
+            print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+            word_limit += iter_num
+            break
+        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        element.clear()
diff --git a/theano_tutorial/logistic_regression.py b/theano_tutorial/logistic_regression.py
deleted file mode 100644
index 26a683d..0000000
--- a/theano_tutorial/logistic_regression.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import theano
-import theano.tensor as T
-rng = numpy.random
-
-N = 400      # training sample size
-feats = 784  # number of input variables
-
-# generate a dataset: D = (input_values, target_class)
-D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
-training_steps = 10000
-
-# Declare Theano symbolic variables
-x = T.dmatrix("x")
-y = T.dvector("y")
-
-# initialize the weight vector w randomly
-#
-# this and the following bias variable b
-# are shared so they keep their values
-# between training iterations (updates)
-w = theano.shared(rng.randn(feats), name="w")
-
-# initialize the bias term
-b = theano.shared(0., name="b")
-
-print("Initial model:")
-print(w.get_value())
-print(b.get_value())
-
-# Construct Theano expression graph
-p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))         # Probability that target = 1
-prediction = p_1 > 0.5                          # The prediction thresholded
-xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1)   # Cross-entropy loss function
-cost = xent.mean() + 0.01 * (w ** 2).sum()      # The cost to minimize
-gw, gb = T.grad(cost, [w, b])                   # Compute the gradient of the cost
-                                                # w.r.t weight vector w and
-                                                # 
bias term b - # (we shall return to this in a - # following section of this tutorial) - -# Compile -train = theano.function( - inputs=[x,y], - outputs=[prediction, xent], - updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb))) -predict = theano.function(inputs=[x], outputs=prediction) - - - -# Train -for i in range(training_steps): - pred, err = train(D[0], D[1]) - -print("Final model:") -print(w.get_value()) -print(b.get_value()) -print("target values for D:") -print(D[1]) -print("prediction on D:") -print(predict(D[0])) diff --git a/theano_tutorial/logistic_regression_loop.py b/theano_tutorial/logistic_regression_loop.py deleted file mode 100644 index e8daf9a..0000000 --- a/theano_tutorial/logistic_regression_loop.py +++ /dev/null @@ -1,82 +0,0 @@ -import numpy -import theano -import theano.tensor as T -rng = numpy.random - -N = 400 # training sample size -feats = 784 # number of input variables - -# generate a dataset: D = (input_values, target_class) -D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2)) -training_steps = 10000 - -# Declare Theano symbolic variables -x = T.dmatrix("x") -y = T.dvector("y") - -# initialize the weight vector w randomly -# -# this and the following bias variable b -# are shared so they keep their values -# between training iterations (updates) -w = theano.shared(rng.randn(feats), name="w") - -# initialize the bias term -b = theano.shared(0., name="b") - -print("Initial model:") -print(w.get_value()) -print(b.get_value()) - -# Construct Theano expression graph -p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1 -prediction = p_1 > 0.5 # The prediction thresholded -xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function -cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize -gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost - # w.r.t weight vector w and - # bias term b - # (we shall return to this in a - # following section of this tutorial) - - -def set_value_at_position(x, y, prediction, xent, w, b): - p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1 - prediction = p_1 > 0.5 # The prediction thresholded - xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1) # Cross-entropy loss function - cost = xent.mean() + 0.01 * (w ** 2).sum() # The cost to minimize - gw, gb = T.grad(cost, [w, b]) - w = w - 0.1 * gw - b = b - 0.1 * gb - return w, b - - -result, updates = theano.scan(fn=set_value_at_position, - outputs_info=[prediction, xent], - sequences=[x, y], - non_sequences=[w, b], - n_steps=training_steps) - -calculate_scan = theano.function(inputs=[x, y], outputs=[prediction, xent], updates=updates) - - -# Compile -train = theano.function( - inputs=[x,y], - outputs=[prediction, xent], - updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb))) -predict = theano.function(inputs=[x], outputs=prediction) - - - -# Train -for i in range(training_steps): - pred, err = train(D[0], D[1]) - -print("Final model:") -print(w.get_value()) -print(b.get_value()) -print("target values for D:") -print(D[1]) -print("prediction on D:") -print(predict(D[0])) diff --git a/theano_tutorial/test.py b/theano_tutorial/test.py deleted file mode 100644 index 05c11f4..0000000 --- a/theano_tutorial/test.py +++ /dev/null @@ -1,105 +0,0 @@ -import numpy as np -import theano.tensor as T -from theano import function - -# ALGEBRA -x = T.dmatrix('x') -y = T.dmatrix('y') -z = x + y -f = function([x, y], z) - -# print(f(2, 3)) -# print(numpy.allclose(f(16.3, 12.1), 28.4)) -print(f([[1, 2], [3, 4]], [[10, 20], [30, 40]])) - -# 
exercise -import theano -a = T.vector() # declare variable -b = T.vector() # declare variable -out = a ** 2 + b ** 2 + 2 * a * b # build symbolic expression -f = function([a, b], out) # compile function -print(f([1, 2], [4, 5])) - -################################################### -# OTHER EXAMPLES - -# logistic function -x = T.dmatrix('x') -logistic_eq = 1 / (1 + T.exp(-x)) -logistic = function([x], logistic_eq) -print(logistic([[0, 1], [-1, -2]])) - - -# multiple things calculation -a, b = T.dmatrices('a', 'b') -diff = a - b -abs_diff = abs(diff) -diff_squared = diff**2 -f = function([a, b], [diff, abs_diff, diff_squared]) -print(f([[1, 1], [1, 1]], [[0, 1], [2, 3]])) - - -# default value -c = T.matrix('c') -c = a + b -f = function([a, theano.In(b, value=[[1, 1], [1, 1]])], c) -print(f([[1, 1], [1, 1]])) - - -# accumulator -state = theano.shared([[0, 0], [0, 0]]) -print("accumulator") -print(state.get_value()) - - -state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32)) -print(type(np.matrix('0 0; 0 0', dtype=np.int64))) -print(type(np.matrix('0 1; 2 3', dtype=np.int64))) -inc = T.imatrix('inc') -expression = state+inc -print(type(expression)) -accumulator = function([inc], state, updates=[(state, state+inc)]) - - -accumulator(np.matrix('1 2; 3 4', dtype=np.int32)) -print(state.get_value()) -accumulator(np.matrix('1 1; 1 1', dtype=np.int32)) -print(state.get_value()) - -# function copy -print("function copy") -new_state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32)) -new_accumulator = accumulator.copy(swap={state: new_state}) -new_accumulator(np.matrix('1 2; 3 4', dtype=np.int32)) -print(new_state.get_value()) -print(state.get_value()) - -# random numbers -# POSSIBLE THAT THIS DOES NOT WORK ON GPU -print("random numbers") -srng = T.shared_randomstreams.RandomStreams(seed=234) -rv_u = srng.uniform((2, 2)) -rv_n = srng.normal((2, 2)) -f = function([], rv_u) -g = function([], rv_n, no_default_updates=True) # Not updating rv_n.rng -nearly_zeros = function([], rv_u + rv_u - 2 * rv_u) - -print(f()) -print(f()) - -print(g()) -print(g()) - -print("sharing streams between functions") -state_after_v0 = rv_u.rng.get_value().get_state() -# nearly_zeros() # this affects rv_u's generator -v1 = f() -rng = rv_u.rng.get_value(borrow=True) -rng.set_state(state_after_v0) -rv_u.rng.set_value(rng, borrow=True) -v2 = f() # v2 != v1 -v3 = f() # v3 == v1 - -print(v1) -print(v2) -print(v3) diff --git a/theano_tutorial/theanoTest.py b/theano_tutorial/theanoTest.py deleted file mode 100644 index e69de29..0000000 diff --git a/theano_tutorial/tutorial_conditions.py b/theano_tutorial/tutorial_conditions.py deleted file mode 100644 index db0e855..0000000 --- a/theano_tutorial/tutorial_conditions.py +++ /dev/null @@ -1,34 +0,0 @@ -# if: (if(smth) else) -# switch: (if(smth) elif(smth)) - -from theano import tensor as T -from theano.ifelse import ifelse -import theano, time, numpy - -a,b = T.scalars('a', 'b') -x,y = T.matrices('x', 'y') - -z_switch = T.switch(T.lt(a, b), T.mean(x), T.mean(y)) -z_lazy = ifelse(T.lt(a, b), T.mean(x), T.mean(y)) - -f_switch = theano.function([a, b, x, y], z_switch, - mode=theano.Mode(linker='vm')) -f_lazyifelse = theano.function([a, b, x, y], z_lazy, - mode=theano.Mode(linker='vm')) - -val1 = 0. -val2 = 1. 
-big_mat1 = numpy.ones((10000, 1000)) -big_mat2 = numpy.ones((10000, 1000)) - -n_times = 10 - -tic = time.clock() -for i in range(n_times): - f_switch(val1, val2, big_mat1, big_mat2) -print('time spent evaluating both values %f sec' % (time.clock() - tic)) - -tic = time.clock() -for i in range(n_times): - f_lazyifelse(val1, val2, big_mat1, big_mat2) -print('time spent evaluating one value %f sec' % (time.clock() - tic)) \ No newline at end of file diff --git a/theano_tutorial/tutorial_derivates.py b/theano_tutorial/tutorial_derivates.py deleted file mode 100644 index ed1a295..0000000 --- a/theano_tutorial/tutorial_derivates.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -import theano -import theano.tensor as T - -# normal gradient -x = T.dscalar('x') -z = T.dscalar('z') -y = x ** 3 + z ** 2 -gy = T.grad(y, [x, z]) - -f = theano.function([x, z], gy) - -# print(theano.pp(f.maker.fgraph.outputs[0])) -# print(theano.pp(f.maker.fgraph.outputs[1])) - -print(f(4, 8)) - -# logistic gradient -x = T.dmatrix('x') -l = T.sum(1 / (1 + T.exp(-x))) -gl = T.grad(l, x) - -f_lg = theano.function([x], gl) - -print(f_lg([[0, 1], [-1, -2]])) - -# np.matrix([[1, 2], [3, 4]]) - -# jacobian matrix -print('jacobian matrix1') -x = T.dvector('x') -y = x ** 2 -J, updates = theano.scan(lambda i, y, x : T.grad(y[i], x), sequences=T.arange(y.shape[0]), non_sequences=[y, x]) -f = theano.function([x], J, updates=updates) -print(f([1, 2, 3, 4, 5])) - -# already implemented jacobian matrix -# W, V = T.dmatrices('W', 'V') -J = theano.gradient.jacobian(y, x) -f2 = theano.function([x], J) -print(f2([1, 2, 3, 4, 5])) - -# jacobian matrix with matrix :) -W, V = T.dmatrices('W', 'V') -x = T.dvector('x') -y = T.dot(x, W) -J = theano.gradient.jacobian(y, W) -f2 = theano.function([W, x], J) -print(f2(np.array([[1, 1], [1, 1]]), np.array([0, 1]))) - -JV2 = T.dot(J, V) -f2 = theano.function([W, V, x], JV2) -print(f2(np.array([[1, 1], [1, 1]]), np.array([[2, 2], [2, 2]]), np.array([0, 1]))) - - -print('jacobian matrix2') -x = T.dvector('x') -z = T.dvector('z') -y = x ** 2 + z ** 2 -J, updates = theano.scan(lambda i, y, x, z: T.grad(y[i], [x, z]), sequences=T.arange(y.shape[0]), non_sequences=[y,x,z]) -f = theano.function([x, z], J, updates=updates) -test = T.arange(y.shape[0]) -t_f = theano.function([x, z], test) -print(f([4, 4], [1, 1])) -print(t_f([4, 4], [1, 1])) - -# hessian matrix -x = T.dvector('x') -y = x ** 3 -cost = y.sum() -gy = T.grad(cost, x) -H, updates = theano.scan(lambda i, gy, x : T.grad(gy[i], x), sequences=T.arange(gy.shape[0]), non_sequences=[gy, x]) -f = theano.function([x], H, updates=updates) -print(f([4, 4])) - -# jacobian times vector - -# R-operator -W = T.dmatrix('W') -V = T.dmatrix('V') -x = T.dvector('x') -y = T.dot(x, W) -JV = T.Rop(y, W, V) -f = theano.function([W, V, x], JV) -print(f([[1, 1], [1, 1]], [[2, 2], [2, 2]], [0,1])) - -# L-operator -W = T.dmatrix('W') -v = T.dvector('v') -x = T.dvector('x') -y = T.dot(x, W) -VJ = T.Lop(y, W, v) -f = theano.function([v,x], VJ) -print(f([2, 2], [0, 1])) \ No newline at end of file diff --git a/theano_tutorial/tutorial_loop.py b/theano_tutorial/tutorial_loop.py deleted file mode 100644 index 7dbebe0..0000000 --- a/theano_tutorial/tutorial_loop.py +++ /dev/null @@ -1,100 +0,0 @@ -import theano -import theano.tensor as T - -k = T.iscalar("k") -A = T.vector("A") - -# Symbolic description of the result -result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A, - outputs_info=T.ones_like(A), - non_sequences=A, - n_steps=k) - -# We only care 
about A**k, but scan has provided us with A**1 through A**k. -# Discard the values that we don't care about. Scan is smart enough to -# notice this and not waste memory saving them. -final_result = result[-1] - -# compiled function that returns A**k -power = theano.function(inputs=[A,k], outputs=final_result, updates=updates) - -print(power(range(10),2)) -print(power(range(10),4)) - -print('P2:') -import numpy - -coefficients = theano.tensor.vector("coefficients") -x = T.scalar("x") - -max_coefficients_supported = 10000 - -# Generate the components of the polynomial -components, updates = theano.scan(fn=lambda coefficient, power, prior_result, free_variable: prior_result + (coefficient * (free_variable ** power)), - outputs_info=T.zeros(1), - sequences=[coefficients, theano.tensor.arange(max_coefficients_supported)], - non_sequences=x) -# Sum them up -polynomial = components.sum() - -pol = components[-1] - -# Compile a function -calculate_polynomial = theano.function(inputs=[coefficients, x], outputs=components) - -# Test -test_coefficients = numpy.asarray([1, 0, 2], dtype=numpy.float32) -test_value = 3 -print(calculate_polynomial(test_coefficients, test_value)) -print(1.0 * (3 ** 0) + 0.0 * (3 ** 1) + 2.0 * (3 ** 2)) - -print('P3:') -import numpy as np -import theano -import theano.tensor as T - -up_to = T.iscalar("up_to") - -# define a named function, rather than using lambda -def accumulate_by_adding(arange_val, prior_result): - return prior_result + arange_val -seq = T.arange(up_to) - -# An unauthorized implicit downcast from the dtype of 'seq', to that of -# 'T.as_tensor_variable(0)' which is of dtype 'int8' by default would occur -# if this instruction were to be used instead of the next one: -# outputs_info = T.as_tensor_variable(0) - -outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype)) -scan_result, scan_updates = theano.scan(fn=accumulate_by_adding, - outputs_info=outputs_info, - sequences=seq) -triangular_sequence = theano.function(inputs=[up_to], outputs=scan_result) - -# test -some_num = 15 -print(triangular_sequence(some_num)) -print([n * (n + 1) // 2 for n in range(some_num)]) - -print('P4:') -location = T.imatrix("location") -values = T.vector("values") -output_model = T.matrix("output_model") - -def set_value_at_position(a_location, a_value, output_model): - zeros = T.zeros_like(output_model) - zeros_subtensor = zeros[a_location[0], a_location[1]] - return T.set_subtensor(zeros_subtensor, a_value) - -result, updates = theano.scan(fn=set_value_at_position, - outputs_info=None, - sequences=[location, values], - non_sequences=output_model) - -assign_values_at_positions = theano.function(inputs=[location, values, output_model], outputs=result) - -# test -test_locations = numpy.asarray([[1, 1], [2, 3]], dtype=numpy.int32) -test_values = numpy.asarray([42, 50], dtype=numpy.float32) -test_output_model = numpy.zeros((5, 5), dtype=numpy.float32) -print(assign_values_at_positions(test_locations, test_values, test_output_model)) \ No newline at end of file
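-- 
A quick sanity check for the conversion output (a sketch, not part of the
patch; it assumes the output path and the 'naglasna_mesta_besede' attribute
used in the script above). Because final_sloleks.xml is opened in append mode
and written entry by entry, it may not parse as one well-formed XML document,
so a plain line scan is the safest count; pretty_print=True puts each <feat>
element on its own line:

    added = 0
    with open('data/new_sloleks/final_sloleks.xml', 'rb') as f:
        for line in f:
            # one hit per accent-location annotation written by the script
            if b'naglasna_mesta_besede' in line:
                added += 1
    print('accent annotations written: %d' % added)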