Added tab2xml conversion

2018-04-28 10:52:50 +02:00 · 2018-04-28 10:52:50 +02:00 · 524ceeb4b6
commit 524ceeb4b6
parent 1686f5cc6f
8 changed files with 137 additions and 476 deletions
--- a/sloleks_accentuation2_tab2xml.py
+++ b/sloleks_accentuation2_tab2xml.py
@@ -0,0 +1,137 @@
+# Words processed: 650250
+# Word index: 50023
+# Word number: 50023
+
+from lxml import etree
+import time
+from prepare_data import *
+
+# def xml_words_generator(xml_path):
+#     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
+#         words = []
+#         for child in element:
+#             if child.tag == 'WordForm':
+#                 msd = None
+#                 word = None
+#                 for wf in child:
+#                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+#                         msd = wf.attrib['val']
+#                     elif wf.tag == 'FormRepresentation':
+#                         for form_rep in wf:
+#                             if form_rep.attrib['att'] == 'zapis_oblike':
+#                                 word = form_rep.attrib['val']
+#                         #if msd is not None and word is not None:
+#                         #    pass
+#                         #else:
+#                         #    print('NOOOOO')
+#                         words.append([word, '', msd, word])
+#         yield words
+#
+#
+# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
+word_glob_num = 0  # number of FormRepresentation elements annotated so far
+word_limit = 1000  # once word_glob_num exceeds this, progress is printed and the loop stops
+iter_num = 1000  # amount added to word_limit when the limit is reached
+word_index = 0  # cursor into accentuated_content; each lookup starts 500 rows before it
+
+# iter_index = 0
+# words = []
+#
+# lexical_entries_load_number = 0
+# lexical_entries_save_number = 0
+#
+# # INSIDE
+# # word_glob_num = 1500686
+# word_glob_num = 1550705
+#
+# # word_limit = 1500686
+# word_limit = 1550705
+#
+# iter_index = 31
+
+# done_lexical_entries = 33522
+# Rows of the .tab file are used below as: [0] word form, [1] lemma, [2] MSD,
+# [4] accent locations, [5] accentuated form (its last character is dropped,
+# presumably a trailing newline -- TODO confirm against Data._read_content).
+data = Data('s', shuffle_all_inputs=False)
+accentuated_content = data._read_content('data/new_sloleks/new_sloleks.tab')
+
+start_timer = time.time()
+
+
+def pop_accentuation(rows, start, word, lemma, msd):
+    """Remove the row matching (word, lemma, msd) and return its accent data.
+
+    The rows are scanned circularly from ``start`` (any integer, wrapped
+    modulo the table length) so that every row is examined exactly once.
+    Returns ``(index, locations, accentuated)`` for the first match, where
+    ``index`` is the position the removed row occupied, or ``None`` when
+    ``rows`` is empty or contains no matching row.
+    """
+    total = len(rows)
+    for offset in range(total):
+        index = (start + offset) % total
+        row = rows[index]
+        if word == row[0] and lemma == row[1] and msd == row[2]:
+            del rows[index]
+            return index, row[4], row[5][:-1]
+    return None
+
+
+print('Copy initialization complete')
+# Stream the lexicon one LexicalEntry at a time, add the two accent <feat>
+# elements to every FormRepresentation and append the entry to the output.
+with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
+    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
+        lemma = ''
+        for child in element:
+            if child.tag == 'Lemma':
+                for wf in child:
+                    if wf.get('att') == 'zapis_oblike':
+                        lemma = wf.attrib['val']
+            if child.tag == 'WordForm':
+                msd = None
+                word = None
+                for wf in child:
+                    if wf.get('att') == 'msd':
+                        msd = wf.attrib['val']
+                    elif wf.tag == 'FormRepresentation':
+                        for form_rep in wf:
+                            if form_rep.get('att') == 'zapis_oblike':
+                                word = form_rep.attrib['val']
+
+                        # Start 500 rows before the cursor; presumably the table
+                        # roughly follows the XML order, so the match is usually
+                        # close to the previous hit -- TODO confirm.
+                        match = pop_accentuation(accentuated_content, word_index - 500, word, lemma, msd)
+                        if match is None:
+                            # Report the miss and emit empty values rather than
+                            # reusing the previous word form's accent data.
+                            print('ERROR IN {} : {} : {}'.format(word, lemma, msd))
+                            accentuated_word_location = ''
+                            accentuated_word = ''
+                        else:
+                            word_index, accentuated_word_location, accentuated_word = match
+
+                        new_element = etree.Element('feat')
+                        new_element.attrib['att'] = 'naglasna_mesta_besede'
+                        new_element.attrib['val'] = accentuated_word_location
+                        wf.append(new_element)
+
+                        new_element = etree.Element('feat')
+                        new_element.attrib['att'] = 'naglašena_beseda'
+                        new_element.attrib['val'] = accentuated_word
+                        wf.append(new_element)
+                        word_glob_num += 1
+
+        if word_glob_num > word_limit:
+            # NOTE(review): the break ends the run once the limit is crossed and
+            # the current entry is not written, so the word_limit bump below is
+            # never used -- confirm whether a full run should drop the break.
+            print('Proccessed ' + str(word_glob_num) + ' words')
+            end_timer = time.time()
+            print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+            word_limit += iter_num
+            break
+        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        element.clear()
--- a/theano_tutorial/logistic_regression.py
+++ b/theano_tutorial/logistic_regression.py
@@ -1,61 +0,0 @@
-import numpy
-import theano
-import theano.tensor as T
-rng = numpy.random
-
-N = 400                                   # training sample size
-feats = 784                               # number of input variables
-
-# generate a dataset: D = (input_values, target_class)
-D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
-training_steps = 10000
-
-# Declare Theano symbolic variables
-x = T.dmatrix("x")
-y = T.dvector("y")
-
-# initialize the weight vector w randomly
-#
-# this and the following bias variable b
-# are shared so they keep their values
-# between training iterations (updates)
-w = theano.shared(rng.randn(feats), name="w")
-
-# initialize the bias term
-b = theano.shared(0., name="b")
-
-print("Initial model:")
-print(w.get_value())
-print(b.get_value())
-
-# Construct Theano expression graph
-p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))   # Probability that target = 1
-prediction = p_1 > 0.5                    # The prediction thresholded
-xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
-cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
-gw, gb = T.grad(cost, [w, b])             # Compute the gradient of the cost
-                                          # w.r.t weight vector w and
-                                          # bias term b
-                                          # (we shall return to this in a
-                                          # following section of this tutorial)
-
-# Compile
-train = theano.function(
-          inputs=[x,y],
-          outputs=[prediction, xent],
-          updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
-predict = theano.function(inputs=[x], outputs=prediction)
-
-
-
-# Train
-for i in range(training_steps):
-    pred, err = train(D[0], D[1])
-
-print("Final model:")
-print(w.get_value())
-print(b.get_value())
-print("target values for D:")
-print(D[1])
-print("prediction on D:")
-print(predict(D[0]))
--- a/theano_tutorial/logistic_regression_loop.py
+++ b/theano_tutorial/logistic_regression_loop.py
@@ -1,82 +0,0 @@
-import numpy
-import theano
-import theano.tensor as T
-rng = numpy.random
-
-N = 400                                   # training sample size
-feats = 784                               # number of input variables
-
-# generate a dataset: D = (input_values, target_class)
-D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
-training_steps = 10000
-
-# Declare Theano symbolic variables
-x = T.dmatrix("x")
-y = T.dvector("y")
-
-# initialize the weight vector w randomly
-#
-# this and the following bias variable b
-# are shared so they keep their values
-# between training iterations (updates)
-w = theano.shared(rng.randn(feats), name="w")
-
-# initialize the bias term
-b = theano.shared(0., name="b")
-
-print("Initial model:")
-print(w.get_value())
-print(b.get_value())
-
-# Construct Theano expression graph
-p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))   # Probability that target = 1
-prediction = p_1 > 0.5                    # The prediction thresholded
-xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
-cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
-gw, gb = T.grad(cost, [w, b])             # Compute the gradient of the cost
-                                          # w.r.t weight vector w and
-                                          # bias term b
-                                          # (we shall return to this in a
-                                          # following section of this tutorial)
-
-
-def set_value_at_position(x, y, prediction, xent, w, b):
-    p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))  # Probability that target = 1
-    prediction = p_1 > 0.5  # The prediction thresholded
-    xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)  # Cross-entropy loss function
-    cost = xent.mean() + 0.01 * (w ** 2).sum()  # The cost to minimize
-    gw, gb = T.grad(cost, [w, b])
-    w = w - 0.1 * gw
-    b = b - 0.1 * gb
-    return w, b
-
-
-result, updates = theano.scan(fn=set_value_at_position,
-                              outputs_info=[prediction, xent],
-                              sequences=[x, y],
-                              non_sequences=[w, b],
-                              n_steps=training_steps)
-
-calculate_scan = theano.function(inputs=[x, y], outputs=[prediction, xent], updates=updates)
-
-
-# Compile
-train = theano.function(
-          inputs=[x,y],
-          outputs=[prediction, xent],
-          updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
-predict = theano.function(inputs=[x], outputs=prediction)
-
-
-
-# Train
-for i in range(training_steps):
-    pred, err = train(D[0], D[1])
-
-print("Final model:")
-print(w.get_value())
-print(b.get_value())
-print("target values for D:")
-print(D[1])
-print("prediction on D:")
-print(predict(D[0]))
--- a/theano_tutorial/test.py
+++ b/theano_tutorial/test.py
@@ -1,105 +0,0 @@
-import numpy as np
-import theano.tensor as T
-from theano import function
-
-# ALGEBRA
-x = T.dmatrix('x')
-y = T.dmatrix('y')
-z = x + y
-f = function([x, y], z)
-
-# print(f(2, 3))
-# print(numpy.allclose(f(16.3, 12.1), 28.4))
-print(f([[1, 2], [3, 4]], [[10, 20], [30, 40]]))
-
-# exercise
-import theano
-a = T.vector()                                  # declare variable
-b = T.vector()                                  # declare variable
-out = a ** 2 + b ** 2 + 2 * a * b               # build symbolic expression
-f = function([a, b], out)                       # compile function
-print(f([1, 2], [4, 5]))
-
-###################################################
-# OTHER EXAMPLES
-
-# logistic function
-x = T.dmatrix('x')
-logistic_eq = 1 / (1 + T.exp(-x))
-logistic = function([x], logistic_eq)
-print(logistic([[0, 1], [-1, -2]]))
-
-
-# multiple things calculation
-a, b = T.dmatrices('a', 'b')
-diff = a - b
-abs_diff = abs(diff)
-diff_squared = diff**2
-f = function([a, b], [diff, abs_diff, diff_squared])
-print(f([[1, 1], [1, 1]], [[0, 1], [2, 3]]))
-
-
-# default value
-c = T.matrix('c')
-c = a + b
-f = function([a, theano.In(b, value=[[1, 1], [1, 1]])], c)
-print(f([[1, 1], [1, 1]]))
-
-
-# accumulator
-state = theano.shared([[0, 0], [0, 0]])
-print("accumulator")
-print(state.get_value())
-
-
-state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32))
-print(type(np.matrix('0 0; 0 0', dtype=np.int64)))
-print(type(np.matrix('0 1; 2 3', dtype=np.int64)))
-inc = T.imatrix('inc')
-expression = state+inc
-print(type(expression))
-accumulator = function([inc], state, updates=[(state, state+inc)])
-
-
-accumulator(np.matrix('1 2; 3 4', dtype=np.int32))
-print(state.get_value())
-accumulator(np.matrix('1 1; 1 1', dtype=np.int32))
-print(state.get_value())
-
-# function copy
-print("function copy")
-new_state = theano.shared(np.matrix('0 0; 0 0', dtype=np.int32))
-new_accumulator = accumulator.copy(swap={state: new_state})
-new_accumulator(np.matrix('1 2; 3 4', dtype=np.int32))
-print(new_state.get_value())
-print(state.get_value())
-
-# random numbers
-# POSSIBLE THAT THIS DOES NOT WORK ON GPU
-print("random numbers")
-srng = T.shared_randomstreams.RandomStreams(seed=234)
-rv_u = srng.uniform((2, 2))
-rv_n = srng.normal((2, 2))
-f = function([], rv_u)
-g = function([], rv_n, no_default_updates=True)     # Not updating rv_n.rng
-nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)
-
-print(f())
-print(f())
-
-print(g())
-print(g())
-
-print("sharing streams between functions")
-state_after_v0 = rv_u.rng.get_value().get_state()
-# nearly_zeros()       # this affects rv_u's generator
-v1 = f()
-rng = rv_u.rng.get_value(borrow=True)
-rng.set_state(state_after_v0)
-rv_u.rng.set_value(rng, borrow=True)
-v2 = f()             # v2 != v1
-v3 = f()             # v3 == v1
-
-print(v1)
-print(v2)
-print(v3)
--- a/theano_tutorial/theanoTest.py
+++ b/theano_tutorial/theanoTest.py
--- a/theano_tutorial/tutorial_conditions.py
+++ b/theano_tutorial/tutorial_conditions.py
@@ -1,34 +0,0 @@
-# if: (if(smth) else)
-# switch: (if(smth) elif(smth))
-
-from theano import tensor as T
-from theano.ifelse import ifelse
-import theano, time, numpy
-
-a,b = T.scalars('a', 'b')
-x,y = T.matrices('x', 'y')
-
-z_switch = T.switch(T.lt(a, b), T.mean(x), T.mean(y))
-z_lazy = ifelse(T.lt(a, b), T.mean(x), T.mean(y))
-
-f_switch = theano.function([a, b, x, y], z_switch,
-                           mode=theano.Mode(linker='vm'))
-f_lazyifelse = theano.function([a, b, x, y], z_lazy,
-                               mode=theano.Mode(linker='vm'))
-
-val1 = 0.
-val2 = 1.
-big_mat1 = numpy.ones((10000, 1000))
-big_mat2 = numpy.ones((10000, 1000))
-
-n_times = 10
-
-tic = time.clock()
-for i in range(n_times):
-    f_switch(val1, val2, big_mat1, big_mat2)
-print('time spent evaluating both values %f sec' % (time.clock() - tic))
-
-tic = time.clock()
-for i in range(n_times):
-    f_lazyifelse(val1, val2, big_mat1, big_mat2)
-print('time spent evaluating one value %f sec' % (time.clock() - tic))
--- a/theano_tutorial/tutorial_derivates.py
+++ b/theano_tutorial/tutorial_derivates.py
@@ -1,94 +0,0 @@
-import numpy as np
-import theano
-import theano.tensor as T
-
-# normal gradient
-x = T.dscalar('x')
-z = T.dscalar('z')
-y = x ** 3 + z ** 2
-gy = T.grad(y, [x, z])
-
-f = theano.function([x, z], gy)
-
-# print(theano.pp(f.maker.fgraph.outputs[0]))
-# print(theano.pp(f.maker.fgraph.outputs[1]))
-
-print(f(4, 8))
-
-# logistic gradient
-x = T.dmatrix('x')
-l = T.sum(1 / (1 + T.exp(-x)))
-gl = T.grad(l, x)
-
-f_lg = theano.function([x], gl)
-
-print(f_lg([[0, 1], [-1, -2]]))
-
-# np.matrix([[1, 2], [3, 4]])
-
-# jacobian matrix
-print('jacobian matrix1')
-x = T.dvector('x')
-y = x ** 2
-J, updates = theano.scan(lambda i, y, x : T.grad(y[i], x), sequences=T.arange(y.shape[0]), non_sequences=[y, x])
-f = theano.function([x], J, updates=updates)
-print(f([1, 2, 3, 4, 5]))
-
-# already implemented jacobian matrix
-# W, V = T.dmatrices('W', 'V')
-J = theano.gradient.jacobian(y, x)
-f2 = theano.function([x], J)
-print(f2([1, 2, 3, 4, 5]))
-
-# jacobian matrix with matrix :)
-W, V = T.dmatrices('W', 'V')
-x = T.dvector('x')
-y = T.dot(x, W)
-J = theano.gradient.jacobian(y, W)
-f2 = theano.function([W, x], J)
-print(f2(np.array([[1, 1], [1, 1]]), np.array([0, 1])))
-
-JV2 = T.dot(J, V)
-f2 = theano.function([W, V, x], JV2)
-print(f2(np.array([[1, 1], [1, 1]]),  np.array([[2, 2], [2, 2]]), np.array([0, 1])))
-
-
-print('jacobian matrix2')
-x = T.dvector('x')
-z = T.dvector('z')
-y = x ** 2 + z ** 2
-J, updates = theano.scan(lambda i, y, x, z: T.grad(y[i], [x, z]), sequences=T.arange(y.shape[0]), non_sequences=[y,x,z])
-f = theano.function([x, z], J, updates=updates)
-test = T.arange(y.shape[0])
-t_f = theano.function([x, z], test)
-print(f([4, 4], [1, 1]))
-print(t_f([4, 4], [1, 1]))
-
-# hessian matrix
-x = T.dvector('x')
-y = x ** 3
-cost = y.sum()
-gy = T.grad(cost, x)
-H, updates = theano.scan(lambda i, gy, x : T.grad(gy[i], x), sequences=T.arange(gy.shape[0]), non_sequences=[gy, x])
-f = theano.function([x], H, updates=updates)
-print(f([4, 4]))
-
-# jacobian times vector
-
-# R-operator
-W = T.dmatrix('W')
-V = T.dmatrix('V')
-x = T.dvector('x')
-y = T.dot(x, W)
-JV = T.Rop(y, W, V)
-f = theano.function([W, V, x], JV)
-print(f([[1, 1], [1, 1]], [[2, 2], [2, 2]], [0,1]))
-
-# L-operator
-W = T.dmatrix('W')
-v = T.dvector('v')
-x = T.dvector('x')
-y = T.dot(x, W)
-VJ = T.Lop(y, W, v)
-f = theano.function([v,x], VJ)
-print(f([2, 2], [0, 1]))
--- a/theano_tutorial/tutorial_loop.py
+++ b/theano_tutorial/tutorial_loop.py
@@ -1,100 +0,0 @@
-import theano
-import theano.tensor as T
-
-k = T.iscalar("k")
-A = T.vector("A")
-
-# Symbolic description of the result
-result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,
-                              outputs_info=T.ones_like(A),
-                              non_sequences=A,
-                              n_steps=k)
-
-# We only care about A**k, but scan has provided us with A**1 through A**k.
-# Discard the values that we don't care about. Scan is smart enough to
-# notice this and not waste memory saving them.
-final_result = result[-1]
-
-# compiled function that returns A**k
-power = theano.function(inputs=[A,k], outputs=final_result, updates=updates)
-
-print(power(range(10),2))
-print(power(range(10),4))
-
-print('P2:')
-import numpy
-
-coefficients = theano.tensor.vector("coefficients")
-x = T.scalar("x")
-
-max_coefficients_supported = 10000
-
-# Generate the components of the polynomial
-components, updates = theano.scan(fn=lambda coefficient, power, prior_result, free_variable: prior_result + (coefficient * (free_variable ** power)),
-                                  outputs_info=T.zeros(1),
-                                  sequences=[coefficients, theano.tensor.arange(max_coefficients_supported)],
-                                  non_sequences=x)
-# Sum them up
-polynomial = components.sum()
-
-pol = components[-1]
-
-# Compile a function
-calculate_polynomial = theano.function(inputs=[coefficients, x], outputs=components)
-
-# Test
-test_coefficients = numpy.asarray([1, 0, 2], dtype=numpy.float32)
-test_value = 3
-print(calculate_polynomial(test_coefficients, test_value))
-print(1.0 * (3 ** 0) + 0.0 * (3 ** 1) + 2.0 * (3 ** 2))
-
-print('P3:')
-import numpy as np
-import theano
-import theano.tensor as T
-
-up_to = T.iscalar("up_to")
-
-# define a named function, rather than using lambda
-def accumulate_by_adding(arange_val, prior_result):
-    return prior_result + arange_val
-seq = T.arange(up_to)
-
-# An unauthorized implicit downcast from the dtype of 'seq', to that of
-# 'T.as_tensor_variable(0)' which is of dtype 'int8' by default would occur
-# if this instruction were to be used instead of the next one:
-# outputs_info = T.as_tensor_variable(0)
-
-outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype))
-scan_result, scan_updates = theano.scan(fn=accumulate_by_adding,
-                                        outputs_info=outputs_info,
-                                        sequences=seq)
-triangular_sequence = theano.function(inputs=[up_to], outputs=scan_result)
-
-# test
-some_num = 15
-print(triangular_sequence(some_num))
-print([n * (n + 1) // 2 for n in range(some_num)])
-
-print('P4:')
-location = T.imatrix("location")
-values = T.vector("values")
-output_model = T.matrix("output_model")
-
-def set_value_at_position(a_location, a_value, output_model):
-    zeros = T.zeros_like(output_model)
-    zeros_subtensor = zeros[a_location[0], a_location[1]]
-    return T.set_subtensor(zeros_subtensor, a_value)
-
-result, updates = theano.scan(fn=set_value_at_position,
-                              outputs_info=None,
-                              sequences=[location, values],
-                              non_sequences=output_model)
-
-assign_values_at_positions = theano.function(inputs=[location, values, output_model], outputs=result)
-
-# test
-test_locations = numpy.asarray([[1, 1], [2, 3]], dtype=numpy.int32)
-test_values = numpy.asarray([42, 50], dtype=numpy.float32)
-test_output_model = numpy.zeros((5, 5), dtype=numpy.float32)
-print(assign_values_at_positions(test_locations, test_values, test_output_model))