2017-08-18 17:08:42 +00:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 1,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
2018-03-21 10:35:05 +00:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"Using Theano backend.\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2017-08-18 17:08:42 +00:00
|
|
|
"source": [
|
|
|
|
"# -*- coding: utf-8 -*-\n",
|
|
|
|
"from __future__ import unicode_literals\n",
|
|
|
|
"\n",
|
|
|
|
"import numpy as np\n",
|
|
|
|
"from keras.models import load_model\n",
|
|
|
|
"import sys\n",
|
2018-03-21 10:35:05 +00:00
|
|
|
"import pickle\n",
|
2017-08-18 17:08:42 +00:00
|
|
|
"\n",
|
|
|
|
"from prepare_data import *\n",
|
|
|
|
"\n",
|
|
|
|
"np.random.seed(7)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 2,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"# Build the lexicon-derived lookup structures used by every later cell.\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')\n",
"(dictionary, max_word, max_num_vowels,\n",
" vowels, accented_vowels) = data._create_dict(content)\n",
"feature_dictionary = data._create_slovene_feature_dictionary()\n",
"syllable_dictionary = data._create_syllables_dictionary(content, vowels)\n",
"\n",
"# The accented vowels returned by _create_dict are deliberately replaced by this fixed list.\n",
"# NOTE(review): 'ŕ' and 'ě' may be mis-decoded characters - confirm against the training data.\n",
"accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 15,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
"# Bundle the lookup structures built above into one dict and pickle it to\n",
"# environment.pkl so they can be reloaded without re-reading the lexicon.\n",
"environment = {\n",
"    'dictionary': dictionary,\n",
"    'max_word': max_word,\n",
"    'max_num_vowels': max_num_vowels,\n",
"    'vowels': vowels,\n",
"    'accented_vowels': accented_vowels,\n",
"    'feature_dictionary': feature_dictionary,\n",
"    # NOTE(review): the same feature_dictionary object is stored under both keys - confirm this is intended.\n",
"    'eng_feature_dictionary': feature_dictionary,\n",
"    'syllable_dictionary': syllable_dictionary,\n",
"}\n",
"\n",
"# `with` closes the file even when pickling raises, so no handle is leaked.\n",
"with open('environment.pkl', 'wb') as output:\n",
"    pickle.dump(environment, output)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 12,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
2018-03-21 10:35:05 +00:00
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"407\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"# Print the position(s) at which the syllable 'da' occurs while iterating syllable_dictionary.\n",
"for i, el in enumerate(syllable_dictionary):\n",
"    if el == \"da\":\n",
"        print(i)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 98,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"feature__en_dictionary = data._create_feature_dictionary()\n",
|
|
|
|
"feature__slo_dictionary = data._create_slovene_feature_dictionary()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 3,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
"# Weight files, listed in the positional order the loaders are called with.\n",
"LOCATION_MODEL_PATHS = (\n",
"    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',\n",
"    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',\n",
"    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5',\n",
")\n",
"TYPE_MODEL_PATHS = (\n",
"    'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',\n",
"    'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',\n",
"    'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5',\n",
")\n",
"\n",
"# Accent-location models first, accent-type models second (same order as before).\n",
"location_models = data.load_location_models(*LOCATION_MODEL_PATHS)\n",
"letter_location_model, syllable_location_model, syllabled_letters_location_model = location_models\n",
"\n",
"type_models = data.load_type_models(*TYPE_MODEL_PATHS)\n",
"letter_type_model, syllable_type_model, syllabled_letter_type_model = type_models"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 6,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"test_input = [['uradni', '', 'Agpmpn', 'uradni'], ['podatki', '', 'Ncmpn', 'podatki'], ['policije', '', 'Ncfsg', 'policije'], ['kažejo', '', 'Vmpr3p', 'kažejo'], ['na', '', 'Sa', 'na'], ['precej', '', 'Rgp', 'precej'], ['napete', '', 'Appfpa', 'napete'], ['razmere', '', 'Ncfpa', 'razmere'], ['v', '', 'Sl', 'v'], ['piranskem', '', 'Agpmsl', 'piranskem'], ['zalivu', '', 'Ncmsl', 'zalivu'], ['je', '', 'Va-r3s-n', 'je'], ['danes', '', 'Rgp', 'danes'], ['poročala', '', 'Vmpp-sf', 'poročala'], ['oddaja', '', 'Ncfsn', 'oddaja'], ['do', '', 'Sg', 'do'], ['danes', '', 'Rgp', 'danes'], ['se', '', 'Px------y', 'se'], ['je', '', 'Va-r3s-n', 'je'], ['zgodilo', '', 'Vmep-sn', 'zgodilo']]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 16,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
"# Re-execute prepare_data.py in the notebook namespace and rebuild the Data helper\n",
"# (presumably so that edits to the module are picked up) before accentuating test_input.\n",
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"\n",
"# Positional arguments of accentuate_word, grouped by role; the order is unchanged.\n",
"location_models = (letter_location_model, syllable_location_model, syllabled_letters_location_model)\n",
"type_models = (letter_type_model, syllable_type_model, syllabled_letter_type_model)\n",
"lexicon_args = (dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
"accentuation_args = (test_input,) + location_models + type_models + lexicon_args\n",
"location_accented_words, accented_words = data.accentuate_word(*accentuation_args)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 19,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"['uradní', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n",
|
|
|
|
"['uradnî', 'podatkî', 'policíje', 'kažëjo', 'ná', 'precëj', 'napetë', 'razmerë', 'v', 'piranskëm', 'zalivú', 'jë', 'danës', 'poročála', 'oddajá', 'dó', 'danës', 'së', 'jë', 'zgodílo']\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"print(location_accented_words)\n",
|
|
|
|
"print(accented_words)"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 67,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"CREATING OTHER FEATURES...\n",
|
|
|
|
"OTHER FEATURES CREATED!\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"'nädnarávnih'"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 67,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
"def predict_word(word_acentuation_model, accent_type_model, word, msd, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary):\n",
"    \"\"\"Return `word` with the predicted accent marks applied.\n",
"\n",
"    The location model scores every vowel slot; each slot scoring >= 0.5 is\n",
"    passed to the type model, whose arg-max selects the replacement character\n",
"    from `accented_vowels`. Letters are visited from the end of the word and\n",
"    the slot counter advances once per vowel in that order.\n",
"\n",
"    Relies on the notebook-level `data` object for feature generation and\n",
"    vowel detection.\n",
"    \"\"\"\n",
"    # NOTE(review): the width 10 is hard-coded; presumably the number of vowel slots the type model expects - confirm.\n",
"    slot_one_hot = np.eye(10, dtype=int)\n",
"\n",
"    single_word_content = [[word, '-', msd, '-']]\n",
"    x, x_other_features, _unused_y = data._generate_x_and_y(\n",
"        dictionary, max_word, max_num_vowels, single_word_content,\n",
"        vowels, accented_vowels, feature_dictionary, 'who cares')\n",
"    accent_loc = word_acentuation_model.predict([x, x_other_features])\n",
"\n",
"    letters = list(word)[::-1]\n",
"    slot = 0\n",
"    for position in range(len(letters)):\n",
"        if not data._is_vowel(letters, position, vowels):\n",
"            continue\n",
"        if accent_loc[0][slot] >= 0.5:\n",
"            # Append the one-hot slot id to the extra features before asking for the accent type.\n",
"            typed_features = np.array([np.concatenate((x_other_features[0], slot_one_hot[slot]))])\n",
"            final_accent = accent_type_model.predict([x, typed_features])\n",
"            letters[position] = accented_vowels[final_accent[0].argmax(axis=0)]\n",
"        slot += 1\n",
"\n",
"    return ''.join(letters[::-1])\n",
|
|
|
|
"\n",
|
|
|
|
"predict_word(word_acentuation_model, accent_type_model, 'nadnaravnih', 'Afpfdg', dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 4,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
2017-08-18 17:08:42 +00:00
|
|
|
"source": [
|
|
|
|
"from lxml import etree\n",
|
|
|
|
"\n",
|
2018-03-21 10:35:05 +00:00
|
|
"def xml_words_generator(xml_path):\n",
"    \"\"\"Yield one list per <LexicalEntry> element of the XML file at `xml_path`.\n",
"\n",
"    Each list holds a `[word, '', msd, word]` row for every WordForm child of\n",
"    the entry: `msd` is the `val` of the child whose `att` is 'msd', `word` is\n",
"    the `val` of the FormRepresentation feature whose `att` is 'zapis_oblike'.\n",
"    Either value stays None when the entry does not provide it.\n",
"    \"\"\"\n",
"    for event, element in etree.iterparse(xml_path, tag=\"LexicalEntry\", encoding=\"UTF-8\"):\n",
"        words = []\n",
"        for child in element:\n",
"            if child.tag != 'WordForm':\n",
"                continue\n",
"            msd = None\n",
"            word = None\n",
"            for wf in child:\n",
"                if wf.attrib.get('att') == 'msd':\n",
"                    msd = wf.attrib['val']\n",
"                elif wf.tag == 'FormRepresentation':\n",
"                    for form_rep in wf:\n",
"                        # .get(): a child without an `att` attribute is skipped instead of raising KeyError.\n",
"                        if form_rep.attrib.get('att') == 'zapis_oblike':\n",
"                            word = form_rep.attrib['val']\n",
"            words.append([word, '', msd, word])\n",
"        yield words\n",
"        # Only plain strings were yielded, so the parsed subtree can be released;\n",
"        # without this the tree keeps growing while the whole lexicon is streamed.\n",
"        element.clear()\n",
|
|
|
|
" \n",
|
|
|
|
"gen = xml_words_generator('data/Sloleks_v1.2.xml')"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
2017-08-18 17:08:42 +00:00
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"# SPLIT ALL TEXT!!!\n",
"# Copy every line of the lexicon whose 0-based index is greater than NUM_OF_LINES\n",
"# to the end of the *_p2.xml file.\n",
"NUM_OF_LINES=16660466\n",
"filename = 'data/Sloleks_v1.2.xml'\n",
"# The output is opened once instead of being closed and reopened after every line, and\n",
"# `with` closes both files on error. Append mode is kept, so re-running appends the tail again.\n",
"with open(filename) as fin, open('data/Sloleks_v1.2_p2.xml', \"a\") as fout:\n",
"    for i, line in enumerate(fin):\n",
"        if NUM_OF_LINES < i:\n",
"            fout.write(line)"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 5,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"50017\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"# Pull whole lexical entries from `gen` until at least 50000 word forms are collected.\n",
"words = []\n",
"while len(words) < 50000:\n",
"    entry_words = next(gen, None)\n",
"    if entry_words is None:\n",
"        # Generator exhausted before the target was reached: keep what we have\n",
"        # instead of failing with an uncaught StopIteration.\n",
"        break\n",
"    words.extend(entry_words)\n",
"print(len(words))"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 99,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"[[21, 'A', ['g', 's'], ['p', 'c', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [3, 'C', ['c', 's']], [1, 'I'], [21, 'M', ['l'], ['-', 'c', 'o', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [17, 'N', ['c'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [40, 'P', ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], ['-', '1', '2', '3'], ['-', 'm', 'f', 'n'], ['-', 's', 'd', 'p'], ['-', 'n', 'g', 'd', 'a', 'l', 'i'], ['-', 's', 'd', 'p'], ['-', 'm', 'f', 'n'], ['-', 'y', 'b']], [1, 'Q'], [5, 'R', ['g'], ['p', 'c', 's']], [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], [24, 'V', ['m'], ['-'], ['n', 'u', 'p', 'r', 'f', 'c'], ['-', '1', '2', '3'], ['-', 's', 'p', 'd'], ['-', 'm', 'f', 'n'], ['-', 'n', 'y']]]\n",
|
|
|
|
"[[21, 'P', ['p', 's'], ['n', 'p', 's'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [3, 'V', ['p', 'd']], [1, 'M'], [21, 'K', ['b'], ['-', 'g', 'v', 'd'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [17, 'S', ['o'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [40, 'Z', ['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'], ['-', 'p', 'd', 't'], ['-', 'm', 'z', 's'], ['-', 'e', 'd', 'm'], ['-', 'i', 'r', 'd', 't', 'm', 'o'], ['-', 'e', 'd', 'm'], ['-', 'm', 'z', 's'], ['-', 'k', 'z']], [1, 'L'], [5, 'R', ['s'], ['n', 'r', 's']], [7, 'D', ['-', 'r', 'd', 't', 'm', 'o']], [24, 'G', ['g'], ['-'], ['n', 'm', 'd', 's', 'p', 'g'], ['-', 'p', 'd', 't'], ['-', 'e', 'm', 'd'], ['-', 'm', 'z', 's'], ['-', 'n', 'd']]]\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"print(feature__en_dictionary)\n",
|
|
|
|
"print(feature__slo_dictionary)"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 5,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n",
|
|
|
|
"words = [[\"Gorejevemu\", \"\", \"Psnsed\", \"Gorejevemu\"]]"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 29,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"# Re-execute prepare_data.py in the notebook namespace and rebuild the Data helper\n",
"# (presumably so that edits to the module are picked up) before accentuating `words`.\n",
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"\n",
"# Positional arguments of accentuate_word, grouped by role; the order is unchanged.\n",
"location_models = (letter_location_model, syllable_location_model, syllabled_letters_location_model)\n",
"type_models = (letter_type_model, syllable_type_model, syllabled_letter_type_model)\n",
"lexicon_args = (dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
"accentuation_args = (words,) + location_models + type_models + lexicon_args\n",
"location_accented_words, accented_words = data.accentuate_word(*accentuation_args)"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 159,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
2018-03-21 10:35:05 +00:00
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"['Gorejévemu']\n",
|
|
|
|
"['Gorejěvemu']\n",
|
|
|
|
"[['Gorejevemu', '', 'Psnsed', 'Gorejevemu']]\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"pos = 4282\n",
|
|
|
|
"print(location_accented_words)\n",
|
|
|
|
"print(accented_words)\n",
|
|
|
|
"print(words)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 165,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"hello\n"
|
|
|
|
]
|
|
|
|
},
|
2017-08-18 17:08:42 +00:00
|
|
|
{
|
|
|
|
"ename": "NameError",
|
2018-03-21 10:35:05 +00:00
|
|
|
"evalue": "name 'wait' is not defined",
|
2017-08-18 17:08:42 +00:00
|
|
|
"output_type": "error",
|
|
|
|
"traceback": [
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
2018-03-21 10:35:05 +00:00
|
|
|
"\u001b[0;32m<ipython-input-165-0a2d5e185f36>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
"\u001b[0;31mNameError\u001b[0m: name 'wait' is not defined"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"import time\n",
|
|
|
|
"\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 5,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"Words proccesed: 0\n",
|
|
|
|
"Word indeks: 0\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
2018-03-21 10:35:05 +00:00
|
|
|
"ename": "NameError",
|
|
|
|
"evalue": "name 'words' is not defined",
|
|
|
|
"output_type": "error",
|
|
|
|
"traceback": [
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
|
|
"\u001b[0;32m<ipython-input-5-0e24b34aba55>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word indeks: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word number: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mend_timer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
"\u001b[0;31mNameError\u001b[0m: name 'words' is not defined"
|
|
|
|
]
|
2017-08-18 17:08:42 +00:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"from lxml import etree\n",
"import time\n",
"\n",
"# Dry run of the export loop: walks the lexicon in step with xml_words_generator and\n",
"# attaches placeholder accent attributes (the plain word) to every FormRepresentation.\n",
"# No XML is written - the write call at the bottom is left commented out.\n",
"gen = xml_words_generator('data/Sloleks_v1.2.xml')\n",
"word_glob_num = 0\n",
"word_limit = 0\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"iter_index = 0\n",
"# Must exist before the loop: the first progress report prints len(words)\n",
"# before any batch has been loaded (this used to raise NameError).\n",
"words = []\n",
"myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
"\n",
"enable_print = False\n",
"\n",
"try:\n",
"    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
"        # LOAD NEW WORDS AND ACCENTUATE THEM\n",
"        if word_glob_num >= word_limit:\n",
"            # Current batch is used up: rotate the output file and report progress.\n",
"            myfile.close()\n",
"            myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
"            iter_index += 1\n",
"            print(\"Words proccesed: \" + str(word_glob_num))\n",
"            print(\"Word indeks: \" + str(word_index))\n",
"            print(\"Word number: \" + str(len(words)))\n",
"\n",
"            end_timer = time.time()\n",
"            print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
"\n",
"            # Load the next batch of at least iter_num word forms (whole entries only).\n",
"            word_index = 0\n",
"            words = []\n",
"            while len(words) < iter_num:\n",
"                try:\n",
"                    words.extend(next(gen))\n",
"                except StopIteration:\n",
"                    # End of the lexicon; any other error is no longer silently swallowed.\n",
"                    break\n",
"\n",
"            word_limit += len(words)\n",
"\n",
"        for child in element:\n",
"            if child.tag == 'WordForm':\n",
"                for wf in child:\n",
"                    if wf.tag == 'FormRepresentation':\n",
"                        sloleks_word = None\n",
"                        for form_rep in wf:\n",
"                            if form_rep.attrib['att'] == 'zapis_oblike':\n",
"                                sloleks_word = form_rep.attrib['val']\n",
"\n",
"                        # Sanity check: the two parses of the file must stay aligned word by word.\n",
"                        if sloleks_word != words[word_index][0]:\n",
"                            print(sloleks_word)\n",
"                            print(words[word_index][0])\n",
"                            print(word_index)\n",
"\n",
"                        new_element = etree.Element('feat')\n",
"                        new_element.attrib['att']='naglasna_mesta_oblike'\n",
"                        new_element.attrib['val']=words[word_index][0]\n",
"                        wf.append(new_element)\n",
"\n",
"                        new_element = etree.Element('feat')\n",
"                        new_element.attrib['att']='naglašena_oblika'\n",
"                        new_element.attrib['val']=words[word_index][0]\n",
"                        wf.append(new_element)\n",
"                        word_glob_num += 1\n",
"                        word_index += 1\n",
"\n",
"        #myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
"        element.clear()\n",
"finally:\n",
"    # The last opened output file used to stay open after the loop (or after an error).\n",
"    myfile.close()"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 6,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"problem_words = []"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 6,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"Words proccesed: 1550705\n",
|
|
|
|
"Word indeks: 0\n",
|
|
|
|
"Word number: 0\n",
|
|
|
|
"Elapsed time: 0.00 minutes\n",
|
|
|
|
"Words proccesed: 1600757\n",
|
|
|
|
"Word indeks: 50052\n",
|
|
|
|
"Word number: 50052\n",
|
|
|
|
"Elapsed time: 9.39 minutes\n",
|
|
|
|
"Words proccesed: 1650762\n",
|
|
|
|
"Word indeks: 50005\n",
|
|
|
|
"Word number: 50005\n",
|
|
|
|
"Elapsed time: 18.22 minutes\n",
|
|
|
|
"Words proccesed: 1700781\n",
|
|
|
|
"Word indeks: 50019\n",
|
|
|
|
"Word number: 50019\n",
|
|
|
|
"Elapsed time: 27.47 minutes\n",
|
|
|
|
"Words proccesed: 1750833\n",
|
|
|
|
"Word indeks: 50052\n",
|
|
|
|
"Word number: 50052\n",
|
|
|
|
"Elapsed time: 36.58 minutes\n",
|
|
|
|
"Words proccesed: 1800864\n",
|
|
|
|
"Word indeks: 50031\n",
|
|
|
|
"Word number: 50031\n",
|
|
|
|
"Elapsed time: 45.39 minutes\n",
|
|
|
|
"Words proccesed: 1850886\n",
|
|
|
|
"Word indeks: 50022\n",
|
|
|
|
"Word number: 50022\n",
|
|
|
|
"Elapsed time: 54.31 minutes\n",
|
|
|
|
"Words proccesed: 1900898\n",
|
|
|
|
"Word indeks: 50012\n",
|
|
|
|
"Word number: 50012\n",
|
|
|
|
"Elapsed time: 62.81 minutes\n",
|
|
|
|
"Words proccesed: 1950911\n",
|
|
|
|
"Word indeks: 50013\n",
|
|
|
|
"Word number: 50013\n",
|
|
|
|
"Elapsed time: 70.84 minutes\n",
|
|
|
|
"Words proccesed: 2000920\n",
|
|
|
|
"Word indeks: 50009\n",
|
|
|
|
"Word number: 50009\n",
|
|
|
|
"Elapsed time: 79.08 minutes\n",
|
|
|
|
"Words proccesed: 2050927\n",
|
|
|
|
"Word indeks: 50007\n",
|
|
|
|
"Word number: 50007\n",
|
|
|
|
"Elapsed time: 87.50 minutes\n",
|
|
|
|
"Words proccesed: 2100944\n",
|
|
|
|
"Word indeks: 50017\n",
|
|
|
|
"Word number: 50017\n",
|
|
|
|
"Elapsed time: 95.62 minutes\n",
|
|
|
|
"Words proccesed: 2150949\n",
|
|
|
|
"Word indeks: 50005\n",
|
|
|
|
"Word number: 50005\n",
|
|
|
|
"Elapsed time: 104.08 minutes\n",
|
|
|
|
"Words proccesed: 2200958\n",
|
|
|
|
"Word indeks: 50009\n",
|
|
|
|
"Word number: 50009\n",
|
|
|
|
"Elapsed time: 112.44 minutes\n",
|
|
|
|
"Words proccesed: 2250969\n",
|
|
|
|
"Word indeks: 50011\n",
|
|
|
|
"Word number: 50011\n",
|
|
|
|
"Elapsed time: 120.68 minutes\n",
|
|
|
|
"Words proccesed: 2300978\n",
|
|
|
|
"Word indeks: 50009\n",
|
|
|
|
"Word number: 50009\n",
|
|
|
|
"Elapsed time: 129.58 minutes\n",
|
|
|
|
"Words proccesed: 2350986\n",
|
|
|
|
"Word indeks: 50008\n",
|
|
|
|
"Word number: 50008\n",
|
|
|
|
"Elapsed time: 139.40 minutes\n",
|
|
|
|
"Words proccesed: 2400993\n",
|
|
|
|
"Word indeks: 50007\n",
|
|
|
|
"Word number: 50007\n",
|
|
|
|
"Elapsed time: 148.05 minutes\n",
|
|
|
|
"Words proccesed: 2451000\n",
|
|
|
|
"Word indeks: 50007\n",
|
|
|
|
"Word number: 50007\n",
|
|
|
|
"Elapsed time: 156.79 minutes\n",
|
|
|
|
"Words proccesed: 2501005\n",
|
|
|
|
"Word indeks: 50005\n",
|
|
|
|
"Word number: 50005\n",
|
|
|
|
"Elapsed time: 165.57 minutes\n",
|
|
|
|
"Words proccesed: 2551016\n",
|
|
|
|
"Word indeks: 50011\n",
|
|
|
|
"Word number: 50011\n",
|
|
|
|
"Elapsed time: 174.29 minutes\n",
|
|
|
|
"Words proccesed: 2601024\n",
|
|
|
|
"Word indeks: 50008\n",
|
|
|
|
"Word number: 50008\n",
|
|
|
|
"Elapsed time: 183.20 minutes\n",
|
|
|
|
"Words proccesed: 2651037\n",
|
|
|
|
"Word indeks: 50013\n",
|
|
|
|
"Word number: 50013\n",
|
|
|
|
"Elapsed time: 191.94 minutes\n",
|
|
|
|
"Words proccesed: 2701046\n",
|
|
|
|
"Word indeks: 50009\n",
|
|
|
|
"Word number: 50009\n",
|
|
|
|
"Elapsed time: 200.67 minutes\n",
|
|
|
|
"Words proccesed: 2751050\n",
|
|
|
|
"Word indeks: 50004\n",
|
|
|
|
"Word number: 50004\n",
|
|
|
|
"Elapsed time: 209.40 minutes\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
"#Words proccesed: 650250\n",
"#Word indeks: 50023\n",
"#Word number: 50023\n",
"\n",
"from lxml import etree\n",
"import time\n",
"\n",
"# Accentuate the second part of the lexicon batch by batch and append the annotated\n",
"# entries both to the combined new_sloleks.xml and to a per-batch p<N>.xml file.\n",
"gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"words = []\n",
"\n",
"lexical_entries_load_number = 0\n",
"lexical_entries_save_number = 0\n",
"\n",
"# Resume point of this run (the previous value is kept commented for reference).\n",
"#word_glob_num = 1500686\n",
"word_glob_num = 1550705\n",
"#word_limit = 1500686\n",
"word_limit = 1550705\n",
"iter_index = 31\n",
"\n",
"with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
"    myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
"    try:\n",
"        for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
"            # LOAD NEW WORDS AND ACCENTUATE THEM\n",
"            if word_glob_num >= word_limit:\n",
"                # Current batch is used up: rotate the per-batch file and report progress.\n",
"                myfile2.close()\n",
"                myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
"                iter_index += 1\n",
"                print(\"Words proccesed: \" + str(word_glob_num))\n",
"                print(\"Word indeks: \" + str(word_index))\n",
"                print(\"Word number: \" + str(len(words)))\n",
"\n",
"                end_timer = time.time()\n",
"                print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
"\n",
"                # Load the next batch of at least iter_num word forms (whole entries only).\n",
"                word_index = 0\n",
"                words = []\n",
"                while len(words) < iter_num:\n",
"                    try:\n",
"                        words.extend(next(gen))\n",
"                        lexical_entries_load_number += 1\n",
"                    except StopIteration:\n",
"                        # End of the lexicon; any other error is no longer silently swallowed.\n",
"                        break\n",
"\n",
"                data = Data('l', shuffle_all_inputs=False)\n",
"                location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
"                                                                               letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
"                                                                               dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
"                word_limit += len(words)\n",
"\n",
"            # READ DATA\n",
"            # Attach both predictions to every FormRepresentation; word_index walks the\n",
"            # batch in the same order in which the generator produced the word forms.\n",
"            for child in element:\n",
"                if child.tag == 'WordForm':\n",
"                    for wf in child:\n",
"                        if wf.tag == 'FormRepresentation':\n",
"                            new_element = etree.Element('feat')\n",
"                            new_element.attrib['att']='naglasna_mesta_oblike'\n",
"                            new_element.attrib['val']=location_accented_words[word_index]\n",
"                            wf.append(new_element)\n",
"\n",
"                            new_element = etree.Element('feat')\n",
"                            new_element.attrib['att']='naglašena_oblika'\n",
"                            new_element.attrib['val']=accented_words[word_index]\n",
"                            wf.append(new_element)\n",
"                            word_glob_num += 1\n",
"                            word_index += 1\n",
"\n",
"            myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
"            myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
"            element.clear()\n",
"            lexical_entries_save_number += 1\n",
"    finally:\n",
"        # The last per-batch file used to stay open after the loop (or after an error).\n",
"        myfile2.close()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 10,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"50052"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 10,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"len(problem_words)"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Re-execute prepare_data.py in the notebook namespace, then build a fresh Data object.\n",
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"\n",
"# Accentuate a slice copy of problem_words; the call returns two values that are\n",
"# unpacked into location_accented_words and accented_words.\n",
"location_accented_words, accented_words = data.accentuate_word(\n",
"    problem_words[:],\n",
"    # location models\n",
"    letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
"    # type models\n",
"    letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
"    # dictionaries and limits built earlier in the notebook\n",
"    dictionary, max_word, max_num_vowels, vowels, accented_vowels,\n",
"    feature_dictionary, syllable_dictionary,\n",
")"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 21,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"1558562"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 21,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"# CALCULATE INDEX NUMBER:\n",
"previous_file_len = [622061, 618306, 618266, 618483, 619342]\n",
"word_nums = [50017, 50007, 50017, 50012, 50024]\n",
"\n",
"\n",
"def calculate_index(previous_files_len, word_nums, per_word_adjustment=2, base_offset=11):\n",
"    \"\"\"Return sum(previous_files_len) - per_word_adjustment * sum(word_nums) + base_offset.\n",
"\n",
"    previous_files_len  -- iterable of numbers (presumably the length of each earlier\n",
"                           output file -- TODO confirm the unit)\n",
"    word_nums           -- iterable of numbers (presumably the word count of each\n",
"                           earlier output file -- TODO confirm)\n",
"    per_word_adjustment -- amount subtracted once per counted word; defaults to 2,\n",
"                           the constant that used to be hard-coded here\n",
"    base_offset         -- constant added to the result; defaults to 11, the\n",
"                           constant that used to be hard-coded here\n",
"\n",
"    NOTE(review): why 2 and 11 are the right values is not visible in this notebook;\n",
"    document their origin once confirmed.\n",
"    \"\"\"\n",
"    return sum(previous_files_len) - per_word_adjustment * sum(word_nums) + base_offset\n",
"\n",
"\n",
"calculate_index(previous_file_len[:3], word_nums[:3])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 12,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"250002\n",
|
|
|
|
"250000\n",
|
|
|
|
"50000\n",
|
|
|
|
"0\n"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"# Print the four bookkeeping variables, one per line.\n",
"print(word_glob_num, word_limit, iter_num, word_index, sep='\\n')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 22,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"23"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 22,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"max_word"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 7,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"['zapirati', '', 'Ggnn', 'zapirati']"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
},
|
2018-03-21 10:35:05 +00:00
|
|
|
"execution_count": 7,
|
2017-08-18 17:08:42 +00:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2018-03-21 10:35:05 +00:00
|
|
|
"words[0]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {
|
|
|
|
"collapsed": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Smoke test: append placeholder accent features ('test') to every FormRepresentation\n",
"# of the first LexicalEntry, then append the serialized entry to new_sloleks.xml.\n",
"with open(\"new_sloleks.xml\", \"ab\") as myfile:\n",
"    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
"        # READ DATA\n",
"        for child in element:\n",
"            if child.tag != 'WordForm':\n",
"                continue\n",
"            msd = None\n",
"            word = None\n",
"            for wf in child:\n",
"                # .get() returns None when 'att' is absent, so nodes without it are\n",
"                # skipped instead of raising KeyError.\n",
"                if wf.get('att') == 'msd':\n",
"                    msd = wf.attrib['val']\n",
"                elif wf.tag == 'FormRepresentation':\n",
"                    for form_rep in wf:\n",
"                        if form_rep.get('att') == 'zapis_oblike':\n",
"                            word = form_rep.attrib['val']\n",
"\n",
"                    # Append both accent features, in this order, with a placeholder value.\n",
"                    for feat_name in ('naglasna_mesta_oblike', 'naglašena_oblika'):\n",
"                        new_element = etree.Element('feat')\n",
"                        new_element.attrib['att'] = feat_name\n",
"                        new_element.attrib['val'] = 'test'\n",
"                        wf.append(new_element)\n",
"            if msd is not None and word is not None:\n",
"                print(msd)\n",
"                print(word)\n",
"            else:\n",
"                print('NOOOOO')\n",
"        print(etree.tostring(element, encoding=\"UTF-8\"))\n",
"        myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
"        element.clear()\n",
"        # Only the first entry is processed.\n",
"        break"
|
2017-08-18 17:08:42 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.5.2"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 2
|
|
|
|
}
|