stress_asignment/sloleks_accetuation.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Theano backend.\n"
]
}
],
"source": [
"# -*- coding: utf-8 -*-\n",
"from __future__ import unicode_literals\n",
"\n",
"import numpy as np\n",
"from keras.models import load_model\n",
"import sys\n",
"import pickle\n",
"\n",
"from prepare_data import *\n",
"\n",
"np.random.seed(7)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = Data('l', shuffle_all_inputs=False)\n",
"content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')\n",
"dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)\n",
"feature_dictionary = data._create_slovene_feature_dictionary()\n",
"syllable_dictionary = data._create_syllables_dictionary(content, vowels)\n",
"accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"environment = {}\n",
"environment['dictionary'] = dictionary\n",
"environment['max_word'] = max_word\n",
"environment['max_num_vowels'] = max_num_vowels\n",
"environment['vowels'] = vowels\n",
"environment['accented_vowels'] = accented_vowels\n",
"environment['feature_dictionary'] = feature_dictionary\n",
"environment['eng_feature_dictionary'] = feature_dictionary\n",
"environment['syllable_dictionary'] = syllable_dictionary\n",
"output = open('environment.pkl', 'wb')\n",
"pickle.dump(environment, output)\n",
"output.close()"
]
},
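{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how a later session could restore the pickled environment. It assumes only the `environment.pkl` file written by the cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# Reload the pickled dictionaries and limits written above.\n",
"with open('environment.pkl', 'rb') as pkl_file:\n",
"    environment = pickle.load(pkl_file)\n",
"\n",
"dictionary = environment['dictionary']\n",
"max_word = environment['max_word']\n",
"max_num_vowels = environment['max_num_vowels']\n",
"vowels = environment['vowels']\n",
"accented_vowels = environment['accented_vowels']\n",
"feature_dictionary = environment['feature_dictionary']\n",
"syllable_dictionary = environment['syllable_dictionary']"
]
},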
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"407\n"
]
}
],
"source": [
"i = 0\n",
"for el in syllable_dictionary:\n",
" if el == \"da\":\n",
" print(i)\n",
" i += 1"
]
},
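{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same lookup can be done with `list.index`; a sketch, guarded against the syllable being absent."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# list.index returns the first match and raises ValueError when absent.\n",
"try:\n",
"    print(syllable_dictionary.index('da'))\n",
"except ValueError:\n",
"    print('syllable not in dictionary')"
]
},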
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(\n",
" 'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',\n",
" 'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',\n",
" 'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')\n",
"\n",
"letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(\n",
" 'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',\n",
" 'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',\n",
" 'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_input = [['uradni', '', 'Agpmpn', 'uradni'], ['podatki', '', 'Ncmpn', 'podatki'], ['policije', '', 'Ncfsg', 'policije'], ['kažejo', '', 'Vmpr3p', 'kažejo'], ['na', '', 'Sa', 'na'], ['precej', '', 'Rgp', 'precej'], ['napete', '', 'Appfpa', 'napete'], ['razmere', '', 'Ncfpa', 'razmere'], ['v', '', 'Sl', 'v'], ['piranskem', '', 'Agpmsl', 'piranskem'], ['zalivu', '', 'Ncmsl', 'zalivu'], ['je', '', 'Va-r3s-n', 'je'], ['danes', '', 'Rgp', 'danes'], ['poročala', '', 'Vmpp-sf', 'poročala'], ['oddaja', '', 'Ncfsn', 'oddaja'], ['do', '', 'Sg', 'do'], ['danes', '', 'Rgp', 'danes'], ['se', '', 'Px------y', 'se'], ['je', '', 'Va-r3s-n', 'je'], ['zgodilo', '', 'Vmep-sn', 'zgodilo']]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%run prepare_data.py\n",
"data = Data('s', shuffle_all_inputs=False)\n",
"location_accented_words, accented_words = data.accentuate_word(test_input, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['uradni', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n",
"['uradni', 'pödatki', 'polícije', 'kažëjo', 'ná', 'prëcej', 'nápete', 'räzmere', 'v', 'pîranskem', 'zálivu', 'jë', 'dánes', 'poróčala', 'öddaja', 'dó', 'dánes', 'së', 'jë', 'zgodílo']\n"
]
}
],
"source": [
"print(location_accented_words)\n",
"print(accented_words)"
]
},
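{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small convenience wrapper (a sketch, assuming the models and dictionaries loaded above are still in scope) that pairs each input word with its predicted accent location and accented form, so single test sentences are easier to inspect."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def accentuate(entries):\n",
"    # entries: list of [word, '', msd, word] rows, as in test_input above.\n",
"    locations, accented = data.accentuate_word(entries, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
"                                               letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
"                                               dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"    return list(zip([e[0] for e in entries], locations, accented))\n",
"\n",
"accentuate([['danes', '', 'Rgp', 'danes'], ['policije', '', 'Ncfsg', 'policije']])"
]
},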
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CREATING OTHER FEATURES...\n",
"OTHER FEATURES CREATED!\n"
]
},
{
"data": {
"text/plain": [
"'nädnarávnih'"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def predict_word(word_acentuation_model, accent_type_model, word, msd, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary):\n",
" eye_input_accent = np.eye(10, dtype=int)\n",
" \n",
" english_msd = msd\n",
" fake_content = [[word, '-', msd, '-']]\n",
" x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, fake_content, vowels, accented_vowels, feature_dictionary, 'who cares')\n",
"# print(x)\n",
" accent_loc = word_acentuation_model.predict([x, x_other_features])\n",
" \n",
" j = 0\n",
" word=list(word)[::-1]\n",
"# print(word)\n",
"# print(accent_loc)\n",
" \n",
" for i in range(len(word)):\n",
" if data._is_vowel(word, i, vowels):\n",
" if accent_loc[0][j] >= 0.5:\n",
" # print(x_other_features[0])\n",
" # print(eye_input_accent[i])\n",
" new_x_other_features = np.array([np.concatenate((x_other_features[0], eye_input_accent[j]))])\n",
" # print(x_other_features)\n",
" # print(new_x_other_features)\n",
" final_accent = accent_type_model.predict([x, new_x_other_features])\n",
"# print(accented_vowels[final_accent[0].argmax(axis=0)])\n",
" word[i] = accented_vowels[final_accent[0].argmax(axis=0)]\n",
"# print(final_accent)\n",
" j += 1\n",
"\n",
" \n",
" \n",
" return ''.join(word[::-1])\n",
"\n",
"predict_word(word_acentuation_model, accent_type_model, 'nadnaravnih', 'Afpfdg', dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from lxml import etree\n",
"\n",
"def xml_words_generator(xml_path):\n",
" for event, element in etree.iterparse(xml_path, tag=\"LexicalEntry\", encoding=\"UTF-8\"):\n",
" words = []\n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" msd = None\n",
" word = None\n",
" for wf in child:\n",
" if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n",
" msd = wf.attrib['val']\n",
" elif wf.tag == 'FormRepresentation':\n",
" for form_rep in wf:\n",
" if form_rep.attrib['att'] == 'zapis_oblike':\n",
" word = form_rep.attrib['val']\n",
" #if msd is not None and word is not None:\n",
" # pass\n",
" #else:\n",
" # print('NOOOOO')\n",
" words.append([word, '', msd, word])\n",
" yield words\n",
" \n",
"gen = xml_words_generator('data/Sloleks_v1.2.xml')"
]
},
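{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the generator (a sketch): peek at the word forms of the first `LexicalEntry` with `itertools.islice`, without consuming the whole 16-million-line file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from itertools import islice\n",
"\n",
"# Take only the first yielded entry; the main gen above stays untouched.\n",
"for entry_words in islice(xml_words_generator('data/Sloleks_v1.2.xml'), 1):\n",
"    for word, _, msd, _ in entry_words[:5]:\n",
"        print(word, msd)"
]
},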
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# SPLIT ALL TEXT!!!\n",
"NUM_OF_LINES=16660466\n",
"filename = 'data/Sloleks_v1.2.xml'\n",
"with open(filename) as fin:\n",
" fout = open('data/Sloleks_v1.2_p2.xml',\"a\")\n",
" for i,line in enumerate(fin):\n",
" if NUM_OF_LINES < i:\n",
" fout.write(line)\n",
" fout.close()\n",
" fout = open('data/Sloleks_v1.2_p2.xml',\"a\")\n",
"\n",
" fout.close()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50017\n"
]
}
],
"source": [
"words = []\n",
"while len(words) < 50000:\n",
" words.extend(next(gen))\n",
"print(len(words))"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[21, 'A', ['g', 's'], ['p', 'c', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [3, 'C', ['c', 's']], [1, 'I'], [21, 'M', ['l'], ['-', 'c', 'o', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [17, 'N', ['c'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [40, 'P', ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], ['-', '1', '2', '3'], ['-', 'm', 'f', 'n'], ['-', 's', 'd', 'p'], ['-', 'n', 'g', 'd', 'a', 'l', 'i'], ['-', 's', 'd', 'p'], ['-', 'm', 'f', 'n'], ['-', 'y', 'b']], [1, 'Q'], [5, 'R', ['g'], ['p', 'c', 's']], [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], [24, 'V', ['m'], ['-'], ['n', 'u', 'p', 'r', 'f', 'c'], ['-', '1', '2', '3'], ['-', 's', 'p', 'd'], ['-', 'm', 'f', 'n'], ['-', 'n', 'y']]]\n",
"[[21, 'P', ['p', 's'], ['n', 'p', 's'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [3, 'V', ['p', 'd']], [1, 'M'], [21, 'K', ['b'], ['-', 'g', 'v', 'd'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [17, 'S', ['o'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [40, 'Z', ['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'], ['-', 'p', 'd', 't'], ['-', 'm', 'z', 's'], ['-', 'e', 'd', 'm'], ['-', 'i', 'r', 'd', 't', 'm', 'o'], ['-', 'e', 'd', 'm'], ['-', 'm', 'z', 's'], ['-', 'k', 'z']], [1, 'L'], [5, 'R', ['s'], ['n', 'r', 's']], [7, 'D', ['-', 'r', 'd', 't', 'm', 'o']], [24, 'G', ['g'], ['-'], ['n', 'm', 'd', 's', 'p', 'g'], ['-', 'p', 'd', 't'], ['-', 'e', 'm', 'd'], ['-', 'm', 'z', 's'], ['-', 'n', 'd']]]\n"
]
}
],
"source": [
"print(feature__en_dictionary)\n",
"print(feature__slo_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n",
"words = [[\"Gorejevemu\", \"\", \"Psnsed\", \"Gorejevemu\"]]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Gorejévemu']\n",
"['Gorejěvemu']\n",
"[['Gorejevemu', '', 'Psnsed', 'Gorejevemu']]\n"
]
}
],
"source": [
"pos = 4282\n",
"print(location_accented_words)\n",
"print(accented_words)\n",
"print(words)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hello\n"
]
},
{
"ename": "NameError",
"evalue": "name 'wait' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-165-0a2d5e185f36>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'wait' is not defined"
]
}
],
"source": [
"import time\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Words proccesed: 0\n",
"Word indeks: 0\n"
]
},
{
"ename": "NameError",
"evalue": "name 'words' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-0e24b34aba55>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word indeks: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word number: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mend_timer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'words' is not defined"
]
}
],
"source": [
"from lxml import etree\n",
"import time\n",
"\n",
"gen = xml_words_generator('data/Sloleks_v1.2.xml')\n",
"word_glob_num = 0\n",
"word_limit = 0\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"iter_index = 0\n",
"myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
"#with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
"\n",
"enable_print = False\n",
"\n",
"for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
" # LOAD NEW WORDS AND ACCENTUATE THEM\n",
" if word_glob_num >= word_limit:\n",
" myfile.close()\n",
" myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
" #if iter_index == 5:\n",
" # break\n",
" \n",
" iter_index += 1\n",
" print(\"Words proccesed: \" + str(word_glob_num))\n",
" \n",
" print(\"Word indeks: \" + str(word_index))\n",
" print(\"Word number: \" + str(len(words)))\n",
" \n",
" end_timer = time.time()\n",
" print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
" \n",
" \n",
" word_index = 0\n",
" words = []\n",
" #print(\"HERE!!!\")\n",
" while len(words) < iter_num:\n",
" try:\n",
" words.extend(next(gen))\n",
" except:\n",
" break\n",
" #print(\"HERE!!!\")\n",
" #if word_glob_num > 1:\n",
" # break\n",
"\n",
" word_limit += len(words)\n",
" #print(\"HERE!!!\")\n",
" \n",
" \n",
" \n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" #msd = None\n",
" #word = None\n",
" for wf in child:\n",
" if wf.tag == 'FormRepresentation':\n",
" sloleks_word = None\n",
" for form_rep in wf:\n",
" if form_rep.attrib['att'] == 'zapis_oblike':\n",
" sloleks_word = form_rep.attrib['val']\n",
" \n",
" if sloleks_word != words[word_index][0]:\n",
" print(sloleks_word)\n",
" print(words[word_index][0])\n",
" print(word_index)\n",
" \n",
" \n",
" #if sloleks_word == \n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglasna_mesta_oblike'\n",
" new_element.attrib['val']=words[word_index][0]\n",
" wf.append(new_element)\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglašena_oblika'\n",
" new_element.attrib['val']=words[word_index][0]\n",
" wf.append(new_element)\n",
" word_glob_num += 1\n",
" word_index += 1\n",
"\n",
" #myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" element.clear()\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'xml_words_generator' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-44b0367c6cbf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mgen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxml_words_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/Sloleks_v1.2_p2.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mword_glob_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mword_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'xml_words_generator' is not defined"
]
}
],
"source": [
"#Words proccesed: 650250\n",
"#Word indeks: 50023\n",
"#Word number: 50023\n",
"\n",
"from lxml import etree\n",
"import time\n",
"\n",
"gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
"word_glob_num = 0\n",
"word_limit = 0\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"iter_index = 0\n",
"words = []\n",
"\n",
"lexical_entries_load_number = 0\n",
"lexical_entries_save_number = 0\n",
"\n",
"\n",
"# INSIDE\n",
"#word_glob_num = 1500686\n",
"word_glob_num = 1550705\n",
"\n",
"#word_limit = 1500686\n",
"word_limit = 1550705\n",
"\n",
"\n",
"iter_index = 31\n",
"\n",
"#done_lexical_entries = 33522\n",
"\n",
"with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
" myfile2 = open('data/new_sloleks/pa' + str(iter_index) + '.xml', 'ab')\n",
" for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
" # LOAD NEW WORDS AND ACCENTUATE THEM\n",
" #print(\"HERE\")\n",
" \n",
"# if lexical_entries_save_number < done_lexical_entries:\n",
"# next(gen)\n",
"# #print(lexical_entries_save_number)\n",
"# lexical_entries_save_number += 1\n",
"# lexical_entries_load_number += 1\n",
"# continue\n",
" \n",
" if word_glob_num >= word_limit:\n",
" myfile2.close()\n",
" myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
" iter_index += 1\n",
" print(\"Words proccesed: \" + str(word_glob_num))\n",
"\n",
" print(\"Word indeks: \" + str(word_index))\n",
" print(\"Word number: \" + str(len(words)))\n",
" \n",
" #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n",
" #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n",
"\n",
" end_timer = time.time()\n",
" print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
"\n",
"\n",
" word_index = 0\n",
" words = []\n",
"\n",
" while len(words) < iter_num:\n",
" try:\n",
" words.extend(next(gen))\n",
" lexical_entries_load_number += 1\n",
" except:\n",
" break\n",
" #if word_glob_num > 1:\n",
" # break\n",
"\n",
" #problem_words = words\n",
" #break\n",
" data = Data('l', shuffle_all_inputs=False)\n",
" location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
" word_limit += len(words)\n",
" \n",
" \n",
" # READ DATA\n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" msd = None\n",
" word = None\n",
" for wf in child:\n",
" if wf.tag == 'FormRepresentation':\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglasna_mesta_oblike'\n",
" new_element.attrib['val']=location_accented_words[word_index]\n",
" wf.append(new_element)\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglašena_oblika'\n",
" new_element.attrib['val']=accented_words[word_index]\n",
" wf.append(new_element)\n",
" word_glob_num += 1\n",
" word_index += 1\n",
"\n",
" # print(etree.tostring(element, encoding=\"UTF-8\"))\n",
" myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" element.clear()\n",
" lexical_entries_save_number += 1\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"problem_words = []"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Words proccesed: 1550705\n",
"Word indeks: 0\n",
"Word number: 0\n",
"Elapsed time: 0.00 minutes\n",
"Words proccesed: 1600757\n",
"Word indeks: 50052\n",
"Word number: 50052\n",
"Elapsed time: 9.39 minutes\n",
"Words proccesed: 1650762\n",
"Word indeks: 50005\n",
"Word number: 50005\n",
"Elapsed time: 18.22 minutes\n",
"Words proccesed: 1700781\n",
"Word indeks: 50019\n",
"Word number: 50019\n",
"Elapsed time: 27.47 minutes\n",
"Words proccesed: 1750833\n",
"Word indeks: 50052\n",
"Word number: 50052\n",
"Elapsed time: 36.58 minutes\n",
"Words proccesed: 1800864\n",
"Word indeks: 50031\n",
"Word number: 50031\n",
"Elapsed time: 45.39 minutes\n",
"Words proccesed: 1850886\n",
"Word indeks: 50022\n",
"Word number: 50022\n",
"Elapsed time: 54.31 minutes\n",
"Words proccesed: 1900898\n",
"Word indeks: 50012\n",
"Word number: 50012\n",
"Elapsed time: 62.81 minutes\n",
"Words proccesed: 1950911\n",
"Word indeks: 50013\n",
"Word number: 50013\n",
"Elapsed time: 70.84 minutes\n",
"Words proccesed: 2000920\n",
"Word indeks: 50009\n",
"Word number: 50009\n",
"Elapsed time: 79.08 minutes\n",
"Words proccesed: 2050927\n",
"Word indeks: 50007\n",
"Word number: 50007\n",
"Elapsed time: 87.50 minutes\n",
"Words proccesed: 2100944\n",
"Word indeks: 50017\n",
"Word number: 50017\n",
"Elapsed time: 95.62 minutes\n",
"Words proccesed: 2150949\n",
"Word indeks: 50005\n",
"Word number: 50005\n",
"Elapsed time: 104.08 minutes\n",
"Words proccesed: 2200958\n",
"Word indeks: 50009\n",
"Word number: 50009\n",
"Elapsed time: 112.44 minutes\n",
"Words proccesed: 2250969\n",
"Word indeks: 50011\n",
"Word number: 50011\n",
"Elapsed time: 120.68 minutes\n",
"Words proccesed: 2300978\n",
"Word indeks: 50009\n",
"Word number: 50009\n",
"Elapsed time: 129.58 minutes\n",
"Words proccesed: 2350986\n",
"Word indeks: 50008\n",
"Word number: 50008\n",
"Elapsed time: 139.40 minutes\n",
"Words proccesed: 2400993\n",
"Word indeks: 50007\n",
"Word number: 50007\n",
"Elapsed time: 148.05 minutes\n",
"Words proccesed: 2451000\n",
"Word indeks: 50007\n",
"Word number: 50007\n",
"Elapsed time: 156.79 minutes\n",
"Words proccesed: 2501005\n",
"Word indeks: 50005\n",
"Word number: 50005\n",
"Elapsed time: 165.57 minutes\n",
"Words proccesed: 2551016\n",
"Word indeks: 50011\n",
"Word number: 50011\n",
"Elapsed time: 174.29 minutes\n",
"Words proccesed: 2601024\n",
"Word indeks: 50008\n",
"Word number: 50008\n",
"Elapsed time: 183.20 minutes\n",
"Words proccesed: 2651037\n",
"Word indeks: 50013\n",
"Word number: 50013\n",
"Elapsed time: 191.94 minutes\n",
"Words proccesed: 2701046\n",
"Word indeks: 50009\n",
"Word number: 50009\n",
"Elapsed time: 200.67 minutes\n",
"Words proccesed: 2751050\n",
"Word indeks: 50004\n",
"Word number: 50004\n",
"Elapsed time: 209.40 minutes\n"
]
}
],
"source": [
"#Words proccesed: 650250\n",
"#Word indeks: 50023\n",
"#Word number: 50023\n",
"\n",
"from lxml import etree\n",
"import time\n",
"\n",
"gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
"word_glob_num = 0\n",
"word_limit = 0\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"iter_index = 0\n",
"words = []\n",
"\n",
"lexical_entries_load_number = 0\n",
"lexical_entries_save_number = 0\n",
"\n",
"\n",
"# INSIDE\n",
"#word_glob_num = 1500686\n",
"word_glob_num = 1550705\n",
"\n",
"#word_limit = 1500686\n",
"word_limit = 1550705\n",
"\n",
"\n",
"iter_index = 31\n",
"\n",
"#done_lexical_entries = 33522\n",
"\n",
"with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
" myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
" for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
" # LOAD NEW WORDS AND ACCENTUATE THEM\n",
" #print(\"HERE\")\n",
" \n",
"# if lexical_entries_save_number < done_lexical_entries:\n",
"# next(gen)\n",
"# #print(lexical_entries_save_number)\n",
"# lexical_entries_save_number += 1\n",
"# lexical_entries_load_number += 1\n",
"# continue\n",
" \n",
" if word_glob_num >= word_limit:\n",
" myfile2.close()\n",
" myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
" iter_index += 1\n",
" print(\"Words proccesed: \" + str(word_glob_num))\n",
"\n",
" print(\"Word indeks: \" + str(word_index))\n",
" print(\"Word number: \" + str(len(words)))\n",
" \n",
" #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n",
" #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n",
"\n",
" end_timer = time.time()\n",
" print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
"\n",
"\n",
" word_index = 0\n",
" words = []\n",
"\n",
" while len(words) < iter_num:\n",
" try:\n",
" words.extend(next(gen))\n",
" lexical_entries_load_number += 1\n",
" except:\n",
" break\n",
" #if word_glob_num > 1:\n",
" # break\n",
"\n",
" #problem_words = words\n",
" #break\n",
" data = Data('l', shuffle_all_inputs=False)\n",
" location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
" word_limit += len(words)\n",
" \n",
" \n",
" # READ DATA\n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" msd = None\n",
" word = None\n",
" for wf in child:\n",
" if wf.tag == 'FormRepresentation':\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglasna_mesta_oblike'\n",
" new_element.attrib['val']=location_accented_words[word_index]\n",
" wf.append(new_element)\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglašena_oblika'\n",
" new_element.attrib['val']=accented_words[word_index]\n",
" wf.append(new_element)\n",
" word_glob_num += 1\n",
" word_index += 1\n",
"\n",
" # print(etree.tostring(element, encoding=\"UTF-8\"))\n",
" myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" element.clear()\n",
" lexical_entries_save_number += 1\n",
" "
]
},
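{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch for stitching the per-chunk part files back together, assuming they live in `data/new_sloleks/` and follow the `p<index>.xml` naming used above; the glob pattern and output name are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import glob\n",
"import os\n",
"import shutil\n",
"\n",
"# Sort numerically so p2.xml comes before p10.xml.\n",
"part_files = sorted(glob.glob('data/new_sloleks/p[0-9]*.xml'),\n",
"                    key=lambda path: int(os.path.basename(path)[1:-4]))\n",
"with open('data/new_sloleks/merged_sloleks.xml', 'wb') as merged:\n",
"    for part in part_files:\n",
"        with open(part, 'rb') as chunk:\n",
"            shutil.copyfileobj(chunk, merged)"
]
},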
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50052"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(problem_words)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"location_accented_words, accented_words = data.accentuate_word(problem_words[:], letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1558562"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# CALCULATE INDEX NUMBER:\n",
"previous_file_len = [622061, 618306, 618266, 618483, 619342]\n",
"word_nums = [50017, 50007, 50017, 50012, 50024]\n",
"def calculate_index(previous_files_len, word_nums):\n",
" return sum(previous_files_len) - 2 * sum(word_nums) + 11\n",
"calculate_index(previous_file_len[:3], word_nums[:3])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"250002\n",
"250000\n",
"50000\n",
"0\n"
]
}
],
"source": [
"print(word_glob_num)\n",
"print(word_limit)\n",
"print(iter_num)\n",
"print(word_index)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max_word"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['zapirati', '', 'Ggnn', 'zapirati']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open(\"new_sloleks.xml\", \"ab\") as myfile:\n",
" for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
" # READ DATA\n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" msd = None\n",
" word = None\n",
" for wf in child:\n",
"# print(wf.attrib['att'])\n",
" if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n",
" msd = wf.attrib['val']\n",
" elif wf.tag == 'FormRepresentation':\n",
" for form_rep in wf:\n",
" if form_rep.attrib['att'] == 'zapis_oblike':\n",
" word = form_rep.attrib['val']\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglasna_mesta_oblike'\n",
" new_element.attrib['val']='test'\n",
" wf.append(new_element)\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglašena_oblika'\n",
" new_element.attrib['val']='test'\n",
" wf.append(new_element)\n",
" if msd is not None and word is not None:\n",
" print(msd)\n",
" print(word)\n",
" else:\n",
" print('NOOOOO')\n",
" print(etree.tostring(element, encoding=\"UTF-8\"))\n",
" myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" element.clear()\n",
" break"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}