{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using Theano backend.\n" ] } ], "source": [ "# -*- coding: utf-8 -*-\n", "from __future__ import unicode_literals\n", "\n", "import numpy as np\n", "from keras.models import load_model\n", "import sys\n", "import pickle\n", "\n", "from prepare_data import *\n", "\n", "np.random.seed(7)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = Data('l', shuffle_all_inputs=False)\n", "content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')\n", "dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)\n", "feature_dictionary = data._create_slovene_feature_dictionary()\n", "syllable_dictionary = data._create_syllables_dictionary(content, vowels)\n", "accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n", "\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "environment = {}\n", "environment['dictionary'] = dictionary\n", "environment['max_word'] = max_word\n", "environment['max_num_vowels'] = max_num_vowels\n", "environment['vowels'] = vowels\n", "environment['accented_vowels'] = accented_vowels\n", "environment['feature_dictionary'] = feature_dictionary\n", "environment['eng_feature_dictionary'] = feature_dictionary\n", "environment['syllable_dictionary'] = syllable_dictionary\n", "output = open('environment.pkl', 'wb')\n", "pickle.dump(environment, output)\n", "output.close()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "407\n" ] } ], "source": [ "i = 0\n", "for el in syllable_dictionary:\n", " if el == \"da\":\n", " print(i)\n", " i += 1" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(\n", " 'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',\n", " 'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',\n", " 'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')\n", "\n", "letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(\n", " 'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',\n", " 'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',\n", " 'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_input = [['uradni', '', 'Agpmpn', 'uradni'], ['podatki', '', 'Ncmpn', 'podatki'], ['policije', '', 'Ncfsg', 'policije'], ['kažejo', '', 'Vmpr3p', 'kažejo'], ['na', '', 'Sa', 'na'], ['precej', '', 'Rgp', 'precej'], ['napete', '', 'Appfpa', 'napete'], ['razmere', '', 'Ncfpa', 'razmere'], ['v', '', 'Sl', 'v'], ['piranskem', '', 'Agpmsl', 'piranskem'], ['zalivu', '', 'Ncmsl', 'zalivu'], ['je', '', 'Va-r3s-n', 'je'], ['danes', '', 'Rgp', 'danes'], ['poročala', '', 'Vmpp-sf', 'poročala'], ['oddaja', '', 'Ncfsn', 'oddaja'], ['do', '', 'Sg', 'do'], ['danes', '', 'Rgp', 'danes'], ['se', '', 'Px------y', 'se'], ['je', '', 'Va-r3s-n', 'je'], ['zgodilo', '', 'Vmep-sn', 'zgodilo']]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%run 
prepare_data.py\n", "data = Data('s', shuffle_all_inputs=False)\n", "location_accented_words, accented_words = data.accentuate_word(test_input, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n", "                                             letter_type_model, syllable_type_model, syllabled_letter_type_model,\n", "                                             dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['uradni', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n", "['uradni', 'pödatki', 'polícije', 'kažëjo', 'ná', 'prëcej', 'nápete', 'räzmere', 'v', 'pîranskem', 'zálivu', 'jë', 'dánes', 'poróčala', 'öddaja', 'dó', 'dánes', 'së', 'jë', 'zgodílo']\n" ] } ], "source": [ "print(location_accented_words)\n", "print(accented_words)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CREATING OTHER FEATURES...\n", "OTHER FEATURES CREATED!\n" ] }, { "data": { "text/plain": [ "'nädnarávnih'" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def predict_word(word_accentuation_model, accent_type_model, word, msd, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary):\n", "    # one-hot rows used to encode the predicted accent location as an extra input feature\n", "    eye_input_accent = np.eye(10, dtype=int)\n", "\n", "    fake_content = [[word, '-', msd, '-']]\n", "    x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, fake_content, vowels, accented_vowels, feature_dictionary, 'who cares')\n", "    accent_loc = word_accentuation_model.predict([x, x_other_features])\n", "\n", "    # walk the reversed word; i indexes letters, j indexes vowels\n", "    j = 0\n", "    word = list(word)[::-1]\n", "    for i in range(len(word)):\n", "        if data._is_vowel(word, i, vowels):\n", "            if accent_loc[0][j] >= 0.5:\n", "                # append the one-hot accent location to the other features and classify the accent type\n", "                new_x_other_features = np.array([np.concatenate((x_other_features[0], eye_input_accent[j]))])\n", "                final_accent = accent_type_model.predict([x, new_x_other_features])\n", "                word[i] = accented_vowels[final_accent[0].argmax(axis=0)]\n", "            j += 1\n", "\n", "    return ''.join(word[::-1])\n", "\n", "predict_word(word_accentuation_model, accent_type_model, 'nadnaravnih', 'Afpfdg', dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from lxml import etree\n", "\n", "def xml_words_generator(xml_path):\n", "    for event, element in etree.iterparse(xml_path, tag=\"LexicalEntry\", encoding=\"UTF-8\"):\n", "        words = []\n", "        for child in element:\n", "            if child.tag == 'WordForm':\n", "                msd = None\n", "                word = None\n", "                for wf in child:\n", "                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n", "                        msd = wf.attrib['val']\n", "                    elif wf.tag == 'FormRepresentation':\n", "                        for form_rep in wf:\n", "                            if form_rep.attrib['att'] == 'zapis_oblike':\n", "                                word = form_rep.attrib['val']\n", 
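"                # one entry per WordForm, in the same 4-field shape as test_input above\n", "                # (the empty string appears to be a placeholder for the accented form):\n", 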
"                words.append([word, '', msd, word])\n", "        yield words\n", "\n", "gen = xml_words_generator('data/Sloleks_v1.2.xml')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split the lexicon: copy everything after line NUM_OF_LINES into a second file\n", "NUM_OF_LINES = 16660466\n", "filename = 'data/Sloleks_v1.2.xml'\n", "with open(filename) as fin:\n", "    with open('data/Sloleks_v1.2_p2.xml', 'a') as fout:\n", "        for i, line in enumerate(fin):\n", "            if NUM_OF_LINES < i:\n", "                fout.write(line)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "50017\n" ] } ], "source": [ "words = []\n", "while len(words) < 50000:\n", "    words.extend(next(gen))\n", "print(len(words))" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[21, 'A', ['g', 's'], ['p', 'c', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [3, 'C', ['c', 's']], [1, 'I'], [21, 'M', ['l'], ['-', 'c', 'o', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [17, 'N', ['c'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [40, 'P', ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], ['-', '1', '2', '3'], ['-', 'm', 'f', 'n'], ['-', 's', 'd', 'p'], ['-', 'n', 'g', 'd', 'a', 'l', 'i'], ['-', 's', 'd', 'p'], ['-', 'm', 'f', 'n'], ['-', 'y', 'b']], [1, 'Q'], [5, 'R', ['g'], ['p', 'c', 's']], [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], [24, 'V', ['m'], ['-'], ['n', 'u', 'p', 'r', 'f', 'c'], ['-', '1', '2', '3'], ['-', 's', 'p', 'd'], ['-', 'm', 'f', 'n'], ['-', 'n', 'y']]]\n", "[[21, 'P', ['p', 's'], ['n', 'p', 's'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [3, 'V', ['p', 'd']], [1, 'M'], [21, 'K', ['b'], ['-', 'g', 'v', 'd'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [17, 'S', ['o'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [40, 'Z', ['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'], ['-', 'p', 'd', 't'], ['-', 'm', 'z', 's'], ['-', 'e', 'd', 'm'], ['-', 'i', 'r', 'd', 't', 'm', 'o'], ['-', 'e', 'd', 'm'], ['-', 'm', 'z', 's'], ['-', 'k', 'z']], [1, 'L'], [5, 'R', ['s'], ['n', 'r', 's']], [7, 'D', ['-', 'r', 'd', 't', 'm', 'o']], [24, 'G', ['g'], ['-'], ['n', 'm', 'd', 's', 'p', 'g'], ['-', 'p', 'd', 't'], ['-', 'e', 'm', 'd'], ['-', 'm', 'z', 's'], ['-', 'n', 'd']]]\n" ] } ], "source": [ "print(feature__en_dictionary)\n", "print(feature__slo_dictionary)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n", "words = [[\"Gorejevemu\", \"\", \"Psnsed\", \"Gorejevemu\"]]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%run prepare_data.py\n", "data = Data('l', shuffle_all_inputs=False)\n", "location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n", "                                             letter_type_model, syllable_type_model, syllabled_letter_type_model,\n", "                                             dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)" ] }, { "cell_type": "code", 
"execution_count": 159, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Gorejévemu']\n", "['Gorejěvemu']\n", "[['Gorejevemu', '', 'Psnsed', 'Gorejevemu']]\n" ] } ], "source": [ "pos = 4282\n", "print(location_accented_words)\n", "print(accented_words)\n", "print(words)" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello\n" ] }, { "ename": "NameError", "evalue": "name 'wait' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'wait' is not defined" ] } ], "source": [ "import time\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Words proccesed: 0\n", "Word indeks: 0\n" ] }, { "ename": "NameError", "evalue": "name 'words' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word indeks: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word number: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mend_timer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'words' is not defined" ] } ], "source": [ "from lxml import etree\n", "import time\n", "\n", "gen = xml_words_generator('data/Sloleks_v1.2.xml')\n", "word_glob_num = 0\n", "word_limit = 0\n", 
"iter_num = 50000\n", "word_index = 0\n", "start_timer = time.time()\n", "iter_index = 0\n", "myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n", "#with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n", "\n", "enable_print = False\n", "\n", "for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n", " # LOAD NEW WORDS AND ACCENTUATE THEM\n", " if word_glob_num >= word_limit:\n", " myfile.close()\n", " myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n", " #if iter_index == 5:\n", " # break\n", " \n", " iter_index += 1\n", " print(\"Words proccesed: \" + str(word_glob_num))\n", " \n", " print(\"Word indeks: \" + str(word_index))\n", " print(\"Word number: \" + str(len(words)))\n", " \n", " end_timer = time.time()\n", " print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n", " \n", " \n", " word_index = 0\n", " words = []\n", " #print(\"HERE!!!\")\n", " while len(words) < iter_num:\n", " try:\n", " words.extend(next(gen))\n", " except:\n", " break\n", " #print(\"HERE!!!\")\n", " #if word_glob_num > 1:\n", " # break\n", "\n", " word_limit += len(words)\n", " #print(\"HERE!!!\")\n", " \n", " \n", " \n", " for child in element:\n", " if child.tag == 'WordForm':\n", " #msd = None\n", " #word = None\n", " for wf in child:\n", " if wf.tag == 'FormRepresentation':\n", " sloleks_word = None\n", " for form_rep in wf:\n", " if form_rep.attrib['att'] == 'zapis_oblike':\n", " sloleks_word = form_rep.attrib['val']\n", " \n", " if sloleks_word != words[word_index][0]:\n", " print(sloleks_word)\n", " print(words[word_index][0])\n", " print(word_index)\n", " \n", " \n", " #if sloleks_word == \n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglasna_mesta_oblike'\n", " new_element.attrib['val']=words[word_index][0]\n", " wf.append(new_element)\n", "\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglašena_oblika'\n", " new_element.attrib['val']=words[word_index][0]\n", " wf.append(new_element)\n", " word_glob_num += 1\n", " word_index += 1\n", "\n", " #myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " element.clear()\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'xml_words_generator' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mgen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxml_words_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/Sloleks_v1.2_p2.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mword_glob_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mword_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'xml_words_generator' is not defined" ] } ], "source": [ "#Words proccesed: 650250\n", "#Word 
indeks: 50023\n", "#Word number: 50023\n", "\n", "from lxml import etree\n", "import time\n", "\n", "gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n", "word_glob_num = 0\n", "word_limit = 0\n", "iter_num = 50000\n", "word_index = 0\n", "start_timer = time.time()\n", "iter_index = 0\n", "words = []\n", "\n", "lexical_entries_load_number = 0\n", "lexical_entries_save_number = 0\n", "\n", "\n", "# INSIDE\n", "#word_glob_num = 1500686\n", "word_glob_num = 1550705\n", "\n", "#word_limit = 1500686\n", "word_limit = 1550705\n", "\n", "\n", "iter_index = 31\n", "\n", "#done_lexical_entries = 33522\n", "\n", "with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n", " myfile2 = open('data/new_sloleks/pa' + str(iter_index) + '.xml', 'ab')\n", " for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n", " # LOAD NEW WORDS AND ACCENTUATE THEM\n", " #print(\"HERE\")\n", " \n", "# if lexical_entries_save_number < done_lexical_entries:\n", "# next(gen)\n", "# #print(lexical_entries_save_number)\n", "# lexical_entries_save_number += 1\n", "# lexical_entries_load_number += 1\n", "# continue\n", " \n", " if word_glob_num >= word_limit:\n", " myfile2.close()\n", " myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n", " iter_index += 1\n", " print(\"Words proccesed: \" + str(word_glob_num))\n", "\n", " print(\"Word indeks: \" + str(word_index))\n", " print(\"Word number: \" + str(len(words)))\n", " \n", " #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n", " #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n", "\n", " end_timer = time.time()\n", " print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n", "\n", "\n", " word_index = 0\n", " words = []\n", "\n", " while len(words) < iter_num:\n", " try:\n", " words.extend(next(gen))\n", " lexical_entries_load_number += 1\n", " except:\n", " break\n", " #if word_glob_num > 1:\n", " # break\n", "\n", " #problem_words = words\n", " #break\n", " data = Data('l', shuffle_all_inputs=False)\n", " location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n", " letter_type_model, syllable_type_model, syllabled_letter_type_model,\n", " dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n", "\n", " word_limit += len(words)\n", " \n", " \n", " # READ DATA\n", " for child in element:\n", " if child.tag == 'WordForm':\n", " msd = None\n", " word = None\n", " for wf in child:\n", " if wf.tag == 'FormRepresentation':\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglasna_mesta_oblike'\n", " new_element.attrib['val']=location_accented_words[word_index]\n", " wf.append(new_element)\n", "\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglašena_oblika'\n", " new_element.attrib['val']=accented_words[word_index]\n", " wf.append(new_element)\n", " word_glob_num += 1\n", " word_index += 1\n", "\n", " # print(etree.tostring(element, encoding=\"UTF-8\"))\n", " myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " element.clear()\n", " lexical_entries_save_number += 1\n", " " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, 
"outputs": [], "source": [ "problem_words = []" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Words proccesed: 1550705\n", "Word indeks: 0\n", "Word number: 0\n", "Elapsed time: 0.00 minutes\n", "Words proccesed: 1600757\n", "Word indeks: 50052\n", "Word number: 50052\n", "Elapsed time: 9.39 minutes\n", "Words proccesed: 1650762\n", "Word indeks: 50005\n", "Word number: 50005\n", "Elapsed time: 18.22 minutes\n", "Words proccesed: 1700781\n", "Word indeks: 50019\n", "Word number: 50019\n", "Elapsed time: 27.47 minutes\n", "Words proccesed: 1750833\n", "Word indeks: 50052\n", "Word number: 50052\n", "Elapsed time: 36.58 minutes\n", "Words proccesed: 1800864\n", "Word indeks: 50031\n", "Word number: 50031\n", "Elapsed time: 45.39 minutes\n", "Words proccesed: 1850886\n", "Word indeks: 50022\n", "Word number: 50022\n", "Elapsed time: 54.31 minutes\n", "Words proccesed: 1900898\n", "Word indeks: 50012\n", "Word number: 50012\n", "Elapsed time: 62.81 minutes\n", "Words proccesed: 1950911\n", "Word indeks: 50013\n", "Word number: 50013\n", "Elapsed time: 70.84 minutes\n", "Words proccesed: 2000920\n", "Word indeks: 50009\n", "Word number: 50009\n", "Elapsed time: 79.08 minutes\n", "Words proccesed: 2050927\n", "Word indeks: 50007\n", "Word number: 50007\n", "Elapsed time: 87.50 minutes\n", "Words proccesed: 2100944\n", "Word indeks: 50017\n", "Word number: 50017\n", "Elapsed time: 95.62 minutes\n", "Words proccesed: 2150949\n", "Word indeks: 50005\n", "Word number: 50005\n", "Elapsed time: 104.08 minutes\n", "Words proccesed: 2200958\n", "Word indeks: 50009\n", "Word number: 50009\n", "Elapsed time: 112.44 minutes\n", "Words proccesed: 2250969\n", "Word indeks: 50011\n", "Word number: 50011\n", "Elapsed time: 120.68 minutes\n", "Words proccesed: 2300978\n", "Word indeks: 50009\n", "Word number: 50009\n", "Elapsed time: 129.58 minutes\n", "Words proccesed: 2350986\n", "Word indeks: 50008\n", "Word number: 50008\n", "Elapsed time: 139.40 minutes\n", "Words proccesed: 2400993\n", "Word indeks: 50007\n", "Word number: 50007\n", "Elapsed time: 148.05 minutes\n", "Words proccesed: 2451000\n", "Word indeks: 50007\n", "Word number: 50007\n", "Elapsed time: 156.79 minutes\n", "Words proccesed: 2501005\n", "Word indeks: 50005\n", "Word number: 50005\n", "Elapsed time: 165.57 minutes\n", "Words proccesed: 2551016\n", "Word indeks: 50011\n", "Word number: 50011\n", "Elapsed time: 174.29 minutes\n", "Words proccesed: 2601024\n", "Word indeks: 50008\n", "Word number: 50008\n", "Elapsed time: 183.20 minutes\n", "Words proccesed: 2651037\n", "Word indeks: 50013\n", "Word number: 50013\n", "Elapsed time: 191.94 minutes\n", "Words proccesed: 2701046\n", "Word indeks: 50009\n", "Word number: 50009\n", "Elapsed time: 200.67 minutes\n", "Words proccesed: 2751050\n", "Word indeks: 50004\n", "Word number: 50004\n", "Elapsed time: 209.40 minutes\n" ] } ], "source": [ "#Words proccesed: 650250\n", "#Word indeks: 50023\n", "#Word number: 50023\n", "\n", "from lxml import etree\n", "import time\n", "\n", "gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n", "word_glob_num = 0\n", "word_limit = 0\n", "iter_num = 50000\n", "word_index = 0\n", "start_timer = time.time()\n", "iter_index = 0\n", "words = []\n", "\n", "lexical_entries_load_number = 0\n", "lexical_entries_save_number = 0\n", "\n", "\n", "# INSIDE\n", "#word_glob_num = 1500686\n", "word_glob_num = 1550705\n", "\n", "#word_limit = 1500686\n", "word_limit = 1550705\n", 
"\n", "\n", "iter_index = 31\n", "\n", "#done_lexical_entries = 33522\n", "\n", "with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n", " myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n", " for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n", " # LOAD NEW WORDS AND ACCENTUATE THEM\n", " #print(\"HERE\")\n", " \n", "# if lexical_entries_save_number < done_lexical_entries:\n", "# next(gen)\n", "# #print(lexical_entries_save_number)\n", "# lexical_entries_save_number += 1\n", "# lexical_entries_load_number += 1\n", "# continue\n", " \n", " if word_glob_num >= word_limit:\n", " myfile2.close()\n", " myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n", " iter_index += 1\n", " print(\"Words proccesed: \" + str(word_glob_num))\n", "\n", " print(\"Word indeks: \" + str(word_index))\n", " print(\"Word number: \" + str(len(words)))\n", " \n", " #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n", " #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n", "\n", " end_timer = time.time()\n", " print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n", "\n", "\n", " word_index = 0\n", " words = []\n", "\n", " while len(words) < iter_num:\n", " try:\n", " words.extend(next(gen))\n", " lexical_entries_load_number += 1\n", " except:\n", " break\n", " #if word_glob_num > 1:\n", " # break\n", "\n", " #problem_words = words\n", " #break\n", " data = Data('l', shuffle_all_inputs=False)\n", " location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n", " letter_type_model, syllable_type_model, syllabled_letter_type_model,\n", " dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n", "\n", " word_limit += len(words)\n", " \n", " \n", " # READ DATA\n", " for child in element:\n", " if child.tag == 'WordForm':\n", " msd = None\n", " word = None\n", " for wf in child:\n", " if wf.tag == 'FormRepresentation':\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglasna_mesta_oblike'\n", " new_element.attrib['val']=location_accented_words[word_index]\n", " wf.append(new_element)\n", "\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglašena_oblika'\n", " new_element.attrib['val']=accented_words[word_index]\n", " wf.append(new_element)\n", " word_glob_num += 1\n", " word_index += 1\n", "\n", " # print(etree.tostring(element, encoding=\"UTF-8\"))\n", " myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " element.clear()\n", " lexical_entries_save_number += 1\n", " " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50052" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(problem_words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%run prepare_data.py\n", "data = Data('l', shuffle_all_inputs=False)\n", "location_accented_words, accented_words = data.accentuate_word(problem_words[:], letter_location_model, syllable_location_model, syllabled_letters_location_model,\n", " letter_type_model, 
syllable_type_model, syllabled_letter_type_model,\n", " dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1558562" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# CALCULATE INDEX NUMBER:\n", "previous_file_len = [622061, 618306, 618266, 618483, 619342]\n", "word_nums = [50017, 50007, 50017, 50012, 50024]\n", "def calculate_index(previous_files_len, word_nums):\n", " return sum(previous_files_len) - 2 * sum(word_nums) + 11\n", "calculate_index(previous_file_len[:3], word_nums[:3])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "250002\n", "250000\n", "50000\n", "0\n" ] } ], "source": [ "print(word_glob_num)\n", "print(word_limit)\n", "print(iter_num)\n", "print(word_index)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "23" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_word" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['zapirati', '', 'Ggnn', 'zapirati']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(\"new_sloleks.xml\", \"ab\") as myfile:\n", " for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n", " # READ DATA\n", " for child in element:\n", " if child.tag == 'WordForm':\n", " msd = None\n", " word = None\n", " for wf in child:\n", "# print(wf.attrib['att'])\n", " if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n", " msd = wf.attrib['val']\n", " elif wf.tag == 'FormRepresentation':\n", " for form_rep in wf:\n", " if form_rep.attrib['att'] == 'zapis_oblike':\n", " word = form_rep.attrib['val']\n", "\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglasna_mesta_oblike'\n", " new_element.attrib['val']='test'\n", " wf.append(new_element)\n", "\n", " new_element = etree.Element('feat')\n", " new_element.attrib['att']='naglašena_oblika'\n", " new_element.attrib['val']='test'\n", " wf.append(new_element)\n", " if msd is not None and word is not None:\n", " print(msd)\n", " print(word)\n", " else:\n", " print('NOOOOO')\n", " print(etree.tostring(element, encoding=\"UTF-8\"))\n", " myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n", " element.clear()\n", " break" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }