stress_asignment/sloleks_accetuation.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "from __future__ import unicode_literals\n",
    "\n",
    "import numpy as np\n",
    "from keras.models import load_model\n",
    "import sys\n",
    "import pickle\n",
    "\n",
    "from prepare_data import *\n",
    "\n",
    "np.random.seed(7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = Data('l', shuffle_all_inputs=False)\n",
    "content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')\n",
    "dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)\n",
    "feature_dictionary = data._create_slovene_feature_dictionary()\n",
    "syllable_dictionary = data._create_syllables_dictionary(content, vowels)\n",
    "accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "environment = {}\n",
    "environment['dictionary'] = dictionary\n",
    "environment['max_word'] = max_word\n",
    "environment['max_num_vowels'] = max_num_vowels\n",
    "environment['vowels'] = vowels\n",
    "environment['accented_vowels'] = accented_vowels\n",
    "environment['feature_dictionary'] = feature_dictionary\n",
    "environment['eng_feature_dictionary'] = feature_dictionary\n",
    "environment['syllable_dictionary'] = syllable_dictionary\n",
    "output = open('environment.pkl', 'wb')\n",
    "pickle.dump(environment, output)\n",
    "output.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "407\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "for el in syllable_dictionary:\n",
    "    if el == \"da\":\n",
    "        print(i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(\n",
    "    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',\n",
    "    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',\n",
    "    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')\n",
    "\n",
    "letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(\n",
    "    'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',\n",
    "    'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',\n",
    "    'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_input = [['uradni', '', 'Agpmpn', 'uradni'], ['podatki', '', 'Ncmpn', 'podatki'], ['policije', '', 'Ncfsg', 'policije'], ['kažejo', '', 'Vmpr3p', 'kažejo'], ['na', '', 'Sa', 'na'], ['precej', '', 'Rgp', 'precej'], ['napete', '', 'Appfpa', 'napete'], ['razmere', '', 'Ncfpa', 'razmere'], ['v', '', 'Sl', 'v'], ['piranskem', '', 'Agpmsl', 'piranskem'], ['zalivu', '', 'Ncmsl', 'zalivu'], ['je', '', 'Va-r3s-n', 'je'], ['danes', '', 'Rgp', 'danes'], ['poročala', '', 'Vmpp-sf', 'poročala'], ['oddaja', '', 'Ncfsn', 'oddaja'], ['do', '', 'Sg', 'do'], ['danes', '', 'Rgp', 'danes'], ['se', '', 'Px------y', 'se'], ['je', '', 'Va-r3s-n', 'je'], ['zgodilo', '', 'Vmep-sn', 'zgodilo']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%run prepare_data.py\n",
    "data = Data('s', shuffle_all_inputs=False)\n",
    "location_accented_words, accented_words = data.accentuate_word(test_input, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
    "                        letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
    "                        dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['uradni', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n",
      "['uradni', 'pödatki', 'polícije', 'kažëjo', 'ná', 'prëcej', 'nápete', 'räzmere', 'v', 'pîranskem', 'zálivu', 'jë', 'dánes', 'poróčala', 'öddaja', 'dó', 'dánes', 'së', 'jë', 'zgodílo']\n"
     ]
    }
   ],
   "source": [
    "print(location_accented_words)\n",
    "print(accented_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CREATING OTHER FEATURES...\n",
      "OTHER FEATURES CREATED!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'nädnarávnih'"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def predict_word(word_acentuation_model, accent_type_model, word, msd, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary):\n",
    "    eye_input_accent = np.eye(10, dtype=int)\n",
    "    \n",
    "    english_msd = msd\n",
    "    fake_content = [[word, '-', msd, '-']]\n",
    "    x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, fake_content, vowels, accented_vowels, feature_dictionary, 'who cares')\n",
    "#     print(x)\n",
    "    accent_loc = word_acentuation_model.predict([x, x_other_features])\n",
    "    \n",
    "    j = 0\n",
    "    word=list(word)[::-1]\n",
    "#     print(word)\n",
    "#     print(accent_loc)\n",
    "    \n",
    "    for i in range(len(word)):\n",
    "        if data._is_vowel(word, i, vowels):\n",
    "            if accent_loc[0][j] >= 0.5:\n",
    "    #             print(x_other_features[0])\n",
    "    #             print(eye_input_accent[i])\n",
    "                new_x_other_features = np.array([np.concatenate((x_other_features[0], eye_input_accent[j]))])\n",
    "    #             print(x_other_features)\n",
    "    #             print(new_x_other_features)\n",
    "                final_accent = accent_type_model.predict([x, new_x_other_features])\n",
    "#                 print(accented_vowels[final_accent[0].argmax(axis=0)])\n",
    "                word[i] = accented_vowels[final_accent[0].argmax(axis=0)]\n",
    "#                 print(final_accent)\n",
    "            j += 1\n",
    "\n",
    "    \n",
    "    \n",
    "    return ''.join(word[::-1])\n",
    "\n",
    "predict_word(word_acentuation_model, accent_type_model, 'nadnaravnih', 'Afpfdg', dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from lxml import etree\n",
    "\n",
    "def xml_words_generator(xml_path):\n",
    "    for event, element in etree.iterparse(xml_path, tag=\"LexicalEntry\", encoding=\"UTF-8\"):\n",
    "        words = []\n",
    "        for child in element:\n",
    "            if child.tag == 'WordForm':\n",
    "                msd = None\n",
    "                word = None\n",
    "                for wf in child:\n",
    "                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n",
    "                        msd = wf.attrib['val']\n",
    "                    elif wf.tag == 'FormRepresentation':\n",
    "                        for form_rep in wf:\n",
    "                            if form_rep.attrib['att'] == 'zapis_oblike':\n",
    "                                word = form_rep.attrib['val']\n",
    "                        #if msd is not None and word is not None:\n",
    "                        #    pass\n",
    "                        #else:\n",
    "                        #    print('NOOOOO')\n",
    "                        words.append([word, '', msd, word])\n",
    "        yield words\n",
    "        \n",
    "gen = xml_words_generator('data/Sloleks_v1.2.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# SPLIT ALL TEXT!!!\n",
    "NUM_OF_LINES=16660466\n",
    "filename = 'data/Sloleks_v1.2.xml'\n",
    "with open(filename) as fin:\n",
    "    fout = open('data/Sloleks_v1.2_p2.xml',\"a\")\n",
    "    for i,line in enumerate(fin):\n",
    "        if NUM_OF_LINES < i:\n",
    "            fout.write(line)\n",
    "            fout.close()\n",
    "            fout = open('data/Sloleks_v1.2_p2.xml',\"a\")\n",
    "\n",
    "    fout.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50017\n"
     ]
    }
   ],
   "source": [
    "words = []\n",
    "while len(words) < 50000:\n",
    "    words.extend(next(gen))\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[21, 'A', ['g', 's'], ['p', 'c', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [3, 'C', ['c', 's']], [1, 'I'], [21, 'M', ['l'], ['-', 'c', 'o', 's'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [17, 'N', ['c'], ['m', 'f', 'n'], ['s', 'd', 'p'], ['n', 'g', 'd', 'a', 'l', 'i'], ['-', 'n', 'y']], [40, 'P', ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], ['-', '1', '2', '3'], ['-', 'm', 'f', 'n'], ['-', 's', 'd', 'p'], ['-', 'n', 'g', 'd', 'a', 'l', 'i'], ['-', 's', 'd', 'p'], ['-', 'm', 'f', 'n'], ['-', 'y', 'b']], [1, 'Q'], [5, 'R', ['g'], ['p', 'c', 's']], [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], [24, 'V', ['m'], ['-'], ['n', 'u', 'p', 'r', 'f', 'c'], ['-', '1', '2', '3'], ['-', 's', 'p', 'd'], ['-', 'm', 'f', 'n'], ['-', 'n', 'y']]]\n",
      "[[21, 'P', ['p', 's'], ['n', 'p', 's'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [3, 'V', ['p', 'd']], [1, 'M'], [21, 'K', ['b'], ['-', 'g', 'v', 'd'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [17, 'S', ['o'], ['m', 'z', 's'], ['e', 'd', 'm'], ['i', 'r', 'd', 't', 'm', 'o'], ['-', 'n', 'd']], [40, 'Z', ['o', 's', 'k', 'z', 'p', 'c', 'v', 'n', 'l'], ['-', 'p', 'd', 't'], ['-', 'm', 'z', 's'], ['-', 'e', 'd', 'm'], ['-', 'i', 'r', 'd', 't', 'm', 'o'], ['-', 'e', 'd', 'm'], ['-', 'm', 'z', 's'], ['-', 'k', 'z']], [1, 'L'], [5, 'R', ['s'], ['n', 'r', 's']], [7, 'D', ['-', 'r', 'd', 't', 'm', 'o']], [24, 'G', ['g'], ['-'], ['n', 'm', 'd', 's', 'p', 'g'], ['-', 'p', 'd', 't'], ['-', 'e', 'm', 'd'], ['-', 'm', 'z', 's'], ['-', 'n', 'd']]]\n"
     ]
    }
   ],
   "source": [
    "print(feature__en_dictionary)\n",
    "print(feature__slo_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']\n",
    "words = [[\"Gorejevemu\", \"\", \"Psnsed\", \"Gorejevemu\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%run prepare_data.py\n",
    "data = Data('l', shuffle_all_inputs=False)\n",
    "location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
    "                        letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
    "                        dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Gorejévemu']\n",
      "['Gorejěvemu']\n",
      "[['Gorejevemu', '', 'Psnsed', 'Gorejevemu']]\n"
     ]
    }
   ],
   "source": [
    "pos = 4282\n",
    "print(location_accented_words)\n",
    "print(accented_words)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'wait' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-165-0a2d5e185f36>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'wait' is not defined"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Words proccesed: 0\n",
      "Word indeks: 0\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'words' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-0e24b34aba55>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     27\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word indeks: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Word number: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0mend_timer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'words' is not defined"
     ]
    }
   ],
   "source": [
    "from lxml import etree\n",
    "import time\n",
    "\n",
    "gen = xml_words_generator('data/Sloleks_v1.2.xml')\n",
    "word_glob_num = 0\n",
    "word_limit = 0\n",
    "iter_num = 50000\n",
    "word_index = 0\n",
    "start_timer = time.time()\n",
    "iter_index = 0\n",
    "myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
    "#with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
    "\n",
    "enable_print = False\n",
    "\n",
    "for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
    "    # LOAD NEW WORDS AND ACCENTUATE THEM\n",
    "    if word_glob_num >= word_limit:\n",
    "        myfile.close()\n",
    "        myfile = open('data/new_sloleks/test' + str(iter_index) + '.xml', 'ab')\n",
    "        #if iter_index == 5:\n",
    "        #    break\n",
    "        \n",
    "        iter_index += 1\n",
    "        print(\"Words proccesed: \" + str(word_glob_num))\n",
    "        \n",
    "        print(\"Word indeks: \" + str(word_index))\n",
    "        print(\"Word number: \" + str(len(words)))\n",
    "        \n",
    "        end_timer = time.time()\n",
    "        print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
    "        \n",
    "        \n",
    "        word_index = 0\n",
    "        words = []\n",
    "        #print(\"HERE!!!\")\n",
    "        while len(words) < iter_num:\n",
    "            try:\n",
    "                words.extend(next(gen))\n",
    "            except:\n",
    "                break\n",
    "        #print(\"HERE!!!\")\n",
    "        #if word_glob_num > 1:\n",
    "        #    break\n",
    "\n",
    "        word_limit += len(words)\n",
    "    #print(\"HERE!!!\")\n",
    "    \n",
    "    \n",
    "    \n",
    "    for child in element:\n",
    "        if child.tag == 'WordForm':\n",
    "            #msd = None\n",
    "            #word = None\n",
    "            for wf in child:\n",
    "                if wf.tag == 'FormRepresentation':\n",
    "                    sloleks_word = None\n",
    "                    for form_rep in wf:\n",
    "                        if form_rep.attrib['att'] == 'zapis_oblike':\n",
    "                            sloleks_word = form_rep.attrib['val']\n",
    "                    \n",
    "                    if sloleks_word != words[word_index][0]:\n",
    "                        print(sloleks_word)\n",
    "                        print(words[word_index][0])\n",
    "                        print(word_index)\n",
    "                            \n",
    "                    \n",
    "                    #if sloleks_word == \n",
    "                    new_element = etree.Element('feat')\n",
    "                    new_element.attrib['att']='naglasna_mesta_oblike'\n",
    "                    new_element.attrib['val']=words[word_index][0]\n",
    "                    wf.append(new_element)\n",
    "\n",
    "                    new_element = etree.Element('feat')\n",
    "                    new_element.attrib['att']='naglašena_oblika'\n",
    "                    new_element.attrib['val']=words[word_index][0]\n",
    "                    wf.append(new_element)\n",
    "                    word_glob_num += 1\n",
    "                    word_index += 1\n",
    "\n",
    "    #myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "    element.clear()\n",
    "        \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'xml_words_generator' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-8-44b0367c6cbf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mgen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxml_words_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/Sloleks_v1.2_p2.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0mword_glob_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0mword_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'xml_words_generator' is not defined"
     ]
    }
   ],
   "source": [
    "#Words proccesed: 650250\n",
    "#Word indeks: 50023\n",
    "#Word number: 50023\n",
    "\n",
    "from lxml import etree\n",
    "import time\n",
    "\n",
    "gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
    "word_glob_num = 0\n",
    "word_limit = 0\n",
    "iter_num = 50000\n",
    "word_index = 0\n",
    "start_timer = time.time()\n",
    "iter_index = 0\n",
    "words = []\n",
    "\n",
    "lexical_entries_load_number = 0\n",
    "lexical_entries_save_number = 0\n",
    "\n",
    "\n",
    "# INSIDE\n",
    "#word_glob_num = 1500686\n",
    "word_glob_num = 1550705\n",
    "\n",
    "#word_limit = 1500686\n",
    "word_limit = 1550705\n",
    "\n",
    "\n",
    "iter_index = 31\n",
    "\n",
    "#done_lexical_entries = 33522\n",
    "\n",
    "with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
    "    myfile2 = open('data/new_sloleks/pa' + str(iter_index) + '.xml', 'ab')\n",
    "    for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
    "        # LOAD NEW WORDS AND ACCENTUATE THEM\n",
    "        #print(\"HERE\")\n",
    "        \n",
    "#        if lexical_entries_save_number < done_lexical_entries:\n",
    "#            next(gen)\n",
    "#            #print(lexical_entries_save_number)\n",
    "#            lexical_entries_save_number += 1\n",
    "#            lexical_entries_load_number += 1\n",
    "#            continue\n",
    "        \n",
    "        if word_glob_num >= word_limit:\n",
    "            myfile2.close()\n",
    "            myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
    "            iter_index += 1\n",
    "            print(\"Words proccesed: \" + str(word_glob_num))\n",
    "\n",
    "            print(\"Word indeks: \" + str(word_index))\n",
    "            print(\"Word number: \" + str(len(words)))\n",
    "            \n",
    "            #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n",
    "            #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n",
    "\n",
    "            end_timer = time.time()\n",
    "            print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
    "\n",
    "\n",
    "            word_index = 0\n",
    "            words = []\n",
    "\n",
    "            while len(words) < iter_num:\n",
    "                try:\n",
    "                    words.extend(next(gen))\n",
    "                    lexical_entries_load_number += 1\n",
    "                except:\n",
    "                    break\n",
    "            #if word_glob_num > 1:\n",
    "            #    break\n",
    "\n",
    "            #problem_words = words\n",
    "            #break\n",
    "            data = Data('l', shuffle_all_inputs=False)\n",
    "            location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
    "                                    letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
    "                                    dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
    "\n",
    "            word_limit += len(words)\n",
    "            \n",
    "        \n",
    "        # READ DATA\n",
    "        for child in element:\n",
    "            if child.tag == 'WordForm':\n",
    "                msd = None\n",
    "                word = None\n",
    "                for wf in child:\n",
    "                    if wf.tag == 'FormRepresentation':\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglasna_mesta_oblike'\n",
    "                        new_element.attrib['val']=location_accented_words[word_index]\n",
    "                        wf.append(new_element)\n",
    "\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglašena_oblika'\n",
    "                        new_element.attrib['val']=accented_words[word_index]\n",
    "                        wf.append(new_element)\n",
    "                        word_glob_num += 1\n",
    "                        word_index += 1\n",
    "\n",
    "        # print(etree.tostring(element, encoding=\"UTF-8\"))\n",
    "        myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "        myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "        element.clear()\n",
    "        lexical_entries_save_number += 1\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "problem_words = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Words proccesed: 1550705\n",
      "Word indeks: 0\n",
      "Word number: 0\n",
      "Elapsed time: 0.00 minutes\n",
      "Words proccesed: 1600757\n",
      "Word indeks: 50052\n",
      "Word number: 50052\n",
      "Elapsed time: 9.39 minutes\n",
      "Words proccesed: 1650762\n",
      "Word indeks: 50005\n",
      "Word number: 50005\n",
      "Elapsed time: 18.22 minutes\n",
      "Words proccesed: 1700781\n",
      "Word indeks: 50019\n",
      "Word number: 50019\n",
      "Elapsed time: 27.47 minutes\n",
      "Words proccesed: 1750833\n",
      "Word indeks: 50052\n",
      "Word number: 50052\n",
      "Elapsed time: 36.58 minutes\n",
      "Words proccesed: 1800864\n",
      "Word indeks: 50031\n",
      "Word number: 50031\n",
      "Elapsed time: 45.39 minutes\n",
      "Words proccesed: 1850886\n",
      "Word indeks: 50022\n",
      "Word number: 50022\n",
      "Elapsed time: 54.31 minutes\n",
      "Words proccesed: 1900898\n",
      "Word indeks: 50012\n",
      "Word number: 50012\n",
      "Elapsed time: 62.81 minutes\n",
      "Words proccesed: 1950911\n",
      "Word indeks: 50013\n",
      "Word number: 50013\n",
      "Elapsed time: 70.84 minutes\n",
      "Words proccesed: 2000920\n",
      "Word indeks: 50009\n",
      "Word number: 50009\n",
      "Elapsed time: 79.08 minutes\n",
      "Words proccesed: 2050927\n",
      "Word indeks: 50007\n",
      "Word number: 50007\n",
      "Elapsed time: 87.50 minutes\n",
      "Words proccesed: 2100944\n",
      "Word indeks: 50017\n",
      "Word number: 50017\n",
      "Elapsed time: 95.62 minutes\n",
      "Words proccesed: 2150949\n",
      "Word indeks: 50005\n",
      "Word number: 50005\n",
      "Elapsed time: 104.08 minutes\n",
      "Words proccesed: 2200958\n",
      "Word indeks: 50009\n",
      "Word number: 50009\n",
      "Elapsed time: 112.44 minutes\n",
      "Words proccesed: 2250969\n",
      "Word indeks: 50011\n",
      "Word number: 50011\n",
      "Elapsed time: 120.68 minutes\n",
      "Words proccesed: 2300978\n",
      "Word indeks: 50009\n",
      "Word number: 50009\n",
      "Elapsed time: 129.58 minutes\n",
      "Words proccesed: 2350986\n",
      "Word indeks: 50008\n",
      "Word number: 50008\n",
      "Elapsed time: 139.40 minutes\n",
      "Words proccesed: 2400993\n",
      "Word indeks: 50007\n",
      "Word number: 50007\n",
      "Elapsed time: 148.05 minutes\n",
      "Words proccesed: 2451000\n",
      "Word indeks: 50007\n",
      "Word number: 50007\n",
      "Elapsed time: 156.79 minutes\n",
      "Words proccesed: 2501005\n",
      "Word indeks: 50005\n",
      "Word number: 50005\n",
      "Elapsed time: 165.57 minutes\n",
      "Words proccesed: 2551016\n",
      "Word indeks: 50011\n",
      "Word number: 50011\n",
      "Elapsed time: 174.29 minutes\n",
      "Words proccesed: 2601024\n",
      "Word indeks: 50008\n",
      "Word number: 50008\n",
      "Elapsed time: 183.20 minutes\n",
      "Words proccesed: 2651037\n",
      "Word indeks: 50013\n",
      "Word number: 50013\n",
      "Elapsed time: 191.94 minutes\n",
      "Words proccesed: 2701046\n",
      "Word indeks: 50009\n",
      "Word number: 50009\n",
      "Elapsed time: 200.67 minutes\n",
      "Words proccesed: 2751050\n",
      "Word indeks: 50004\n",
      "Word number: 50004\n",
      "Elapsed time: 209.40 minutes\n"
     ]
    }
   ],
   "source": [
    "#Words proccesed: 650250\n",
    "#Word indeks: 50023\n",
    "#Word number: 50023\n",
    "\n",
    "from lxml import etree\n",
    "import time\n",
    "\n",
    "gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
    "word_glob_num = 0\n",
    "word_limit = 0\n",
    "iter_num = 50000\n",
    "word_index = 0\n",
    "start_timer = time.time()\n",
    "iter_index = 0\n",
    "words = []\n",
    "\n",
    "lexical_entries_load_number = 0\n",
    "lexical_entries_save_number = 0\n",
    "\n",
    "\n",
    "# INSIDE\n",
    "#word_glob_num = 1500686\n",
    "word_glob_num = 1550705\n",
    "\n",
    "#word_limit = 1500686\n",
    "word_limit = 1550705\n",
    "\n",
    "\n",
    "iter_index = 31\n",
    "\n",
    "#done_lexical_entries = 33522\n",
    "\n",
    "with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
    "    myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
    "    for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
    "        # LOAD NEW WORDS AND ACCENTUATE THEM\n",
    "        #print(\"HERE\")\n",
    "        \n",
    "#        if lexical_entries_save_number < done_lexical_entries:\n",
    "#            next(gen)\n",
    "#            #print(lexical_entries_save_number)\n",
    "#            lexical_entries_save_number += 1\n",
    "#            lexical_entries_load_number += 1\n",
    "#            continue\n",
    "        \n",
    "        if word_glob_num >= word_limit:\n",
    "            myfile2.close()\n",
    "            myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
    "            iter_index += 1\n",
    "            print(\"Words proccesed: \" + str(word_glob_num))\n",
    "\n",
    "            print(\"Word indeks: \" + str(word_index))\n",
    "            print(\"Word number: \" + str(len(words)))\n",
    "            \n",
    "            #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n",
    "            #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n",
    "\n",
    "            end_timer = time.time()\n",
    "            print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
    "\n",
    "\n",
    "            word_index = 0\n",
    "            words = []\n",
    "\n",
    "            while len(words) < iter_num:\n",
    "                try:\n",
    "                    words.extend(next(gen))\n",
    "                    lexical_entries_load_number += 1\n",
    "                except:\n",
    "                    break\n",
    "            #if word_glob_num > 1:\n",
    "            #    break\n",
    "\n",
    "            #problem_words = words\n",
    "            #break\n",
    "            data = Data('l', shuffle_all_inputs=False)\n",
    "            location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
    "                                    letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
    "                                    dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
    "\n",
    "            word_limit += len(words)\n",
    "            \n",
    "        \n",
    "        # READ DATA\n",
    "        for child in element:\n",
    "            if child.tag == 'WordForm':\n",
    "                msd = None\n",
    "                word = None\n",
    "                for wf in child:\n",
    "                    if wf.tag == 'FormRepresentation':\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglasna_mesta_oblike'\n",
    "                        new_element.attrib['val']=location_accented_words[word_index]\n",
    "                        wf.append(new_element)\n",
    "\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglašena_oblika'\n",
    "                        new_element.attrib['val']=accented_words[word_index]\n",
    "                        wf.append(new_element)\n",
    "                        word_glob_num += 1\n",
    "                        word_index += 1\n",
    "\n",
    "        # print(etree.tostring(element, encoding=\"UTF-8\"))\n",
    "        myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "        myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "        element.clear()\n",
    "        lexical_entries_save_number += 1\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50052"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(problem_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%run prepare_data.py\n",
    "data = Data('l', shuffle_all_inputs=False)\n",
    "location_accented_words, accented_words = data.accentuate_word(problem_words[:], letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
    "                                    letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
    "                                    dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1558562"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CALCULATE INDEX NUMBER:\n",
    "previous_file_len = [622061, 618306, 618266, 618483, 619342]\n",
    "word_nums = [50017, 50007, 50017, 50012, 50024]\n",
    "def calculate_index(previous_files_len, word_nums):\n",
    "    return sum(previous_files_len) - 2 * sum(word_nums) + 11\n",
    "calculate_index(previous_file_len[:3], word_nums[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "250002\n",
      "250000\n",
      "50000\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "print(word_glob_num)\n",
    "print(word_limit)\n",
    "print(iter_num)\n",
    "print(word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zapirati', '', 'Ggnn', 'zapirati']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(\"new_sloleks.xml\", \"ab\") as myfile:\n",
    "    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
    "        # READ DATA\n",
    "        for child in element:\n",
    "            if child.tag == 'WordForm':\n",
    "                msd = None\n",
    "                word = None\n",
    "                for wf in child:\n",
    "#                     print(wf.attrib['att'])\n",
    "                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':\n",
    "                        msd = wf.attrib['val']\n",
    "                    elif wf.tag == 'FormRepresentation':\n",
    "                        for form_rep in wf:\n",
    "                            if form_rep.attrib['att'] == 'zapis_oblike':\n",
    "                                word = form_rep.attrib['val']\n",
    "\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglasna_mesta_oblike'\n",
    "                        new_element.attrib['val']='test'\n",
    "                        wf.append(new_element)\n",
    "\n",
    "                        new_element = etree.Element('feat')\n",
    "                        new_element.attrib['att']='naglašena_oblika'\n",
    "                        new_element.attrib['val']='test'\n",
    "                        wf.append(new_element)\n",
    "                if msd is not None and word is not None:\n",
    "                    print(msd)\n",
    "                    print(word)\n",
    "                else:\n",
    "                    print('NOOOOO')\n",
    "        print(etree.tostring(element, encoding=\"UTF-8\"))\n",
    "        myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
    "        element.clear()\n",
    "        break"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}