stress_asignment/sloleks_accentuation.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import numpy as np
from keras.models import load_model
import sys

from prepare_data import *

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(7)

# Build the lookup structures used later by accentuate_word. Data and its
# helper methods come from the star import of prepare_data.
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
# NOTE: this overwrites the accented_vowels value returned by _create_dict
# above with a fixed list.
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']


# Load three saved models from the word_accetuation directories (one each for
# the letter, syllable and syllabled-letter variants, going by the paths).
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')

# Load the three matching models from the accent_classification directories.
letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
    'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
    'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')

from lxml import etree


def xml_words_generator(xml_path):
    """Stream the word forms of every ``LexicalEntry`` in an XML lexicon.

    The file is parsed incrementally with ``etree.iterparse`` and one list is
    yielded per ``LexicalEntry`` element, in document order.  Each
    ``FormRepresentation`` found directly inside a ``WordForm`` child adds one
    item ``[word, '', msd, word]`` to that list, where:

    * ``msd`` is the ``val`` of the latest preceding sibling in the same
      ``WordForm`` whose ``att`` attribute is ``'msd'`` (``None`` if none);
    * ``word`` is the ``val`` of the latest child seen so far in the same
      ``WordForm`` whose ``att`` attribute is ``'zapis_oblike'``
      (``None`` if none).

    An entry without any form representation yields an empty list, so the
    number of yielded lists equals the number of ``LexicalEntry`` elements.

    :param xml_path: source passed straight to ``etree.iterparse``.
    :return: generator of lists of ``[word, '', msd, word]`` items.
    """
    for _event, entry in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
        words = []
        for child in entry:
            if child.tag != 'WordForm':
                continue
            msd = None
            word = None
            for wf in child:
                if wf.attrib.get('att') == 'msd':
                    msd = wf.attrib['val']
                elif wf.tag == 'FormRepresentation':
                    for form_rep in wf:
                        # .get() so a child without an 'att' attribute is
                        # skipped instead of raising KeyError.
                        if form_rep.attrib.get('att') == 'zapis_oblike':
                            word = form_rep.attrib['val']
                    words.append([word, '', msd, word])

        # Only plain strings were copied into `words`, so the parsed subtree
        # can be released now.  Without this, iterparse keeps every processed
        # entry attached to the root and memory grows with the file size.
        entry.clear()
        parent = entry.getparent()
        if parent is not None:
            while entry.getprevious() is not None:
                del parent[0]

        yield words


# Log output of an earlier run, kept for reference:
# Words proccesed: 650250
# Word indeks: 50023
# Word number: 50023

from lxml import etree
import time
import gc  # kept so the module's set of imported names is unchanged


def _release_element(element):
    """Free an element the iterparse loop is done with, plus its earlier siblings."""
    element.clear()
    parent = element.getparent()
    if parent is not None:
        while element.getprevious() is not None:
            del parent[0]


# Second, independent pass over the same XML file.  It yields the word forms
# of each LexicalEntry and is advanced in lockstep with the iterparse loop
# below, so batch positions line up with the entries being rewritten.
gen = xml_words_generator('data/Sloleks_v1.2.xml')

iter_num = 50000          # a batch is filled until it holds at least this many word forms
start_timer = time.time()
word_index = 0            # position of the next unused word form inside the current batch
words = []                # current batch of [word, '', msd, word] items

lexical_entries_load_number = 0   # entries pulled from `gen` so far
lexical_entries_save_number = 0   # entries handled (skipped or written) by the main loop

# Resume checkpoint, edited by hand before restarting an interrupted run.
# For a run from scratch set all four values to 0.
word_glob_num = 1500686           # word forms written so far (used for logging)
word_limit = 50000                # recomputed on every batch load below
iter_index = 30                   # index of the next per-batch output file p<N>.xml
done_lexical_entries = 33522      # entries already written by earlier runs; skipped here

with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
    myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
    try:
        for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8",
                                              remove_blank_text=True):
            # Entries finished in a previous run: advance the word generator
            # in lockstep and drop the element without writing anything.
            if lexical_entries_save_number < done_lexical_entries:
                next(gen)
                lexical_entries_save_number += 1
                lexical_entries_load_number += 1
                print(lexical_entries_save_number)
                # Releasing the element is what keeps memory flat while
                # skipping; a gc.collect() per entry does not free the tree.
                _release_element(element)
                continue

            # LOAD NEW WORDS AND ACCENTUATE THEM
            # Reload only when the current batch is used up.  Comparing
            # word_glob_num with word_limit breaks on resume: the hand-set
            # values start unequal, so the test would be true for every entry
            # and the batch would no longer match the entries being written.
            if word_index >= len(words):
                myfile2.close()
                myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
                iter_index += 1
                print("Words proccesed: " + str(word_glob_num))

                print("Word indeks: " + str(word_index))
                print("Word number: " + str(len(words)))

                print("lexical_entries_load_number: " + str(lexical_entries_load_number))
                print("lexical_entries_save_number: " + str(lexical_entries_save_number))

                end_timer = time.time()
                print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")

                word_index = 0
                words = []

                # Batches are made of whole entries, so a batch boundary
                # always coincides with an entry boundary in the main loop.
                while len(words) < iter_num:
                    try:
                        entry_words = next(gen)
                    except StopIteration:
                        # Input exhausted: process the final, shorter batch.
                        break
                    words.extend(entry_words)
                    lexical_entries_load_number += 1

                data = Data('l', shuffle_all_inputs=False)
                location_accented_words, accented_words = data.accentuate_word(
                    words, letter_location_model, syllable_location_model,
                    syllabled_letters_location_model,
                    letter_type_model, syllable_type_model, syllabled_letter_type_model,
                    dictionary, max_word, max_num_vowels, vowels, accented_vowels,
                    feature_dictionary, syllable_dictionary)

                # Global word count at which the batch just loaded ends.
                word_limit = word_glob_num + len(words)

            # READ DATA
            # Attach the two accentuation results to every FormRepresentation,
            # consuming the batch in the same order the generator produced it.
            for child in element:
                if child.tag != 'WordForm':
                    continue
                for wf in child:
                    if wf.tag != 'FormRepresentation':
                        continue
                    new_feats = (('naglasna_mesta_oblike', location_accented_words[word_index]),
                                 ('naglašena_oblika', accented_words[word_index]))
                    for att_name, att_value in new_feats:
                        new_element = etree.Element('feat')
                        new_element.attrib['att'] = att_name
                        new_element.attrib['val'] = att_value
                        wf.append(new_element)
                    word_glob_num += 1
                    word_index += 1

            # Serialize once and write the same bytes to both output files.
            serialized = etree.tostring(element, encoding="UTF-8", pretty_print=True)
            myfile2.write(serialized)
            myfile.write(serialized)
            _release_element(element)
            lexical_entries_save_number += 1
    finally:
        # Close the per-batch file even if parsing or accentuation fails.
        myfile2.close()
Commit before major RAM lack update 2018-03-21 10:35:05 +00:00			`# -*- coding: utf-8 -*-`
			`from __future__ import unicode_literals`

			`import numpy as np`
			`from keras.models import load_model`
			`import sys`

			`from prepare_data import *`

			`np.random.seed(7)`

			`data = Data('l', shuffle_all_inputs=False)`
			`content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')`
			`dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)`
			`feature_dictionary = data._create_slovene_feature_dictionary()`
			`syllable_dictionary = data._create_syllables_dictionary(content, vowels)`
			`accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']`


			`letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(`
			`'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',`
			`'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',`
			`'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')`

			`letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(`
			`'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',`
			`'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',`
			`'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')`

			`from lxml import etree`


			`def xml_words_generator(xml_path):`
			`for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):`
			`words = []`
			`for child in element:`
			`if child.tag == 'WordForm':`
			`msd = None`
			`word = None`
			`for wf in child:`
			`if 'att' in wf.attrib and wf.attrib['att'] == 'msd':`
			`msd = wf.attrib['val']`
			`elif wf.tag == 'FormRepresentation':`
			`for form_rep in wf:`
			`if form_rep.attrib['att'] == 'zapis_oblike':`
			`word = form_rep.attrib['val']`
			`# if msd is not None and word is not None:`
			`# pass`
			`# else:`
			`# print('NOOOOO')`
			`words.append([word, '', msd, word])`
			`yield words`


			`gen = xml_words_generator('data/Sloleks_v1.2.xml')`

			`# Words proccesed: 650250`
			`# Word indeks: 50023`
			`# Word number: 50023`

			`from lxml import etree`
			`import time`

			`gen = xml_words_generator('data/Sloleks_v1.2.xml')`
			`word_glob_num = 0`
			`word_limit = 0`
			`iter_num = 50000`
			`word_index = 0`
			`start_timer = time.time()`
			`iter_index = 0`
			`words = []`

			`lexical_entries_load_number = 0`
			`lexical_entries_save_number = 0`

			`# INSIDE`
			`word_glob_num = 1500686`

			`word_limit = 50000`
			`iter_index = 30`

			`done_lexical_entries = 33522`

			`import gc`

			`with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:`
			`myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')`
			`for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):`
			`# LOAD NEW WORDS AND ACCENTUATE THEM`
			`# print("HERE")`

			`if lexical_entries_save_number < done_lexical_entries:`
			`g = next(gen)`
			`# print(lexical_entries_save_number)`
			`lexical_entries_save_number += 1`
			`lexical_entries_load_number += 1`
			`print(lexical_entries_save_number)`
			`del g`
			`gc.collect()`
			`continue`

			`if word_glob_num >= word_limit:`
			`myfile2.close()`
			`myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')`
			`iter_index += 1`
			`print("Words proccesed: " + str(word_glob_num))`

			`print("Word indeks: " + str(word_index))`
			`print("Word number: " + str(len(words)))`

			`print("lexical_entries_load_number: " + str(lexical_entries_load_number))`
			`print("lexical_entries_save_number: " + str(lexical_entries_save_number))`

			`end_timer = time.time()`
			`print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")`

			`word_index = 0`
			`words = []`

			`while len(words) < iter_num:`
			`try:`
			`words.extend(next(gen))`
			`lexical_entries_load_number += 1`
			`except:`
			`break`
			`# if word_glob_num > 1:`
			`# break`

			`data = Data('l', shuffle_all_inputs=False)`
			`location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,`
			`syllabled_letters_location_model,`
			`letter_type_model, syllable_type_model, syllabled_letter_type_model,`
			`dictionary, max_word, max_num_vowels, vowels, accented_vowels,`
			`feature_dictionary, syllable_dictionary)`

			`word_limit += len(words)`

			`# READ DATA`
			`for child in element:`
			`if child.tag == 'WordForm':`
			`msd = None`
			`word = None`
			`for wf in child:`
			`if wf.tag == 'FormRepresentation':`
			`new_element = etree.Element('feat')`
			`new_element.attrib['att'] = 'naglasna_mesta_oblike'`
			`new_element.attrib['val'] = location_accented_words[word_index]`
			`wf.append(new_element)`

			`new_element = etree.Element('feat')`
			`new_element.attrib['att'] = 'naglašena_oblika'`
			`new_element.attrib['val'] = accented_words[word_index]`
			`wf.append(new_element)`
			`word_glob_num += 1`
			`word_index += 1`

			`# print(etree.tostring(element, encoding="UTF-8"))`
			`myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))`
			`myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))`
			`element.clear()`
			`lexical_entries_save_number += 1`