Commit before major RAM lack update

2018-03-21 11:35:05 +01:00
parent a316574314
commit 9edad0ad07
4 changed files with 1367 additions and 1108 deletions
--- a/sloleks_accentuation.py
+++ b/sloleks_accentuation.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import numpy as np
+from keras.models import load_model
+import sys
+
+from prepare_data import *
+
+np.random.seed(7)
+
+data = Data('l', shuffle_all_inputs=False)
+content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
+dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
+feature_dictionary = data._create_slovene_feature_dictionary()
+syllable_dictionary = data._create_syllables_dictionary(content, vowels)
+accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']
+
+
+letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
+    'cnn/word_accetuation/cnn_dictionary/v3_10/20_test_epoch.h5',
+    'cnn/word_accetuation/syllables/v2_4/20_test_epoch.h5',
+    'cnn/word_accetuation/syllabled_letters/v2_5_3/20_test_epoch.h5')
+
+letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
+    'cnn/accent_classification/letters/v2_1/20_test_epoch.h5',
+    'cnn/accent_classification/syllables/v1_0/20_test_epoch.h5',
+    'cnn/accent_classification/syllabled_letters/v1_0/20_test_epoch.h5')
+
+from lxml import etree
+
+
+def xml_words_generator(xml_path):
+    for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
+        words = []
+        for child in element:
+            if child.tag == 'WordForm':
+                msd = None
+                word = None
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+                        msd = wf.attrib['val']
+                    elif wf.tag == 'FormRepresentation':
+                        for form_rep in wf:
+                            if form_rep.attrib['att'] == 'zapis_oblike':
+                                word = form_rep.attrib['val']
+                        # if msd is not None and word is not None:
+                        #    pass
+                        # else:
+                        #    print('NOOOOO')
+                        words.append([word, '', msd, word])
+        yield words
+
+
+gen = xml_words_generator('data/Sloleks_v1.2.xml')
+
+# Words proccesed: 650250
+# Word indeks: 50023
+# Word number: 50023
+
+from lxml import etree
+import time
+
+gen = xml_words_generator('data/Sloleks_v1.2.xml')
+word_glob_num = 0
+word_limit = 0
+iter_num = 50000
+word_index = 0
+start_timer = time.time()
+iter_index = 0
+words = []
+
+lexical_entries_load_number = 0
+lexical_entries_save_number = 0
+
+# INSIDE
+word_glob_num = 1500686
+
+word_limit = 50000
+iter_index = 30
+
+done_lexical_entries = 33522
+
+import gc
+
+with open("data/new_sloleks/new_sloleks.xml", "ab") as myfile:
+    myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+    for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
+        # LOAD NEW WORDS AND ACCENTUATE THEM
+        # print("HERE")
+
+        if lexical_entries_save_number < done_lexical_entries:
+            g = next(gen)
+            # print(lexical_entries_save_number)
+            lexical_entries_save_number += 1
+            lexical_entries_load_number += 1
+            print(lexical_entries_save_number)
+            del g
+            gc.collect()
+            continue
+
+        if word_glob_num >= word_limit:
+            myfile2.close()
+            myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+            iter_index += 1
+            print("Words proccesed: " + str(word_glob_num))
+
+            print("Word indeks: " + str(word_index))
+            print("Word number: " + str(len(words)))
+
+            print("lexical_entries_load_number: " + str(lexical_entries_load_number))
+            print("lexical_entries_save_number: " + str(lexical_entries_save_number))
+
+            end_timer = time.time()
+            print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+
+            word_index = 0
+            words = []
+
+            while len(words) < iter_num:
+                try:
+                    words.extend(next(gen))
+                    lexical_entries_load_number += 1
+                except:
+                    break
+            # if word_glob_num > 1:
+            #    break
+
+            data = Data('l', shuffle_all_inputs=False)
+            location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model,
+                                                                           syllabled_letters_location_model,
+                                                                           letter_type_model, syllable_type_model, syllabled_letter_type_model,
+                                                                           dictionary, max_word, max_num_vowels, vowels, accented_vowels,
+                                                                           feature_dictionary, syllable_dictionary)
+
+            word_limit += len(words)
+
+        # READ DATA
+        for child in element:
+            if child.tag == 'WordForm':
+                msd = None
+                word = None
+                for wf in child:
+                    if wf.tag == 'FormRepresentation':
+                        new_element = etree.Element('feat')
+                        new_element.attrib['att'] = 'naglasna_mesta_oblike'
+                        new_element.attrib['val'] = location_accented_words[word_index]
+                        wf.append(new_element)
+
+                        new_element = etree.Element('feat')
+                        new_element.attrib['att'] = 'naglašena_oblika'
+                        new_element.attrib['val'] = accented_words[word_index]
+                        wf.append(new_element)
+                        word_glob_num += 1
+                        word_index += 1
+
+        # print(etree.tostring(element, encoding="UTF-8"))
+        myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        element.clear()
+        lexical_entries_save_number += 1