From def19f1eb15d18ac0d471fb5480bb836bd56fd89 Mon Sep 17 00:00:00 2001
From: Luka
Date: Mon, 1 Oct 2018 13:09:57 +0200
Subject: [PATCH] =?UTF-8?q?Added=20fix=20for=20g=C3=ADmilj=C3=AAnje."?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                             | 2 ++
 .../assign_stress2lemmas.py => assign_stress2lemmas.py | 8 ++++----
 prepare_data.py                                        | 2 +-
 sloleks_accentuation2.py                               | 5 +++--
 4 files changed, 10 insertions(+), 7 deletions(-)
 rename postprocessing/assign_stress2lemmas.py => assign_stress2lemmas.py (95%)

diff --git a/.gitignore b/.gitignore
index d0a5416..5ff6846 100755
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,5 @@ postprocessing/data_merge_analysis.py
 data_merge_analysis.py
 postprocessing/sp_sloleks_data_merge.py
 sp_sloleks_data_merge.py
+postprocessing/data_merge_xml2tab.py
+data_merge_xml2tab.py
diff --git a/postprocessing/assign_stress2lemmas.py b/assign_stress2lemmas.py
similarity index 95%
rename from postprocessing/assign_stress2lemmas.py
rename to assign_stress2lemmas.py
index 66d579b..1cede29 100755
--- a/postprocessing/assign_stress2lemmas.py
+++ b/assign_stress2lemmas.py
@@ -5,7 +5,7 @@ import re
 from lxml import etree
 import time
 
-from prepare_data import *
+# from prepare_data import *
 
 accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']
 
@@ -77,15 +77,15 @@ word_index = 0
 # iter_index = 31
 # done_lexical_entries = 33522
 
-data = Data('s', shuffle_all_inputs=False)
+# data = Data('s', shuffle_all_inputs=False)
 # accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')
 
 start_timer = time.time()
 lemmas = 0
 print('Copy initialization complete')
-with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:
+with open("data/contextual_changes/stressed_lemmas_sloleks2.xml", "ab") as myfile:
     # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
-    for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
+    for event, element in etree.iterparse('data/contextual_changes/final_sloleks2_inhouse2S.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
     # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
         # if word_glob_num >= word_limit:
         #     myfile2.close()
diff --git a/prepare_data.py b/prepare_data.py
index 50f640a..dc9e53a 100755
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -1204,7 +1204,7 @@ class Data:
             else:
                 word_list = list(word[::-1])
             for i in range(len(word_list)):
-                if self._is_vowel(word_list, i, vowels):
+                if self._is_vowel(word_list, i, vowels + accented_vowels):
                     if location == vowel_index:
                         if len(np.where(y == 1)[0]) == 1:
                             word_list[i] = accented_vowels[np.where(y == 1)[0][0]]
diff --git a/sloleks_accentuation2.py b/sloleks_accentuation2.py
index 8b9c0ac..3a33d2e 100755
--- a/sloleks_accentuation2.py
+++ b/sloleks_accentuation2.py
@@ -41,13 +41,14 @@ letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = d
     'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')
 
 data = Data('s', shuffle_all_inputs=False)
-new_content = data._read_content('data/sloleks-sl_v1.2.tbl')
+# new_content = data._read_content('data/sloleks-sl_v1.2.tbl')
+new_content = data._read_content('data/contextual_changes/small/sloleks-sl_v1.2.tbl')
 
 print('Commencing accentuator!')
 rate = 100000
 start_timer = time.time()
-with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile:
+with open("data/contextual_changes/small/new_sloleks2_small2.tab", "a") as myfile:
     for index in range(0, len(new_content), rate):
         if index+rate >= len(new_content):
             words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)]