Added fix for gímiljênje.
This commit is contained in:
		
							parent
							
								
									96d03b5e47
								
							
						
					
					
						commit
						def19f1eb1
					
				
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -109,3 +109,5 @@ postprocessing/data_merge_analysis.py | ||||
| data_merge_analysis.py | ||||
| postprocessing/sp_sloleks_data_merge.py | ||||
| sp_sloleks_data_merge.py | ||||
| postprocessing/data_merge_xml2tab.py | ||||
| data_merge_xml2tab.py | ||||
|  | ||||
| @ -5,7 +5,7 @@ import re | ||||
| 
 | ||||
| from lxml import etree | ||||
| import time | ||||
| from prepare_data import * | ||||
| # from prepare_data import * | ||||
| 
 | ||||
| accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù'] | ||||
| 
 | ||||
| @ -77,15 +77,15 @@ word_index = 0 | ||||
| # iter_index = 31 | ||||
| 
 | ||||
| # done_lexical_entries = 33522 | ||||
| data = Data('s', shuffle_all_inputs=False) | ||||
| # data = Data('s', shuffle_all_inputs=False) | ||||
| # accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab') | ||||
| 
 | ||||
| start_timer = time.time() | ||||
| lemmas = 0 | ||||
| print('Copy initialization complete') | ||||
| with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile: | ||||
| with open("data/contextual_changes/stressed_lemmas_sloleks2.xml", "ab") as myfile: | ||||
|     # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') | ||||
|     for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): | ||||
|     for event, element in etree.iterparse('data/contextual_changes/final_sloleks2_inhouse2S.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): | ||||
|     # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): | ||||
|         # if word_glob_num >= word_limit: | ||||
|         #     myfile2.close() | ||||
| @ -1204,7 +1204,7 @@ class Data: | ||||
|         else: | ||||
|             word_list = list(word[::-1]) | ||||
|         for i in range(len(word_list)): | ||||
|             if self._is_vowel(word_list, i, vowels): | ||||
|             if self._is_vowel(word_list, i, vowels + accented_vowels): | ||||
|                 if location == vowel_index: | ||||
|                     if len(np.where(y == 1)[0]) == 1: | ||||
|                         word_list[i] = accented_vowels[np.where(y == 1)[0][0]] | ||||
|  | ||||
| @ -41,13 +41,14 @@ letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = d | ||||
|     'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5') | ||||
| 
 | ||||
| data = Data('s', shuffle_all_inputs=False) | ||||
| new_content = data._read_content('data/sloleks-sl_v1.2.tbl') | ||||
| # new_content = data._read_content('data/sloleks-sl_v1.2.tbl') | ||||
| new_content = data._read_content('data/contextual_changes/small/sloleks-sl_v1.2.tbl') | ||||
| 
 | ||||
| print('Commencing accentuator!') | ||||
| 
 | ||||
| rate = 100000 | ||||
| start_timer = time.time() | ||||
| with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile: | ||||
| with open("data/contextual_changes/small/new_sloleks2_small2.tab", "a") as myfile: | ||||
|     for index in range(0, len(new_content), rate): | ||||
|         if index+rate >= len(new_content): | ||||
|             words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)] | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user