Fixed wrong stress location assignment of new stress type
This commit is contained in:
		
							parent
							
								
									d09b5a8293
								
							
						
					
					
						commit
						d4ea584fc4
					
				
							
								
								
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -97,5 +97,6 @@ new_sloleks.xml | ||||
| grid_results/ | ||||
| .idea/ | ||||
| cnn/word_accetuation/svm/data/ | ||||
| data_merge.ipynb | ||||
| data_merge.py | ||||
| postprocessing/data_merge.ipynb | ||||
| postprocessing/data_merge.py | ||||
| postprocessing/sp_data_merge.py | ||||
|  | ||||
| @ -54,7 +54,7 @@ letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = d | ||||
| content = data._read_content(read_location) | ||||
| 
 | ||||
| # format data for the accentuate_word function; it has to look like [['besedišči', '', 'Ncnpi', 'besedišči'], ] | ||||
| content = [[el[0], '', el[1][:-1], el[0]] for el in content[:-1]] | ||||
| content = [[el[0], '', el[1][:-1], el[0]] for el in content] | ||||
| 
 | ||||
| # use environment variables and models to accentuate words | ||||
| data = Data('l', shuffle_all_inputs=False) | ||||
|  | ||||
							
								
								
									
										173
									
								
								postprocessing/assign_stress2lemmas.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										173
									
								
								postprocessing/assign_stress2lemmas.py
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,173 @@ | ||||
| # Words proccesed: 650250 | ||||
| # Word indeks: 50023 | ||||
| # Word number: 50023 | ||||
| import re | ||||
| 
 | ||||
| from lxml import etree | ||||
| import time | ||||
| from prepare_data import * | ||||
| 
 | ||||
| accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù'] | ||||
| 
 | ||||
| def stressed2unstressed(w): | ||||
|     w = w.replace('ŕ', 'r') | ||||
|     w = w.replace('á', 'a') | ||||
|     w = w.replace('à', 'a') | ||||
|     w = w.replace('é', 'e') | ||||
|     w = w.replace('è', 'e') | ||||
|     w = w.replace('ê', 'e') | ||||
|     w = w.replace('í', 'i') | ||||
|     w = w.replace('ì', 'i') | ||||
|     w = w.replace('ó', 'o') | ||||
|     w = w.replace('ô', 'o') | ||||
|     w = w.replace('ò', 'o') | ||||
|     w = w.replace('ú', 'u') | ||||
|     w = w.replace('ù', 'u') | ||||
| 
 | ||||
|     return w | ||||
| 
 | ||||
| 
 | ||||
| """Works on finalized XML | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| from text2SAMPA import * | ||||
| 
 | ||||
| # def xml_words_generator(xml_path): | ||||
| #     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"): | ||||
| #         words = [] | ||||
| #         for child in element: | ||||
| #             if child.tag == 'WordForm': | ||||
| #                 msd = None | ||||
| #                 word = None | ||||
| #                 for wf in child: | ||||
| #                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd': | ||||
| #                         msd = wf.attrib['val'] | ||||
| #                     elif wf.tag == 'FormRepresentation': | ||||
| #                         for form_rep in wf: | ||||
| #                             if form_rep.attrib['att'] == 'zapis_oblike': | ||||
| #                                 word = form_rep.attrib['val'] | ||||
| #                         #if msd is not None and word is not None: | ||||
| #                         #    pass | ||||
| #                         #else: | ||||
| #                         #    print('NOOOOO') | ||||
| #                         words.append([word, '', msd, word]) | ||||
| #         yield words | ||||
| # | ||||
| # | ||||
| # gen = xml_words_generator('data/Sloleks_v1.2_p2.xml') | ||||
| word_glob_num = 0 | ||||
| word_limit = 50000 | ||||
| iter_num = 50000 | ||||
| word_index = 0 | ||||
| 
 | ||||
| # iter_index = 0 | ||||
| # words = [] | ||||
| # | ||||
| # lexical_entries_load_number = 0 | ||||
| # lexical_entries_save_number = 0 | ||||
| # | ||||
| # # INSIDE | ||||
| # # word_glob_num = 1500686 | ||||
| # word_glob_num = 1550705 | ||||
| # | ||||
| # # word_limit = 1500686 | ||||
| # word_limit = 1550705 | ||||
| # | ||||
| # iter_index = 31 | ||||
| 
 | ||||
| # done_lexical_entries = 33522 | ||||
| data = Data('s', shuffle_all_inputs=False) | ||||
| # accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab') | ||||
| 
 | ||||
| start_timer = time.time() | ||||
| lemmas = 0 | ||||
| print('Copy initialization complete') | ||||
| with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile: | ||||
|     # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') | ||||
|     for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): | ||||
|     # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): | ||||
|         # if word_glob_num >= word_limit: | ||||
|         #     myfile2.close() | ||||
|         #     myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') | ||||
|         #     iter_index += 1 | ||||
|         #     print("Words proccesed: " + str(word_glob_num)) | ||||
|         # | ||||
|         #     print("Word indeks: " + str(word_index)) | ||||
|         #     print("Word number: " + str(len(words))) | ||||
|         # | ||||
|         #     # print("lexical_entries_load_number: " + str(lexical_entries_load_number)) | ||||
|         #     # print("lexical_entries_save_number: " + str(lexical_entries_save_number)) | ||||
|         # | ||||
|         #     end_timer = time.time() | ||||
|         #     print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes") | ||||
|         lemma = '' | ||||
|         stressed_lemma = '' | ||||
|         msd = '' | ||||
|         word_form_found = False | ||||
|         for child in element: | ||||
|             if child.tag == 'Lemma': | ||||
|                 for wf in child: | ||||
|                     if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike': | ||||
|                         lemma = wf.attrib['val'] | ||||
|             if child.tag == 'WordForm': | ||||
|                 msd = None | ||||
|                 word = None | ||||
|                 for wf in child: | ||||
|                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd': | ||||
|                         msd = wf.attrib['val'] | ||||
|                     elif wf.tag == 'FormRepresentation': | ||||
|                         for form_rep in wf: | ||||
|                             if form_rep.attrib['att'] == 'naglašena_beseda': | ||||
|                                 stressed_lemma = form_rep.attrib['val'] | ||||
|                                 word_form_found = True | ||||
|                                 break | ||||
| 
 | ||||
|                         break | ||||
| 
 | ||||
|                         # new_element = etree.Element('feat') | ||||
|                         # new_element.attrib['att'] = 'SAMPA' | ||||
|                         # | ||||
|                         # wf.append(new_element) | ||||
|                         # | ||||
|                         # word_glob_num += 1 | ||||
|                         # word_index += 1 | ||||
|                 break | ||||
| 
 | ||||
|         if re.match(r'S..ei', msd) or re.match(r'S..mi', msd) or re.match(r'Sometn', msd) or re.match(r'P..mei.*', msd) \ | ||||
|                 or re.match(r'P..zei.*', msd) or re.match(r'P..sei.*', msd) or re.match(r'G..n.*', msd) \ | ||||
|                 or re.match(r'R.n', msd) or re.match(r'Rss', msd) or re.match(r'Rd', msd)  \ | ||||
|                 or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \ | ||||
|                 or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\ | ||||
|                 or msd == "": | ||||
|             if lemma != stressed2unstressed(stressed_lemma): | ||||
|                 print(lemma + " : " + stressed_lemma + " - " + msd) | ||||
|             pass | ||||
|         else: | ||||
|             # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma) | ||||
|             # print(lemma + " - " + msd) | ||||
|             pass | ||||
| 
 | ||||
|         for child in element: | ||||
|             if child.tag == 'Lemma': | ||||
|                 for wf in child: | ||||
|                     if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike': | ||||
|                         wf.attrib['val'] = stressed_lemma | ||||
|                         break | ||||
|                     else: | ||||
|                         print('Error1') | ||||
|                 break | ||||
| 
 | ||||
| 
 | ||||
|         lemmas += 1 | ||||
|         # print(etree.tostring(element, encoding="UTF-8")) | ||||
|         # myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True)) | ||||
|         if word_glob_num > word_limit: | ||||
|             # print('Proccessed ' + str(word_glob_num) + ' words') | ||||
|             end_timer = time.time() | ||||
|             # print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes") | ||||
|             word_limit += iter_num | ||||
|         myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True)) | ||||
|         element.clear() | ||||
| 
 | ||||
| print(lemmas) | ||||
| @ -1764,3 +1764,16 @@ class Data: | ||||
| #                           [ 0.,  0.92,  0.,  0.51,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])).eval()) | ||||
| def actual_accuracy(y_true, y_pred): | ||||
|     return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0)) | ||||
| 
 | ||||
| 
 | ||||
| def convert_to_correct_stress(w): | ||||
|     w = w.replace('ì', 'ê') | ||||
|     w = w.replace('à', 'ŕ') | ||||
|     w = w.replace('ä', 'à') | ||||
|     w = w.replace('ë', 'è') | ||||
|     # cor_content[i][3] = cor_content[i][3].replace('ě', 'ê') | ||||
|     w = w.replace('î', 'ì') | ||||
|     w = w.replace('ö', 'ò') | ||||
|     w = w.replace('ü', 'ù') | ||||
| 
 | ||||
|     return w | ||||
|  | ||||
| @ -16,7 +16,8 @@ content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex') | ||||
| dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content) | ||||
| feature_dictionary = data._create_slovene_feature_dictionary() | ||||
| syllable_dictionary = data._create_syllables_dictionary(content, vowels) | ||||
| accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù'] | ||||
| # accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù'] | ||||
| accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü'] | ||||
| 
 | ||||
| data = Data('l', shuffle_all_inputs=False) | ||||
| letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models( | ||||
| @ -62,7 +63,8 @@ with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile: | ||||
|         res = '' | ||||
|         for i in range(index, index + len(words)): | ||||
|             res += new_content[i][0] + '\t' + new_content[i][1] + '\t' + new_content[i][2] + '\t' \ | ||||
|             + new_content[i][3][:-1] + '\t' + location_accented_words[i-index] + '\t' + accented_words[i-index] + '\n' | ||||
|             + new_content[i][3][:-1] + '\t' + convert_to_correct_stress(location_accented_words[i-index]) + '\t' + \ | ||||
|             convert_to_correct_stress(accented_words[i-index]) + '\n' | ||||
| 
 | ||||
|         print('Writing data from ' + str(index) + ' onward.') | ||||
|         end_timer = time.time() | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user