# Translation table from each stress-marked letter to its plain base letter.
# Built once at import time so every call is a single pass over the string
# instead of thirteen successive str.replace scans.
_STRESS_REMOVAL_TABLE = str.maketrans({
    'ŕ': 'r',
    'á': 'a',
    'à': 'a',
    'é': 'e',
    'è': 'e',
    'ê': 'e',
    'í': 'i',
    'ì': 'i',
    'ó': 'o',
    'ô': 'o',
    'ò': 'o',
    'ú': 'u',
    'ù': 'u',
})


def stressed2unstressed(w):
    """Return *w* with every stress-marked letter replaced by its base letter.

    The letters handled are exactly ŕ, á, à, é, è, ê, í, ì, ó, ô, ò, ú and ù
    (lowercase only); every other character, including uppercase accented
    letters, is passed through unchanged.

    Args:
        w: The string to strip stress marks from.

    Returns:
        A new string of the same length as *w* with the listed letters
        mapped to r, a, e, i, o or u respectively.
    """
    return w.translate(_STRESS_REMOVAL_TABLE)
'WordForm': +# msd = None +# word = None +# for wf in child: +# if 'att' in wf.attrib and wf.attrib['att'] == 'msd': +# msd = wf.attrib['val'] +# elif wf.tag == 'FormRepresentation': +# for form_rep in wf: +# if form_rep.attrib['att'] == 'zapis_oblike': +# word = form_rep.attrib['val'] +# #if msd is not None and word is not None: +# # pass +# #else: +# # print('NOOOOO') +# words.append([word, '', msd, word]) +# yield words +# +# +# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml') +word_glob_num = 0 +word_limit = 50000 +iter_num = 50000 +word_index = 0 + +# iter_index = 0 +# words = [] +# +# lexical_entries_load_number = 0 +# lexical_entries_save_number = 0 +# +# # INSIDE +# # word_glob_num = 1500686 +# word_glob_num = 1550705 +# +# # word_limit = 1500686 +# word_limit = 1550705 +# +# iter_index = 31 + +# done_lexical_entries = 33522 +data = Data('s', shuffle_all_inputs=False) +# accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab') + +start_timer = time.time() +lemmas = 0 +print('Copy initialization complete') +with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile: + # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') + for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): + # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): + # if word_glob_num >= word_limit: + # myfile2.close() + # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') + # iter_index += 1 + # print("Words proccesed: " + str(word_glob_num)) + # + # print("Word indeks: " + str(word_index)) + # print("Word number: " + str(len(words))) + # + # # print("lexical_entries_load_number: " + str(lexical_entries_load_number)) + # # print("lexical_entries_save_number: " + str(lexical_entries_save_number)) + # + # end_timer = time.time() + # 
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes") + lemma = '' + stressed_lemma = '' + msd = '' + word_form_found = False + for child in element: + if child.tag == 'Lemma': + for wf in child: + if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike': + lemma = wf.attrib['val'] + if child.tag == 'WordForm': + msd = None + word = None + for wf in child: + if 'att' in wf.attrib and wf.attrib['att'] == 'msd': + msd = wf.attrib['val'] + elif wf.tag == 'FormRepresentation': + for form_rep in wf: + if form_rep.attrib['att'] == 'naglašena_beseda': + stressed_lemma = form_rep.attrib['val'] + word_form_found = True + break + + break + + # new_element = etree.Element('feat') + # new_element.attrib['att'] = 'SAMPA' + # + # wf.append(new_element) + # + # word_glob_num += 1 + # word_index += 1 + break + + if re.match(r'S..ei', msd) or re.match(r'S..mi', msd) or re.match(r'Sometn', msd) or re.match(r'P..mei.*', msd) \ + or re.match(r'P..zei.*', msd) or re.match(r'P..sei.*', msd) or re.match(r'G..n.*', msd) \ + or re.match(r'R.n', msd) or re.match(r'Rss', msd) or re.match(r'Rd', msd) \ + or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \ + or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\ + or msd == "": + if lemma != stressed2unstressed(stressed_lemma): + print(lemma + " : " + stressed_lemma + " - " + msd) + pass + else: + # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma) + # print(lemma + " - " + msd) + pass + + for child in element: + if child.tag == 'Lemma': + for wf in child: + if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike': + wf.attrib['val'] = stressed_lemma + break + else: + print('Error1') + break + + + lemmas += 1 + # print(etree.tostring(element, encoding="UTF-8")) + # myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True)) + if word_glob_num > word_limit: + # print('Proccessed ' + 
# Simultaneous character mapping applied by convert_to_correct_stress.
# Several targets ('ì', 'à') are also sources of another entry, so the
# substitution must happen in one pass: with chained str.replace calls the
# result depends on the order of the statements.
# NOTE(review): 'ě' is deliberately absent - the original code carried a
# disabled 'ě' -> 'ê' replacement; confirm whether it should be enabled.
_STRESS_CORRECTION_TABLE = str.maketrans({
    'ì': 'ê',
    'à': 'ŕ',
    'ä': 'à',
    'ë': 'è',
    'î': 'ì',
    'ö': 'ò',
    'ü': 'ù',
})


def convert_to_correct_stress(w):
    """Return *w* with its stress characters remapped to the final notation.

    Each occurrence of ì, à, ä, ë, î, ö, ü is replaced by ê, ŕ, à, è, ì, ò, ù
    respectively. All replacements are applied at once, so a character
    produced by one rule is never rewritten by another rule.

    Presumably the input uses the accent set produced by the accentuation
    models (see ``accented_vowels`` in sloleks_accentuation2.py) - verify
    against the callers.

    Args:
        w: The string whose stress characters should be converted.

    Returns:
        A new string of the same length as *w* with the mapping applied;
        characters outside the mapping are left unchanged.
    """
    return w.translate(_STRESS_CORRECTION_TABLE)
new_content[i][1] + '\t' + new_content[i][2] + '\t' \ - + new_content[i][3][:-1] + '\t' + location_accented_words[i-index] + '\t' + accented_words[i-index] + '\n' + + new_content[i][3][:-1] + '\t' + convert_to_correct_stress(location_accented_words[i-index]) + '\t' + \ + convert_to_correct_stress(accented_words[i-index]) + '\n' print('Writing data from ' + str(index) + ' onward.') end_timer = time.time()