Fixed wrong stress location assignment of new stress type

2018-09-11 11:34:29 +02:00 · 2018-09-11 11:34:29 +02:00 · d4ea584fc4
commit d4ea584fc4
parent d09b5a8293
5 changed files with 194 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@ -97,5 +97,6 @@ new_sloleks.xml
 grid_results/
 .idea/
 cnn/word_accetuation/svm/data/
-data_merge.ipynb
-data_merge.py
+postprocessing/data_merge.ipynb
+postprocessing/data_merge.py
+postprocessing/sp_data_merge.py
--- a/accentuate.py
+++ b/accentuate.py
@ -54,7 +54,7 @@ letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = d
 content = data._read_content(read_location)

 # format data for accentuate_word function it has to be like [['besedišči', '', 'Ncnpi', 'besedišči'], ]
-content = [[el[0], '', el[1][:-1], el[0]] for el in content[:-1]]
+content = [[el[0], '', el[1][:-1], el[0]] for el in content]

 # use environment variables and models to accentuate words
 data = Data('l', shuffle_all_inputs=False)
--- a/postprocessing/assign_stress2lemmas.py
+++ b/postprocessing/assign_stress2lemmas.py
@ -0,0 +1,173 @@
+# Words proccesed: 650250
+# Word indeks: 50023
+# Word number: 50023
+import re
+
+from lxml import etree
+import time
+from prepare_data import *
+
+accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']  # the same 13 characters that stressed2unstressed() replaces; not referenced elsewhere in the visible script
+
+def stressed2unstressed(w):  # return w with every accented character replaced by its plain base letter
+    w = w.replace('ŕ', 'r')  # the only consonant handled; all remaining replacements are vowels
+    w = w.replace('á', 'a')
+    w = w.replace('à', 'a')
+    w = w.replace('é', 'e')
+    w = w.replace('è', 'e')
+    w = w.replace('ê', 'e')
+    w = w.replace('í', 'i')
+    w = w.replace('ì', 'i')
+    w = w.replace('ó', 'o')
+    w = w.replace('ô', 'o')
+    w = w.replace('ò', 'o')
+    w = w.replace('ú', 'u')
+    w = w.replace('ù', 'u')  # only lowercase forms are handled; uppercase accented letters pass through unchanged
+
+    return w
+
+
+"""Works on finalized XML
+"""
+
+
+from text2SAMPA import *
+
+# def xml_words_generator(xml_path):
+#     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
+#         words = []
+#         for child in element:
+#             if child.tag == 'WordForm':
+#                 msd = None
+#                 word = None
+#                 for wf in child:
+#                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+#                         msd = wf.attrib['val']
+#                     elif wf.tag == 'FormRepresentation':
+#                         for form_rep in wf:
+#                             if form_rep.attrib['att'] == 'zapis_oblike':
+#                                 word = form_rep.attrib['val']
+#                         #if msd is not None and word is not None:
+#                         #    pass
+#                         #else:
+#                         #    print('NOOOOO')
+#                         words.append([word, '', msd, word])
+#         yield words
+#
+#
+# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
+word_glob_num = 0  # compared against word_limit in the main loop; its only increments in the visible script are commented out
+word_limit = 50000  # threshold for the progress branch in the main loop
+iter_num = 50000  # step added to word_limit each time the threshold is passed
+word_index = 0  # only referenced from commented-out code in the visible script
+
+# iter_index = 0
+# words = []
+#
+# lexical_entries_load_number = 0
+# lexical_entries_save_number = 0
+#
+# # INSIDE
+# # word_glob_num = 1500686
+# word_glob_num = 1550705
+#
+# # word_limit = 1500686
+# word_limit = 1550705
+#
+# iter_index = 31
+
+# done_lexical_entries = 33522
+data = Data('s', shuffle_all_inputs=False)  # NOTE(review): Data presumably comes from the prepare_data star-import; this instance is not used again in the visible script
+# accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')
+
+start_timer = time.time()
+lemmas = 0  # counts the LexicalEntry elements processed by the loop below
+print('Copy initialization complete')
+with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:  # "ab" appends: re-running adds to an existing output file instead of replacing it
+    # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+    for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):  # streams the input one LexicalEntry element at a time
+    # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
+        # if word_glob_num >= word_limit:
+        #     myfile2.close()
+        #     myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
+        #     iter_index += 1
+        #     print("Words proccesed: " + str(word_glob_num))
+        #
+        #     print("Word indeks: " + str(word_index))
+        #     print("Word number: " + str(len(words)))
+        #
+        #     # print("lexical_entries_load_number: " + str(lexical_entries_load_number))
+        #     # print("lexical_entries_save_number: " + str(lexical_entries_save_number))
+        #
+        #     end_timer = time.time()
+        #     print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+        lemma = ''  # per-entry state, reset for every LexicalEntry
+        stressed_lemma = ''
+        msd = ''
+        word_form_found = False  # set to True below but never read in the visible code
+        for child in element:  # first pass: collect the lemma and the first word form's msd / stressed form
+            if child.tag == 'Lemma':
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':
+                        lemma = wf.attrib['val']  # unstressed lemma as stored in the input
+            if child.tag == 'WordForm':
+                msd = None  # stays None if no 'msd' child is seen before the first FormRepresentation
+                word = None
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
+                        msd = wf.attrib['val']
+                    elif wf.tag == 'FormRepresentation':
+                        for form_rep in wf:
+                            if form_rep.attrib['att'] == 'naglašena_beseda':  # direct indexing: raises KeyError for a child without an 'att' attribute
+                                stressed_lemma = form_rep.attrib['val']  # stressed spelling of the first word form, used as the stressed lemma
+                                word_form_found = True
+                                break
+
+                        break  # leaves the WordForm scan after the first FormRepresentation
+
+                        # new_element = etree.Element('feat')
+                        # new_element.attrib['att'] = 'SAMPA'
+                        #
+                        # wf.append(new_element)
+                        #
+                        # word_glob_num += 1
+                        # word_index += 1
+                break  # only the first WordForm is inspected; a Lemma child placed after it would not be read
+
+        if re.match(r'S..ei', msd) or re.match(r'S..mi', msd) or re.match(r'Sometn', msd) or re.match(r'P..mei.*', msd) \
+                or re.match(r'P..zei.*', msd) or re.match(r'P..sei.*', msd) or re.match(r'G..n.*', msd) \
+                or re.match(r'R.n', msd) or re.match(r'Rss', msd) or re.match(r'Rd', msd)  \
+                or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \
+                or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\
+                or msd == "":  # re.match anchors at the start only; NOTE(review): msd may still be None here, which makes re.match raise TypeError
+            if lemma != stressed2unstressed(stressed_lemma):  # report entries whose de-accented stressed form differs from the lemma
+                print(lemma + " : " + stressed_lemma + " - " + msd)
+            pass
+        else:
+            # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma)
+            # print(lemma + " - " + msd)
+            pass
+
+        for child in element:  # second pass: write the stressed form into the Lemma
+            if child.tag == 'Lemma':
+                for wf in child:
+                    if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':
+                        wf.attrib['val'] = stressed_lemma  # NOTE(review): when no 'naglašena_beseda' was found, stressed_lemma is '' and the lemma is overwritten with an empty string
+                        break
+                    else:
+                        print('Error1')  # printed once for every Lemma child checked before the 'zapis_oblike' one (this is an if/else, not a for/else)
+                break
+
+
+        lemmas += 1
+        # print(etree.tostring(element, encoding="UTF-8"))
+        # myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
+        if word_glob_num > word_limit:  # word_glob_num is never incremented in the visible loop, so with the initial values this branch is not reached
+            # print('Proccessed ' + str(word_glob_num) + ' words')
+            end_timer = time.time()
+            # print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
+            word_limit += iter_num
+        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))  # serialise the modified entry to the output file
+        element.clear()  # drop the element's content once it has been written
+
+print(lemmas)  # total number of LexicalEntry elements processed
--- a/prepare_data.py
+++ b/prepare_data.py
@ -1764,3 +1764,16 @@ class Data:
 #                           [ 0.,  0.92,  0.,  0.51,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])).eval())
 def actual_accuracy(y_true, y_pred):  # metric: share of samples whose rounded prediction equals the rounded target at every position of the last axis
    return K.mean(K.equal(K.mean(K.equal(K.round(y_true), K.round(y_pred)), axis=-1), 1.0))  # inner mean is 1.0 only when all positions match; K is presumably the Keras backend (import not visible here)
+
+
+def convert_to_correct_stress(w):  # remap accent characters in w from one marking scheme to another; order of the replacements matters
+    w = w.replace('ì', 'ê')  # must run before 'î' -> 'ì' below, otherwise the new 'ì' would also become 'ê'
+    w = w.replace('à', 'ŕ')  # must run before 'ä' -> 'à' below, otherwise the new 'à' would also become 'ŕ'
+    w = w.replace('ä', 'à')
+    w = w.replace('ë', 'è')
+    # cor_content[i][3] = cor_content[i][3].replace('ě', 'ê')
+    w = w.replace('î', 'ì')
+    w = w.replace('ö', 'ò')
+    w = w.replace('ü', 'ù')  # characters not listed above (e.g. 'á', 'é', 'ě') are returned unchanged
+
+    return w
--- a/sloleks_accentuation2.py
+++ b/sloleks_accentuation2.py
@ -16,7 +16,8 @@ content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
 dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
 feature_dictionary = data._create_slovene_feature_dictionary()
 syllable_dictionary = data._create_syllables_dictionary(content, vowels)
-accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']
+# accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']
+accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']

 data = Data('l', shuffle_all_inputs=False)
 letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
@ -62,7 +63,8 @@ with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile:
        res = ''
        for i in range(index, index + len(words)):
            res += new_content[i][0] + '\t' + new_content[i][1] + '\t' + new_content[i][2] + '\t' \
-            + new_content[i][3][:-1] + '\t' + location_accented_words[i-index] + '\t' + accented_words[i-index] + '\n'
+            + new_content[i][3][:-1] + '\t' + convert_to_correct_stress(location_accented_words[i-index]) + '\t' + \
+            convert_to_correct_stress(accented_words[i-index]) + '\n'

        print('Writing data from ' + str(index) + ' onward.')
        end_timer = time.time()