stress_asignment/postprocessing/assign_stress2lemmas.py

# Words processed: 650250
# Word index: 50023
# Word number: 50023
import re

from lxml import etree
import time
from prepare_data import *

# Stress-marked characters recognised by this script (note that 'ŕ' is a
# stress-marked consonant, despite the variable name). The main loop uses this
# list to detect stress marks in the part of a stressed form that is discarded.
accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']

def stressed2unstressed(w):
    """Return ``w`` with every stress-marked letter replaced by its plain letter.

    Only the thirteen lowercase stress-marked characters listed below are
    mapped; every other character (including uppercase accented letters) is
    returned unchanged. The result always has the same length as ``w``.
    """
    # One translation table instead of thirteen chained replace() calls.
    plain_letters = str.maketrans(
        'ŕáàéèêíìóôòúù',
        'raaeeeiiooouu',
    )
    return w.translate(plain_letters)


"""Works on finalized XML
"""


from text2SAMPA import *

# def xml_words_generator(xml_path):
#     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
#         words = []
#         for child in element:
#             if child.tag == 'WordForm':
#                 msd = None
#                 word = None
#                 for wf in child:
#                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
#                         msd = wf.attrib['val']
#                     elif wf.tag == 'FormRepresentation':
#                         for form_rep in wf:
#                             if form_rep.attrib['att'] == 'zapis_oblike':
#                                 word = form_rep.attrib['val']
#                         #if msd is not None and word is not None:
#                         #    pass
#                         #else:
#                         #    print('NOOOOO')
#                         words.append([word, '', msd, word])
#         yield words
#
#
# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
# Progress-reporting counters.
# NOTE(review): the only increment of word_glob_num in this file is commented
# out, so the `word_glob_num > word_limit` check in the main loop never fires
# and word_limit is never advanced by iter_num.
word_glob_num = 0
word_limit = 50000
iter_num = 50000
# NOTE(review): word_index is only referenced from commented-out code.
word_index = 0

# iter_index = 0
# words = []
#
# lexical_entries_load_number = 0
# lexical_entries_save_number = 0
#
# # INSIDE
# # word_glob_num = 1500686
# word_glob_num = 1550705
#
# # word_limit = 1500686
# word_limit = 1550705
#
# iter_index = 31

# done_lexical_entries = 33522
# MSD patterns (matched at the start of the tag) for which the stressed spelling
# of the first word form is taken over as the stressed lemma, with a fallback
# to the plain lemma for the part where both spellings disagree.
_LEMMA_MSD_PATTERNS = (
    r'S..ei', r'S..mi', r'Sometn', r'P..mei.*', r'P..zei.*', r'P..sei.*', r'G..n.*',
    r'R.n', r'Rss', r'Rd', r'K.*', r'D.', r'L', r'M', r'O', r'Z.*', r'V.', r'Rsr.',
)


def _is_lemma_msd(msd):
    """Return True when ``msd`` is empty or starts with one of the lemma MSD patterns."""
    return msd == "" or any(re.match(pattern, msd) for pattern in _LEMMA_MSD_PATTERNS)


def _common_prefix_length(first, second):
    """Return how many leading characters ``first`` and ``second`` share."""
    length = 0
    for left, right in zip(first, second):
        if left != right:
            break
        # Count a position only after it has been confirmed equal, so the
        # first mismatching character is never treated as part of the prefix.
        length += 1
    return length


# NOTE(review): nothing below reads `data`; the call is kept because the
# constructor may have side effects - confirm before removing.
data = Data('s', shuffle_all_inputs=False)

start_timer = time.time()
lemmas = 0
print('Copy initialization complete')
# The output is opened in append mode: every processed LexicalEntry is
# serialised and appended, so re-running the script adds to an existing file.
with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:
    for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry",
                                          encoding="UTF-8", remove_blank_text=True):
        lemma = ''
        stressed_lemma = ''
        # Stays '' when the word form has no msd feat, so the regex matching and
        # the diagnostic print below never receive None.
        msd = ''

        # Collect the lemma spelling plus the MSD and stressed spelling of the
        # FIRST word form only (the loop stops after the first WordForm).
        for child in element:
            if child.tag == 'Lemma':
                for wf in child:
                    if wf.attrib.get('att') == 'zapis_oblike':
                        lemma = wf.attrib['val']
            if child.tag == 'WordForm':
                for wf in child:
                    if wf.attrib.get('att') == 'msd':
                        msd = wf.attrib['val']
                    elif wf.tag == 'FormRepresentation':
                        for form_rep in wf:
                            # .get() tolerates children without an 'att' attribute.
                            if form_rep.attrib.get('att') == 'naglašena_beseda':
                                stressed_lemma = form_rep.attrib['val']
                                break
                        # Only the first FormRepresentation is inspected.
                        break
                break

        if not stressed_lemma:
            # No stressed spelling available: keep the lemma as it is instead
            # of overwriting it with an empty string.
            stressed_lemma = lemma
        elif _is_lemma_msd(msd):
            # When the lemma does not equal the unstressed version of what is
            # supposed to be the lemma, keep the stressed spelling for the
            # shared prefix and fall back to the plain lemma for the rest.
            unstressed = stressed2unstressed(stressed_lemma)
            if lemma != unstressed:
                identical_length = _common_prefix_length(lemma, unstressed)

                # Report entries whose stress mark lies in the discarded tail
                # (one line per discarded stress-marked character).
                for char in stressed_lemma[identical_length:]:
                    if char in accented_vowels:
                        print(lemma + " : " + stressed_lemma + " - " + msd)
                stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:]
        # For every other MSD the stressed spelling of the first word form is
        # written to the lemma unchanged.

        # Write the stressed spelling back into the first Lemma element.
        for child in element:
            if child.tag == 'Lemma':
                for wf in child:
                    if wf.attrib.get('att') == 'zapis_oblike':
                        wf.attrib['val'] = stressed_lemma
                        break
                else:
                    # Reported only when the Lemma has no zapis_oblike feat at all.
                    print('Error1')
                break

        lemmas += 1
        # NOTE(review): word_glob_num is never incremented in this file, so this
        # progress bookkeeping is currently inert.
        if word_glob_num > word_limit:
            word_limit += iter_num
        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
        # Release the entry's content once it has been written out.
        element.clear()

print(lemmas)
Fixed wrong stress location asignment of new stress type" 2018-09-11 09:34:29 +00:00			`# Words proccesed: 650250`
			`# Word indeks: 50023`
			`# Word number: 50023`
			`import re`

			`from lxml import etree`
			`import time`
			`from prepare_data import *`

			`accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']`

			`def stressed2unstressed(w):`
			`w = w.replace('ŕ', 'r')`
			`w = w.replace('á', 'a')`
			`w = w.replace('à', 'a')`
			`w = w.replace('é', 'e')`
			`w = w.replace('è', 'e')`
			`w = w.replace('ê', 'e')`
			`w = w.replace('í', 'i')`
			`w = w.replace('ì', 'i')`
			`w = w.replace('ó', 'o')`
			`w = w.replace('ô', 'o')`
			`w = w.replace('ò', 'o')`
			`w = w.replace('ú', 'u')`
			`w = w.replace('ù', 'u')`

			`return w`


			`"""Works on finalized XML`
			`"""`


			`from text2SAMPA import *`

			`# def xml_words_generator(xml_path):`
			`# for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):`
			`# words = []`
			`# for child in element:`
			`# if child.tag == 'WordForm':`
			`# msd = None`
			`# word = None`
			`# for wf in child:`
			`# if 'att' in wf.attrib and wf.attrib['att'] == 'msd':`
			`# msd = wf.attrib['val']`
			`# elif wf.tag == 'FormRepresentation':`
			`# for form_rep in wf:`
			`# if form_rep.attrib['att'] == 'zapis_oblike':`
			`# word = form_rep.attrib['val']`
			`# #if msd is not None and word is not None:`
			`# # pass`
			`# #else:`
			`# # print('NOOOOO')`
			`# words.append([word, '', msd, word])`
			`# yield words`
			`#`
			`#`
			`# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')`
			`word_glob_num = 0`
			`word_limit = 50000`
			`iter_num = 50000`
			`word_index = 0`

			`# iter_index = 0`
			`# words = []`
			`#`
			`# lexical_entries_load_number = 0`
			`# lexical_entries_save_number = 0`
			`#`
			`# # INSIDE`
			`# # word_glob_num = 1500686`
			`# word_glob_num = 1550705`
			`#`
			`# # word_limit = 1500686`
			`# word_limit = 1550705`
			`#`
			`# iter_index = 31`

			`# done_lexical_entries = 33522`
			`data = Data('s', shuffle_all_inputs=False)`
			`# accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')`

			`start_timer = time.time()`
			`lemmas = 0`
			`print('Copy initialization complete')`
			`with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:`
			`# myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')`
			`for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):`
			`# for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):`
			`# if word_glob_num >= word_limit:`
			`# myfile2.close()`
			`# myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')`
			`# iter_index += 1`
			`# print("Words proccesed: " + str(word_glob_num))`
			`#`
			`# print("Word indeks: " + str(word_index))`
			`# print("Word number: " + str(len(words)))`
			`#`
			`# # print("lexical_entries_load_number: " + str(lexical_entries_load_number))`
			`# # print("lexical_entries_save_number: " + str(lexical_entries_save_number))`
			`#`
			`# end_timer = time.time()`
			`# print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")`
			`lemma = ''`
			`stressed_lemma = ''`
			`msd = ''`
			`word_form_found = False`
			`for child in element:`
			`if child.tag == 'Lemma':`
			`for wf in child:`
			`if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':`
			`lemma = wf.attrib['val']`
			`if child.tag == 'WordForm':`
			`msd = None`
			`word = None`
			`for wf in child:`
			`if 'att' in wf.attrib and wf.attrib['att'] == 'msd':`
			`msd = wf.attrib['val']`
			`elif wf.tag == 'FormRepresentation':`
			`for form_rep in wf:`
			`if form_rep.attrib['att'] == 'naglašena_beseda':`
			`stressed_lemma = form_rep.attrib['val']`
			`word_form_found = True`
			`break`

			`break`

			`# new_element = etree.Element('feat')`
			`# new_element.attrib['att'] = 'SAMPA'`
			`#`
			`# wf.append(new_element)`
			`#`
			`# word_glob_num += 1`
			`# word_index += 1`
			`break`

			`if re.match(r'S..ei', msd) or re.match(r'S..mi', msd) or re.match(r'Sometn', msd) or re.match(r'P..mei.*', msd) \`
			`or re.match(r'P..zei.', msd) or re.match(r'P..sei.', msd) or re.match(r'G..n.*', msd) \`
			`or re.match(r'R.n', msd) or re.match(r'Rss', msd) or re.match(r'Rd', msd) \`
			`or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \`
			`or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\`
			`or msd == "":`
A couple of fixes 2018-09-27 12:41:27 +00:00
			`# when lemma does not equal unstressed version of what is supposed to be lemma, try to find parts of the`
			`# word that are equal and transfer stress to lemma (if possible)`
Fixed wrong stress location asignment of new stress type" 2018-09-11 09:34:29 +00:00			`if lemma != stressed2unstressed(stressed_lemma):`
A couple of fixes 2018-09-27 12:41:27 +00:00			`identical_length = 0`
			`# if lemma == 'Latkov':`
			`# print('HERE')`
			`for i in range(min(len(lemma), len(stressed2unstressed(stressed_lemma)))):`
			`# a = list(lemma)`
			`# b = list(stressed2unstressed(stressed_lemma))`
			`identical_length += 1`
			`if list(lemma)[i] != list(stressed2unstressed(stressed_lemma))[i]:`
			`break`


			`for l in list(stressed_lemma[identical_length:]):`
			`if l in accented_vowels:`
			`# print(lemma)`
			`# print(stressed2unstressed(stressed_lemma))`
			`# print(stressed_lemma[identical_length:])`
			`print(lemma + " : " + stressed_lemma + " - " + msd)`
			`stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:]`



			`# pass`
			`# if lemma != stressed2unstressed(stressed_lemma):`
			`# print(lemma + " : " + stressed_lemma + " - " + msd)`
Fixed wrong stress location asignment of new stress type" 2018-09-11 09:34:29 +00:00			`else:`
			`# print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma)`
			`# print(lemma + " - " + msd)`
			`pass`

			`for child in element:`
			`if child.tag == 'Lemma':`
			`for wf in child:`
			`if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':`
			`wf.attrib['val'] = stressed_lemma`
			`break`
			`else:`
			`print('Error1')`
			`break`


			`lemmas += 1`
			`# print(etree.tostring(element, encoding="UTF-8"))`
			`# myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))`
			`if word_glob_num > word_limit:`
			`# print('Proccessed ' + str(word_glob_num) + ' words')`
			`end_timer = time.time()`
			`# print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")`
			`word_limit += iter_num`
			`myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))`
			`element.clear()`

			`print(lemmas)`