# Counters noted here by the author (presumably from an earlier run):
#   Words processed: 650250
#   Word index: 50023
#   Word number: 50023
"""Copy stress marks from the first word form of every entry onto its lemma.

Works on finalized XML.  Every ``LexicalEntry`` of ``INPUT_XML_PATH`` is
streamed with ``lxml.etree.iterparse``; the ``zapis_oblike`` value of its
``Lemma`` is replaced by a stressed spelling derived from the
``naglašena_beseda`` value of its first ``WordForm``; the entry is then
appended to ``OUTPUT_XML_PATH``.
"""

import re
import time

from lxml import etree

INPUT_XML_PATH = 'data/contextual_changes/final_sloleks2_inhouse2S.xml'
OUTPUT_XML_PATH = 'data/contextual_changes/stressed_lemmas_sloleks2.xml'

accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']

# One-to-one mapping, so character positions are the same before and after.
_UNSTRESS_TABLE = str.maketrans('ŕáàéèêíìóôòúù', 'raaeeeiiooouu')


def stressed2unstressed(w):
    """Return *w* with every character of ``accented_vowels`` de-accented.

    Only the thirteen lower-case characters listed in ``accented_vowels`` are
    mapped; every other character (upper-case accented letters included) is
    returned unchanged.
    """
    return w.translate(_UNSTRESS_TABLE)


# NOTE: kept below the definitions above on purpose. A star import placed here
# overrides any same-named definition above it; moving it to the top of the
# file would silently reverse that precedence.
from text2SAMPA import *

# MSD tags for which the first word form is supposed to be the lemma form.
# Patterns are anchored at the start of the tag only (``re.match`` semantics).
_LEMMA_FORM_MSD_PATTERNS = (
    r'S..ei', r'S..mi', r'Sometn', r'P..mei.*', r'P..zei.*', r'P..sei.*',
    r'G..n.*', r'R.n', r'Rss', r'Rd', r'K.*', r'D.', r'L', r'M', r'O',
    r'Z.*', r'V.', r'Rsr.',
)
_LEMMA_FORM_MSD = re.compile(
    '|'.join('(?:' + pattern + ')' for pattern in _LEMMA_FORM_MSD_PATTERNS)
)


def _read_lemma_and_first_form(entry):
    """Return ``(lemma, stressed_form, msd)`` read from a ``LexicalEntry``.

    Only children up to and including the first ``WordForm`` are inspected,
    and inside that word form only feats up to and including the first
    ``FormRepresentation``.  Anything that is not found is returned as ``''``
    (never ``None``), so callers can concatenate and regex-match the values
    without extra checks.
    """
    lemma = ''
    stressed_form = ''
    msd = ''
    for child in entry:
        if child.tag == 'Lemma':
            for feat in child:
                if feat.attrib.get('att') == 'zapis_oblike':
                    lemma = feat.attrib['val']
        elif child.tag == 'WordForm':
            for feat in child:
                if feat.attrib.get('att') == 'msd':
                    msd = feat.attrib['val']
                elif feat.tag == 'FormRepresentation':
                    for form_feat in feat:
                        # ``.get`` so a child without ``att`` is skipped
                        # instead of raising KeyError.
                        if form_feat.attrib.get('att') == 'naglašena_beseda':
                            stressed_form = form_feat.attrib['val']
                            break
                    # Only the first FormRepresentation is considered.
                    break
            # Only the first WordForm is considered.
            break
    return lemma, stressed_form, msd


def _kept_prefix_length(lemma, unstressed_form):
    """Return how many leading characters of the stressed form are kept.

    Counts positions from the start up to AND INCLUDING the first position at
    which the two strings differ (or up to the end of the shorter string when
    they never differ).  Including the differing position means the stressed
    form's character at that position survives, e.g. an accented character
    that ``stressed2unstressed`` does not cover.
    NOTE(review): when the strings differ by a real letter, that letter is
    also taken from the stressed form -- confirm this is acceptable.
    """
    length = 0
    for lemma_char, form_char in zip(lemma, unstressed_form):
        length += 1
        if lemma_char != form_char:
            break
    return length


def _stressed_lemma_for(lemma, stressed_form, msd):
    """Return the spelling to store as the lemma's ``zapis_oblike`` value.

    * No stressed form available: the lemma is returned unchanged, so it is
      never overwritten with an empty string.
    * ``msd`` is non-empty and matches none of the lemma-form patterns: the
      stressed form is returned as is.
    * Otherwise, if the de-accented stressed form equals the lemma, the
      stressed form is returned; if not, the leading part of the stressed
      form (see ``_kept_prefix_length``) is joined with the remainder of the
      lemma, and one line is printed for every accented vowel dropped with
      the discarded tail.
    """
    if not stressed_form:
        return lemma
    if msd != "" and not _LEMMA_FORM_MSD.match(msd):
        return stressed_form

    unstressed_form = stressed2unstressed(stressed_form)
    if lemma == unstressed_form:
        return stressed_form

    kept = _kept_prefix_length(lemma, unstressed_form)
    for char in stressed_form[kept:]:
        if char in accented_vowels:
            # The stress mark sits in the part that is being replaced.
            print(lemma + " : " + stressed_form + " - " + msd)
    return stressed_form[:kept] + lemma[kept:]


def _set_lemma(entry, value):
    """Store *value* in the ``zapis_oblike`` feat of the entry's first Lemma.

    Prints ``Error1`` when that ``Lemma`` has no ``zapis_oblike`` feat.
    Entries without any ``Lemma`` child are left untouched.
    """
    for child in entry:
        if child.tag == 'Lemma':
            for feat in child:
                if feat.attrib.get('att') == 'zapis_oblike':
                    feat.attrib['val'] = value
                    break
            else:
                print('Error1')
            break


def main():
    """Stream the input lexicon and append every updated entry to the output.

    The output file is opened in binary append mode, so existing content is
    kept and only serialized ``LexicalEntry`` elements are added.  The number
    of processed entries is printed at the end.
    """
    lemmas = 0
    print('Copy initialization complete')
    with open(OUTPUT_XML_PATH, "ab") as output_file:
        entries = etree.iterparse(INPUT_XML_PATH, tag="LexicalEntry",
                                  encoding="UTF-8", remove_blank_text=True)
        for _event, entry in entries:
            lemma, stressed_form, msd = _read_lemma_and_first_form(entry)
            _set_lemma(entry, _stressed_lemma_for(lemma, stressed_form, msd))
            lemmas += 1

            output_file.write(
                etree.tostring(entry, encoding="UTF-8", pretty_print=True))
            # Drop the entry's content once it has been written.
            entry.clear()
    print(lemmas)


# NOTE: not guarded by ``if __name__ == "__main__"``: the conversion runs as
# a module-level side effect.  TODO: add the guard once nothing depends on
# that.
main()