# Words processed: 650250
# Word index: 50023
# Word number: 50023
import re
import time

from lxml import etree
from prepare_data import *

accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']


def stressed2unstressed(w):
    """Return w with the stress marks (acute, grave, circumflex) removed."""
    w = w.replace('ŕ', 'r')
    w = w.replace('á', 'a')
    w = w.replace('à', 'a')
    w = w.replace('é', 'e')
    w = w.replace('è', 'e')
    w = w.replace('ê', 'e')
    w = w.replace('í', 'i')
    w = w.replace('ì', 'i')
    w = w.replace('ó', 'o')
    w = w.replace('ô', 'o')
    w = w.replace('ò', 'o')
    w = w.replace('ú', 'u')
    w = w.replace('ù', 'u')

    return w

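# Example behaviour (illustrative only):
#   stressed2unstressed('góra')      -> 'gora'
#   stressed2unstressed('življénje') -> 'življenje'
# Only the accent marks listed in accented_vowels are stripped; other
# diacritics such as č, š and ž are left untouched.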
"""Works on finalized XML
|
|
"""
|
|
|
|
|
|
from text2SAMPA import *
|
|
|
|
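# Note: prepare_data and text2SAMPA are project-local modules; the Data class
# used below is assumed to come from prepare_data via the star import above.
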
# def xml_words_generator(xml_path):
#     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
#         words = []
#         for child in element:
#             if child.tag == 'WordForm':
#                 msd = None
#                 word = None
#                 for wf in child:
#                     if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
#                         msd = wf.attrib['val']
#                     elif wf.tag == 'FormRepresentation':
#                         for form_rep in wf:
#                             if form_rep.attrib['att'] == 'zapis_oblike':
#                                 word = form_rep.attrib['val']
#                 # if msd is not None and word is not None:
#                 #     pass
#                 # else:
#                 #     print('NOOOOO')
#                 words.append([word, '', msd, word])
#         yield words
#
#
# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
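
# Chunking / progress counters. word_glob_num is only incremented in the
# commented-out code, so the word_limit check further down never fires in the
# current version; the counters appear to be left over from the earlier
# chunked run over Sloleks.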
word_glob_num = 0
word_limit = 50000
iter_num = 50000
word_index = 0

# iter_index = 0
# words = []
#
# lexical_entries_load_number = 0
# lexical_entries_save_number = 0
#
# # INSIDE
# # word_glob_num = 1500686
# word_glob_num = 1550705
#
# # word_limit = 1500686
# word_limit = 1550705
#
# iter_index = 31

# done_lexical_entries = 33522
data = Data('s', shuffle_all_inputs=False)
# accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')

start_timer = time.time()
lemmas = 0
print('Copy initialization complete')
with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:
    # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
    for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
    # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
        # if word_glob_num >= word_limit:
        #     myfile2.close()
        #     myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
        #     iter_index += 1
        #     print("Words processed: " + str(word_glob_num))
        #
        #     print("Word index: " + str(word_index))
        #     print("Word number: " + str(len(words)))
        #
        #     # print("lexical_entries_load_number: " + str(lexical_entries_load_number))
        #     # print("lexical_entries_save_number: " + str(lexical_entries_save_number))
        #
        #     end_timer = time.time()
        #     print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
        lemma = ''
        stressed_lemma = ''
        msd = ''
        word_form_found = False
        for child in element:
            if child.tag == 'Lemma':
                for wf in child:
                    if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':
                        lemma = wf.attrib['val']
            if child.tag == 'WordForm':
                msd = None
                word = None
                for wf in child:
                    if 'att' in wf.attrib and wf.attrib['att'] == 'msd':
                        msd = wf.attrib['val']
                    elif wf.tag == 'FormRepresentation':
                        for form_rep in wf:
                            if form_rep.attrib['att'] == 'naglašena_beseda':
                                stressed_lemma = form_rep.attrib['val']
                                word_form_found = True
                                break

                        break

                # new_element = etree.Element('feat')
                # new_element.attrib['att'] = 'SAMPA'
                #
                # wf.append(new_element)
                #
                # word_glob_num += 1
                # word_index += 1
                break
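
        # The MSD patterns below presumably pick out the base (lemma-like) form for
        # each part of speech in the Slovene MSD tagset (e.g. nominative singular for
        # nouns and adjectives, infinitive for verbs), plus entries with an empty MSD.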
        if re.match(r'S..ei', msd) or re.match(r'S..mi', msd) or re.match(r'Sometn', msd) or re.match(r'P..mei.*', msd) \
                or re.match(r'P..zei.*', msd) or re.match(r'P..sei.*', msd) or re.match(r'G..n.*', msd) \
                or re.match(r'R.n', msd) or re.match(r'Rss', msd) or re.match(r'Rd', msd) \
                or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \
                or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd) \
                or msd == "":
            # When the lemma does not equal the unstressed version of the accented
            # lemma, find how long their common prefix is and try to transfer the
            # stress onto the lemma (if possible).
            if lemma != stressed2unstressed(stressed_lemma):
                identical_length = 0
                # if lemma == 'Latkov':
                #     print('HERE')
                for i in range(min(len(lemma), len(stressed2unstressed(stressed_lemma)))):
                    # a = list(lemma)
                    # b = list(stressed2unstressed(stressed_lemma))
                    identical_length += 1
                    if list(lemma)[i] != list(stressed2unstressed(stressed_lemma))[i]:
                        break
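
                # Note: identical_length ends up counting the characters up to and
                # including the first differing one. If any accent mark would fall in
                # the non-matching tail, fall back to the stressed prefix plus the
                # plain lemma suffix (the loop below).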
                for l in list(stressed_lemma[identical_length:]):
                    if l in accented_vowels:
                        # print(lemma)
                        # print(stressed2unstressed(stressed_lemma))
                        # print(stressed_lemma[identical_length:])
                        print(lemma + " : " + stressed_lemma + " - " + msd)
                        stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:]

            # pass
            # if lemma != stressed2unstressed(stressed_lemma):
            #     print(lemma + " : " + stressed_lemma + " - " + msd)
        else:
            # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma)
            # print(lemma + " - " + msd)
            pass
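
        # Write the (possibly corrected) accented lemma back into the
        # 'zapis_oblike' feat of this entry's Lemma element.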
        for child in element:
            if child.tag == 'Lemma':
                for wf in child:
                    if 'att' in wf.attrib and wf.attrib['att'] == 'zapis_oblike':
                        wf.attrib['val'] = stressed_lemma
                        break
                    else:
                        print('Error1')
                break

        lemmas += 1
        # print(etree.tostring(element, encoding="UTF-8"))
        # myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
        if word_glob_num > word_limit:
            # print('Processed ' + str(word_glob_num) + ' words')
            end_timer = time.time()
            # print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
            word_limit += iter_num
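
        # Serialize the updated LexicalEntry to the output file and clear it
        # from memory so the iterparse pass stays memory-friendly.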
        myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
        element.clear()

print(lemmas)