Added some fixes to the conversion of Sloleks into the version with stressed (accentuated) words
This commit is contained in:
parent
902de059be
commit
36c8880bfe
|
@ -46,7 +46,7 @@ print('Commencing accentuator!')
|
||||||
|
|
||||||
rate = 100000
|
rate = 100000
|
||||||
start_timer = time.time()
|
start_timer = time.time()
|
||||||
with open("data/new_sloleks/new_sloleks.tab", "a") as myfile:
|
with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile:
|
||||||
for index in range(300000, len(new_content), rate):
|
for index in range(300000, len(new_content), rate):
|
||||||
if index+rate >= len(new_content):
|
if index+rate >= len(new_content):
|
||||||
words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)]
|
words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)]
|
||||||
|
|
|
@ -52,14 +52,14 @@ word_index = 0
|
||||||
|
|
||||||
# done_lexical_entries = 33522
|
# done_lexical_entries = 33522
|
||||||
data = Data('s', shuffle_all_inputs=False)
|
data = Data('s', shuffle_all_inputs=False)
|
||||||
accentuated_content = data._read_content('data/new_sloleks/new_sloleks.tab')
|
accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')
|
||||||
|
|
||||||
start_timer = time.time()
|
start_timer = time.time()
|
||||||
|
|
||||||
print('Copy initialization complete')
|
print('Copy initialization complete')
|
||||||
with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
with open("data/new_sloleks/final_sloleks2.xml", "ab") as myfile:
|
||||||
# myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
|
# myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
|
||||||
for event, element in etree.iterparse('data/new_sloleks/final_sloleks_read.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
||||||
# for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
# for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
||||||
# if word_glob_num >= word_limit:
|
# if word_glob_num >= word_limit:
|
||||||
# myfile2.close()
|
# myfile2.close()
|
||||||
|
@ -135,6 +135,7 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
||||||
|
|
||||||
new_element = etree.Element('feat')
|
new_element = etree.Element('feat')
|
||||||
new_element.attrib['att'] = 'SAMPA'
|
new_element.attrib['att'] = 'SAMPA'
|
||||||
|
print(accentuated_word)
|
||||||
new_element.attrib['val'] = convert_to_SAMPA(accentuated_word)
|
new_element.attrib['val'] = convert_to_SAMPA(accentuated_word)
|
||||||
wf.append(new_element)
|
wf.append(new_element)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user