Added fix for missing stress data in tab format

This commit is contained in:
Luka 2018-08-22 08:46:51 +02:00
parent 36c8880bfe
commit d09b5a8293
3 changed files with 10 additions and 7 deletions

View File

@@ -47,7 +47,7 @@ print('Commencing accentuator!')
rate = 100000 rate = 100000
start_timer = time.time() start_timer = time.time()
with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile: with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile:
for index in range(300000, len(new_content), rate): for index in range(0, len(new_content), rate):
if index+rate >= len(new_content): if index+rate >= len(new_content):
words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)] words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)]
else: else:

View File

@@ -136,6 +136,8 @@ with open("data/new_sloleks/final_sloleks2.xml", "ab") as myfile:
new_element = etree.Element('feat') new_element = etree.Element('feat')
new_element.attrib['att'] = 'SAMPA' new_element.attrib['att'] = 'SAMPA'
print(accentuated_word) print(accentuated_word)
if lemma == 'Barrymore':
print("HERE!")
new_element.attrib['val'] = convert_to_SAMPA(accentuated_word) new_element.attrib['val'] = convert_to_SAMPA(accentuated_word)
wf.append(new_element) wf.append(new_element)

View File

@@ -5,7 +5,8 @@ import sys
vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü', 'a', 'e', 'i', 'o', 'u', 'O', 'E'] vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü', 'a', 'e', 'i', 'o', 'u', 'O', 'E']
def syllable_stressed(syllable): def syllable_stressed(syllable):
stressed_letters = [u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü'] # stressed_letters = [u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
stressed_letters = [u'ŕ', u'á', u'à', u'é', u'è', u'ê', u'í', u'ì', u'ó', u'ô', u'ò', u'ú', u'ù']
for letter in syllable: for letter in syllable:
if letter in stressed_letters: if letter in stressed_letters:
return True return True
@@ -116,13 +117,13 @@ def convert_to_SAMPA(word):
word[i] = 'tS' word[i] = 'tS'
elif word[i] == 'á': elif word[i] == 'á':
word[i] = 'a:' word[i] = 'a:'
elif word[i] == 'ä': elif word[i] == 'à':
word[i] = 'a' word[i] = 'a'
elif word[i] == 'é': elif word[i] == 'é':
word[i] = 'e:' word[i] = 'e:'
elif word[i] == 'ë': elif word[i] == 'è':
word[i] = 'E' word[i] = 'E'
elif word[i] == 'ě': elif word[i] == 'ê':
word[i] = 'E:' word[i] = 'E:'
elif word[i] == 'í': elif word[i] == 'í':
word[i] = 'i:' word[i] = 'i:'
@@ -132,11 +133,11 @@ def convert_to_SAMPA(word):
word[i] = 'o:' word[i] = 'o:'
elif word[i] == 'ô': elif word[i] == 'ô':
word[i] = 'O:' word[i] = 'O:'
elif word[i] == 'ö': elif word[i] == 'ò':
word[i] = 'O' word[i] = 'O'
elif word[i] == 'ú': elif word[i] == 'ú':
word[i] = 'u:' word[i] = 'u:'
elif word[i] == 'ü': elif word[i] == 'ù':
word[i] = 'u' word[i] = 'u'
elif word[i] == 'ŕ': elif word[i] == 'ŕ':
word[i] = '@r' word[i] = '@r'