Added text to speech script and text to SAMPA
This commit is contained in:
parent
43a7866636
commit
4c968b2d0f
|
@ -5,6 +5,7 @@
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import time
|
import time
|
||||||
from prepare_data import *
|
from prepare_data import *
|
||||||
|
from text2SAMPA import *
|
||||||
|
|
||||||
# def xml_words_generator(xml_path):
|
# def xml_words_generator(xml_path):
|
||||||
# for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
|
# for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
|
||||||
|
@ -130,6 +131,12 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
||||||
new_element.attrib['att'] = 'naglašena_beseda'
|
new_element.attrib['att'] = 'naglašena_beseda'
|
||||||
new_element.attrib['val'] = accentuated_word
|
new_element.attrib['val'] = accentuated_word
|
||||||
wf.append(new_element)
|
wf.append(new_element)
|
||||||
|
|
||||||
|
new_element = etree.Element('feat')
|
||||||
|
new_element.attrib['att'] = 'SAMPA'
|
||||||
|
new_element.attrib['val'] = result = convert_to_SAMPA(accentuated_word)
|
||||||
|
wf.append(new_element)
|
||||||
|
|
||||||
word_glob_num += 1
|
word_glob_num += 1
|
||||||
# word_index += 1
|
# word_index += 1
|
||||||
|
|
||||||
|
|
36
sloleks_xml_checker.py
Normal file
36
sloleks_xml_checker.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Words proccesed: 650250
|
||||||
|
# Word indeks: 50023
|
||||||
|
# Word number: 50023
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
word_glob_num = 0
|
||||||
|
word_limit = 50000
|
||||||
|
iter_num = 50000
|
||||||
|
word_index = 0
|
||||||
|
accented_places = 0
|
||||||
|
accented_words = 0
|
||||||
|
enters = 0
|
||||||
|
|
||||||
|
for event, element in etree.iterparse('data/new_sloleks/final_sloleks.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
||||||
|
for child in element:
|
||||||
|
for wf in child:
|
||||||
|
if wf.tag == 'FormRepresentation':
|
||||||
|
for form_rep in wf:
|
||||||
|
if form_rep.attrib['att'] == 'naglasna_mesta_besede':
|
||||||
|
accented_places += 1
|
||||||
|
if '\n' in list(form_rep.attrib['val']):
|
||||||
|
enters += 1
|
||||||
|
if form_rep.attrib['att'] == 'naglašena_beseda':
|
||||||
|
accented_words += 1
|
||||||
|
if '\n' in list(form_rep.attrib['val']):
|
||||||
|
enters += 1
|
||||||
|
|
||||||
|
element.clear()
|
||||||
|
|
||||||
|
print(accented_places)
|
||||||
|
print(accented_words)
|
||||||
|
print(enters)
|
237
text2SAMPA.py
Normal file
237
text2SAMPA.py
Normal file
|
@ -0,0 +1,237 @@
|
||||||
|
from copy import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü', 'a', 'e', 'i', 'o', 'u', 'O', 'E']
|
||||||
|
|
||||||
|
def syllable_stressed(syllable):
|
||||||
|
stressed_letters = [u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü']
|
||||||
|
for letter in syllable:
|
||||||
|
if letter in stressed_letters:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_vowel(word_list, position, vowels):
|
||||||
|
if word_list[position] in vowels:
|
||||||
|
return True
|
||||||
|
if (word_list[position] == u'r' or word_list[position] == u'R') and (position - 1 < 0 or word_list[position - 1] not in vowels) and (
|
||||||
|
position + 1 >= len(word_list) or word_list[position + 1] not in vowels):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get_voiced_consonants():
|
||||||
|
return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w', 'F', 'N']
|
||||||
|
|
||||||
|
def get_resonant_silent_consonants():
|
||||||
|
return ['b', 'd', 'z', 'ž', 'g']
|
||||||
|
|
||||||
|
def get_nonresonant_silent_consonants():
|
||||||
|
return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c', 'x']
|
||||||
|
|
||||||
|
def split_consonants(consonants):
|
||||||
|
voiced_consonants = get_voiced_consonants()
|
||||||
|
resonant_silent_consonants = get_resonant_silent_consonants()
|
||||||
|
unresonant_silent_consonants = get_nonresonant_silent_consonants()
|
||||||
|
if len(consonants) == 0:
|
||||||
|
return [''], ['']
|
||||||
|
elif len(consonants) == 1:
|
||||||
|
return [''], consonants
|
||||||
|
else:
|
||||||
|
split_options = []
|
||||||
|
for i in range(len(consonants) - 1):
|
||||||
|
if consonants[i] == '-' or consonants[i] == '_':
|
||||||
|
split_options.append([i, -1])
|
||||||
|
elif consonants[i] == consonants[i + 1]:
|
||||||
|
split_options.append([i, 0])
|
||||||
|
elif consonants[i] in voiced_consonants:
|
||||||
|
if consonants[i + 1] in resonant_silent_consonants or consonants[i + 1] in unresonant_silent_consonants:
|
||||||
|
split_options.append([i, 2])
|
||||||
|
elif consonants[i] in resonant_silent_consonants:
|
||||||
|
if consonants[i + 1] in resonant_silent_consonants:
|
||||||
|
split_options.append([i, 1])
|
||||||
|
elif consonants[i + 1] in unresonant_silent_consonants:
|
||||||
|
split_options.append([i, 3])
|
||||||
|
elif consonants[i] in unresonant_silent_consonants:
|
||||||
|
if consonants[i + 1] in resonant_silent_consonants:
|
||||||
|
split_options.append([i, 4])
|
||||||
|
|
||||||
|
if split_options == []:
|
||||||
|
return [''], consonants
|
||||||
|
else:
|
||||||
|
split = min(split_options, key=lambda x: x[1])
|
||||||
|
return consonants[:split[0] + 1], consonants[split[0] + 1:]
|
||||||
|
|
||||||
|
def create_syllables(word, vowels):
|
||||||
|
word_list = list(word)
|
||||||
|
consonants = []
|
||||||
|
syllables = []
|
||||||
|
for i in range(len(word_list)):
|
||||||
|
if is_vowel(word_list, i, vowels):
|
||||||
|
if syllables == []:
|
||||||
|
consonants.append(word_list[i])
|
||||||
|
syllables.append(''.join(consonants))
|
||||||
|
else:
|
||||||
|
left_consonants, right_consonants = split_consonants(list(''.join(consonants).lower()))
|
||||||
|
syllables[-1] += ''.join(left_consonants)
|
||||||
|
right_consonants.append(word_list[i])
|
||||||
|
syllables.append(''.join(right_consonants))
|
||||||
|
consonants = []
|
||||||
|
else:
|
||||||
|
consonants.append(word_list[i])
|
||||||
|
if len(syllables) < 1:
|
||||||
|
return word
|
||||||
|
syllables[-1] += ''.join(consonants)
|
||||||
|
|
||||||
|
return syllables
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_SAMPA(word):
|
||||||
|
syllables = create_syllables(word, vowels)
|
||||||
|
letters_in_stressed_syllable = [False] * len(word)
|
||||||
|
# print(syllables)
|
||||||
|
l = 0
|
||||||
|
for syllable in syllables:
|
||||||
|
if syllable_stressed(syllable):
|
||||||
|
for i in range(len(syllable)):
|
||||||
|
letters_in_stressed_syllable[l + i] = True
|
||||||
|
# print(l)
|
||||||
|
l += len(syllable)
|
||||||
|
previous_letter = ''
|
||||||
|
word = list(word)
|
||||||
|
for i in range(len(word)):
|
||||||
|
if word[i] == 'e':
|
||||||
|
word[i] = 'E'
|
||||||
|
elif word[i] == 'o':
|
||||||
|
word[i] = 'O'
|
||||||
|
elif word[i] == 'š':
|
||||||
|
word[i] = 'S'
|
||||||
|
elif word[i] == 'ž':
|
||||||
|
word[i] = 'Z'
|
||||||
|
elif word[i] == 'h':
|
||||||
|
word[i] = 'x'
|
||||||
|
elif word[i] == 'c':
|
||||||
|
word[i] = 'ts'
|
||||||
|
elif word[i] == 'č':
|
||||||
|
word[i] = 'tS'
|
||||||
|
elif word[i] == 'á':
|
||||||
|
word[i] = 'a:'
|
||||||
|
elif word[i] == 'ä':
|
||||||
|
word[i] = 'a'
|
||||||
|
elif word[i] == 'é':
|
||||||
|
word[i] = 'e:'
|
||||||
|
elif word[i] == 'ë':
|
||||||
|
word[i] = 'E'
|
||||||
|
elif word[i] == 'ě':
|
||||||
|
word[i] = 'E:'
|
||||||
|
elif word[i] == 'í':
|
||||||
|
word[i] = 'i:'
|
||||||
|
elif word[i] == 'î':
|
||||||
|
word[i] = 'i'
|
||||||
|
elif word[i] == 'ó':
|
||||||
|
word[i] = 'o:'
|
||||||
|
elif word[i] == 'ô':
|
||||||
|
word[i] = 'O:'
|
||||||
|
elif word[i] == 'ö':
|
||||||
|
word[i] = 'O'
|
||||||
|
elif word[i] == 'ú':
|
||||||
|
word[i] = 'u:'
|
||||||
|
elif word[i] == 'ü':
|
||||||
|
word[i] = 'u'
|
||||||
|
elif word[i] == 'ŕ':
|
||||||
|
word[i] = '@r'
|
||||||
|
|
||||||
|
if letters_in_stressed_syllable[0]:
|
||||||
|
word[0] = '\"' + word[0]
|
||||||
|
for i in range(1, len(letters_in_stressed_syllable)):
|
||||||
|
if not letters_in_stressed_syllable[i - 1] and letters_in_stressed_syllable[i]:
|
||||||
|
word[i] = '\"' + word[i]
|
||||||
|
# if letters_in_stressed_syllable[i - 1] and not letters_in_stressed_syllable[i]:
|
||||||
|
# word[i - 1] = word[i - 1] + ':'
|
||||||
|
# if letters_in_stressed_syllable[-1]:
|
||||||
|
# word[-1] = word[-1] + ':'
|
||||||
|
|
||||||
|
word = list(''.join(word))
|
||||||
|
|
||||||
|
previous_letter_i = -1
|
||||||
|
letter_i = 0
|
||||||
|
next_letter_i = 1
|
||||||
|
if word[0] == '\"':
|
||||||
|
letter_i = 1
|
||||||
|
if word[2] == ':':
|
||||||
|
if len(word) > 3:
|
||||||
|
next_letter_i = 3
|
||||||
|
else:
|
||||||
|
#if word[next_letter_i] == 'l':
|
||||||
|
# word[next_letter_i] = 'l\''
|
||||||
|
#elif word[next_letter_i] == 'n':
|
||||||
|
# word[next_letter_i] = 'n\''
|
||||||
|
return ''.join(word)
|
||||||
|
else:
|
||||||
|
next_letter_i = 2
|
||||||
|
elif len(word) > 1 and word[1] == '\"':
|
||||||
|
next_letter_i = 2
|
||||||
|
# {('m', 'f'): 'F'}
|
||||||
|
|
||||||
|
new_word = copy(word)
|
||||||
|
while True:
|
||||||
|
if word[letter_i] == 'm' and (word[next_letter_i] == 'f' or word[next_letter_i] == 'v'):
|
||||||
|
new_word[letter_i] = 'F'
|
||||||
|
elif word[letter_i] == 'n' and (word[next_letter_i] == 'k' or word[next_letter_i] == 'g' or word[next_letter_i] == 'x'):
|
||||||
|
new_word[letter_i] = 'N'
|
||||||
|
elif word[letter_i] == 'n' and (word[next_letter_i] == 'f' or word[next_letter_i] == 'v'):
|
||||||
|
new_word[letter_i] = 'F'
|
||||||
|
elif word[letter_i] == 'n' and not word[next_letter_i] in vowels and letter_i == len(word) - 2:
|
||||||
|
new_word[letter_i] = 'n\''
|
||||||
|
elif word[letter_i] == 'l' and not word[next_letter_i] in vowels and letter_i == len(word) - 2:
|
||||||
|
new_word[letter_i] = 'l\''
|
||||||
|
elif previous_letter_i >= 0 and word[letter_i] == 'v' and not word[previous_letter_i] in vowels and word[
|
||||||
|
next_letter_i] in get_voiced_consonants():
|
||||||
|
new_word[letter_i] = 'w'
|
||||||
|
elif previous_letter_i >= 0 and word[letter_i] == 'v' and not word[previous_letter_i] in vowels and word[
|
||||||
|
next_letter_i] in get_nonresonant_silent_consonants():
|
||||||
|
new_word[letter_i] = 'W'
|
||||||
|
elif word[letter_i] == 'p' and word[next_letter_i] == 'm':
|
||||||
|
new_word[letter_i] = 'p_n'
|
||||||
|
elif word[letter_i] == 'p' and (word[next_letter_i] == 'f' or word[next_letter_i] == 'v'):
|
||||||
|
new_word[letter_i] = 'p_f'
|
||||||
|
elif word[letter_i] == 'b' and word[next_letter_i] == 'm':
|
||||||
|
new_word[letter_i] = 'b_n'
|
||||||
|
elif word[letter_i] == 'b' and (word[next_letter_i] == 'f' or word[next_letter_i] == 'v'):
|
||||||
|
new_word[letter_i] = 'b_f'
|
||||||
|
elif word[letter_i] == 't' and word[next_letter_i] == 'l':
|
||||||
|
new_word[letter_i] = 't_l'
|
||||||
|
elif word[letter_i] == 't' and word[next_letter_i] == 'n':
|
||||||
|
new_word[letter_i] = 't_n'
|
||||||
|
elif word[letter_i] == 'd' and word[next_letter_i] == 'l':
|
||||||
|
new_word[letter_i] = 'd_l'
|
||||||
|
elif word[letter_i] == 'd' and word[next_letter_i] == 'n':
|
||||||
|
new_word[letter_i] = 'd_n'
|
||||||
|
|
||||||
|
if len(word) > next_letter_i + 1:
|
||||||
|
if word[next_letter_i + 1] == ':' or word[next_letter_i + 1] == '\"':
|
||||||
|
if len(word) > next_letter_i + 2:
|
||||||
|
previous_letter_i = letter_i
|
||||||
|
letter_i = next_letter_i
|
||||||
|
next_letter_i = next_letter_i + 2
|
||||||
|
else:
|
||||||
|
#if word[next_letter_i] == 'l':
|
||||||
|
# new_word[next_letter_i] = 'l\''
|
||||||
|
#elif word[next_letter_i] == 'n':
|
||||||
|
# new_word[next_letter_i] = 'n\''
|
||||||
|
return ''.join(new_word)
|
||||||
|
else:
|
||||||
|
previous_letter_i = letter_i
|
||||||
|
letter_i = next_letter_i
|
||||||
|
next_letter_i = next_letter_i + 1
|
||||||
|
else:
|
||||||
|
#if word[next_letter_i] == 'l':
|
||||||
|
# new_word[next_letter_i] = 'l\''
|
||||||
|
#elif word[next_letter_i] == 'n':
|
||||||
|
# new_word[next_letter_i] = 'n\''
|
||||||
|
return ''.join(new_word)
|
||||||
|
# print(word)
|
||||||
|
|
||||||
|
result = convert_to_SAMPA(sys.argv[1])
|
||||||
|
final_result = result.replace('\"', '\'')
|
||||||
|
print(final_result)
|
||||||
|
#return final_result
|
Loading…
Reference in New Issue
Block a user