Added text to speech script and text to SAMPA

2018-05-31 11:22:42 +02:00
parent 43a7866636
commit 4c968b2d0f
4 changed files with 285 additions and 0 deletions
@@ -5,6 +5,7 @@
 from lxml import etree
 import time
 from prepare_data import *
 from text2SAMPA import *
 # def xml_words_generator(xml_path):
 #     for event, element in etree.iterparse(xml_path, tag="LexicalEntry", encoding="UTF-8"):
@@ -130,6 +131,12 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
                        new_element.attrib['att'] = 'naglašena_beseda'
                        new_element.attrib['val'] = accentuated_word
                        wf.append(new_element)
                        new_element = etree.Element('feat')
                        new_element.attrib['att'] = 'SAMPA'
                        new_element.attrib['val'] = result = convert_to_SAMPA(accentuated_word)
                        wf.append(new_element)
                        word_glob_num += 1
                        # word_index += 1
@@ -0,0 +1,36 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 # Words processed: 650250
 # Word index: 50023
 # Word number: 50023
 from lxml import etree
 # Values that nothing in this script reads; kept so the module namespace stays the same.
 word_glob_num = 0
 word_index = 0
 word_limit = iter_num = 50000
 # Tallies printed once the whole lexicon has been streamed.
 accented_places = accented_words = enters = 0
 for _event, entry in etree.iterparse('data/new_sloleks/final_sloleks.xml', tag="LexicalEntry", encoding="UTF-8",
                                      remove_blank_text=True):
    for child in entry:
        for node in child:
            if node.tag != 'FormRepresentation':
                continue
            for feat in node:
                kind = feat.attrib['att']
                if kind == 'naglasna_mesta_besede':
                    accented_places += 1
                elif kind == 'naglašena_beseda':
                    accented_words += 1
                else:
                    continue
                # Count values of either feature that contain a line break.
                if '\n' in feat.attrib['val']:
                    enters += 1
    # Drop the processed entry's content before moving on to the next one.
    entry.clear()
 print(accented_places)
 print(accented_words)
 print(enters)
@@ -0,0 +1,237 @@
 from copy import copy
 import sys
 # Symbols treated as syllable nuclei by is_vowel/create_syllables and as vowels
 # by the context rules in convert_to_SAMPA.
 # 'ŕ' and 'ě' are included because syllable_stressed and the SAMPA table in
 # convert_to_SAMPA both handle them as accent-marked letters; without them a
 # syllable built around one of these letters is never recognised as such.
 vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü', 'a', 'e', 'i', 'o', 'u', 'O', 'E',
           'ŕ', 'ě']
 def syllable_stressed(syllable):
    """Tell whether *syllable* contains at least one accent-marked letter.

    :param syllable: iterable of letters (normally a string).
    :return: ``True`` if any letter is one of the accent-marked letters listed
        below, ``False`` otherwise (including for an empty syllable).
    """
    accent_marked = (u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü')
    return any(letter in accent_marked for letter in syllable)
 def is_vowel(word_list, position, vowels):
    """Decide whether the letter at *position* acts as a vowel.

    A letter counts as a vowel when it is listed in *vowels*.  An ``r``/``R``
    also counts when neither of its neighbours is listed in *vowels*; a missing
    neighbour (start or end of the word) is treated like a non-vowel.

    :param word_list: the word as a sequence of letters.
    :param position: index of the letter to examine.
    :param vowels: collection of letters regarded as vowels.
    :return: ``True`` or ``False``.
    """
    current = word_list[position]
    if current in vowels:
        return True
    if current not in (u'r', u'R'):
        return False
    # From here on the letter is an r: it is a vowel only between non-vowels.
    if position >= 1 and word_list[position - 1] in vowels:
        return False
    if position + 1 < len(word_list) and word_list[position + 1] in vowels:
        return False
    return True
 def get_voiced_consonants():
    """Return a new list with the symbols this module groups as voiced consonants."""
    return 'm n v l r j y w F N'.split()
 def get_resonant_silent_consonants():
    """Return a new list with the symbols this module groups as resonant silent consonants."""
    return 'b d z ž g'.split()
 def get_nonresonant_silent_consonants():
    """Return a new list with the symbols this module groups as non-resonant silent consonants."""
    return 'p t s š č k f h c x'.split()
 def split_consonants(consonants):
    """Divide a consonant cluster between the syllable it closes and the one it opens.

    Every adjacent pair in the cluster is inspected and, where one of the rules
    below applies, becomes a candidate break with a numeric rank.  The cluster
    is cut right after the left-hand member of the lowest-ranked candidate
    (the earliest one when ranks tie).

    Ranks, by left-hand member / right-hand member:
      -1  '-' or '_' followed by anything
       0  two identical symbols
       1  resonant silent / resonant silent
       2  voiced / resonant or non-resonant silent
       3  resonant silent / non-resonant silent
       4  non-resonant silent / resonant silent

    :param consonants: list of consonant symbols.
    :return: ``(left, right)``; ``left`` is ``['']`` when no break was found or
        the cluster has a single symbol, and both parts are ``['']`` for an
        empty cluster.
    """
    if len(consonants) == 0:
        return [''], ['']
    if len(consonants) == 1:
        return [''], consonants

    voiced = get_voiced_consonants()
    resonant_silent = get_resonant_silent_consonants()
    nonresonant_silent = get_nonresonant_silent_consonants()

    best_index = None
    best_rank = None
    for index, (left, right) in enumerate(zip(consonants, consonants[1:])):
        if left in ('-', '_'):
            rank = -1
        elif left == right:
            rank = 0
        elif left in voiced:
            rank = 2 if (right in resonant_silent or right in nonresonant_silent) else None
        elif left in resonant_silent:
            if right in resonant_silent:
                rank = 1
            elif right in nonresonant_silent:
                rank = 3
            else:
                rank = None
        elif left in nonresonant_silent:
            rank = 4 if right in resonant_silent else None
        else:
            rank = None
        # Strict comparison keeps the earliest candidate among equal ranks.
        if rank is not None and (best_rank is None or rank < best_rank):
            best_index, best_rank = index, rank

    if best_index is None:
        return [''], consonants
    cut = best_index + 1
    return consonants[:cut], consonants[cut:]
 def create_syllables(word, vowels):
    """Split *word* into syllables, one per vowel found by ``is_vowel``.

    Consonants in front of the first vowel stay with the first syllable.  Each
    later consonant cluster is lower-cased and shared out between the syllable
    before it and the one after it by ``split_consonants``.  Consonants after
    the last vowel are attached to the last syllable.

    :param word: the word to split (a string).
    :param vowels: collection of letters regarded as vowels.
    :return: list of syllable strings, or *word* itself (unchanged, not wrapped
        in a list) when it contains no vowel at all.
    """
    letters = list(word)
    pending = []  # consonants collected since the previous vowel
    result = []
    for index, letter in enumerate(letters):
        if not is_vowel(letters, index, vowels):
            pending.append(letter)
            continue
        if result:
            closing, opening = split_consonants(list(''.join(pending).lower()))
            result[-1] += ''.join(closing)
            opening.append(letter)
            result.append(''.join(opening))
        else:
            # First vowel: everything seen so far opens the first syllable.
            pending.append(letter)
            result.append(''.join(pending))
        pending = []
    if len(result) < 1:
        return word
    result[-1] += ''.join(pending)
    return result
 def convert_to_SAMPA(word):
    """Transcribe an accent-marked word into a SAMPA-style string.

    Steps:
      1. ``create_syllables`` splits the word; every letter of a syllable that
         ``syllable_stressed`` flags is remembered as stressed.
      2. Each letter is replaced by its symbol from the table below (letters
         not in the table are kept).
      3. A ``"`` is put in front of the first letter of every run of stressed
         letters.
      4. A left-to-right pass over neighbouring letters rewrites certain
         consonants depending on the letter that follows (and, for ``v``, the
         letter that precedes).  The last letter of the word is never
         rewritten itself.

    :param word: accent-marked word (a string).
    :return: the transcription as a string.  An empty word yields ``''`` and
        words too short to have a letter pair are returned after step 3;
        both used to raise ``IndexError``.
    """
    if len(word) == 0:
        return ''

    syllables = create_syllables(word, vowels)
    letters_in_stressed_syllable = [False] * len(word)
    offset = 0
    for syllable in syllables:
        if syllable_stressed(syllable):
            for i in range(len(syllable)):
                letters_in_stressed_syllable[offset + i] = True
        offset += len(syllable)

    sampa_symbols = {
        'e': 'E', 'o': 'O', 'š': 'S', 'ž': 'Z', 'h': 'x', 'c': 'ts', 'č': 'tS',
        'á': 'a:', 'ä': 'a', 'é': 'e:', 'ë': 'E', 'ě': 'E:', 'í': 'i:', 'î': 'i',
        'ó': 'o:', 'ô': 'O:', 'ö': 'O', 'ú': 'u:', 'ü': 'u', 'ŕ': '@r',
    }
    # One list element per original letter, so the stress flags still line up
    # even where a letter became a two-character symbol.
    word = [sampa_symbols.get(letter, letter) for letter in word]

    for i, stressed in enumerate(letters_in_stressed_syllable):
        if stressed and (i == 0 or not letters_in_stressed_syllable[i - 1]):
            word[i] = '"' + word[i]

    # From here on the word is handled character by character.
    word = list(''.join(word))

    # letter_i / next_letter_i address the pair being examined and
    # previous_letter_i the letter before it; a leading '"' and a ':' right
    # after the first letter are stepped over.
    previous_letter_i = -1
    letter_i = 0
    next_letter_i = 1
    if word[0] == '"':
        letter_i = 1
        if len(word) > 2 and word[2] == ':':
            if len(word) > 3:
                next_letter_i = 3
            else:
                return ''.join(word)
        else:
            next_letter_i = 2
    elif len(word) > 1 and word[1] == '"':
        next_letter_i = 2
    if next_letter_i >= len(word):
        # No letter pair to examine (e.g. a one-letter word).
        return ''.join(word)

    # Rewrites that depend only on the current letter and the one after it.
    pair_rules = {
        ('m', 'f'): 'F', ('m', 'v'): 'F',
        ('n', 'k'): 'N', ('n', 'g'): 'N', ('n', 'x'): 'N',
        ('n', 'f'): 'F', ('n', 'v'): 'F',
        ('p', 'm'): 'p_n', ('p', 'f'): 'p_f', ('p', 'v'): 'p_f',
        ('b', 'm'): 'b_n', ('b', 'f'): 'b_f', ('b', 'v'): 'b_f',
        ('t', 'l'): 't_l', ('t', 'n'): 't_n',
        ('d', 'l'): 'd_l', ('d', 'n'): 'd_n',
    }
    voiced = get_voiced_consonants()
    nonresonant_silent = get_nonresonant_silent_consonants()

    # Decisions read the untouched `word`; results are written to `new_word`.
    new_word = list(word)
    while True:
        current = word[letter_i]
        following = word[next_letter_i]
        replacement = pair_rules.get((current, following))
        if replacement is not None:
            new_word[letter_i] = replacement
        elif current in ('n', 'l') and following not in vowels and letter_i == len(word) - 2:
            # n / l as second-to-last character in front of a non-vowel.
            new_word[letter_i] = current + "'"
        elif current == 'v' and previous_letter_i >= 0 and word[previous_letter_i] not in vowels:
            if following in voiced:
                new_word[letter_i] = 'w'
            elif following in nonresonant_silent:
                new_word[letter_i] = 'W'

        lookahead = next_letter_i + 1
        if lookahead >= len(word):
            return ''.join(new_word)
        step = 1
        if word[lookahead] == ':' or word[lookahead] == '"':
            # The next index jumps over a ':' or '"' that follows the next letter.
            if lookahead + 1 >= len(word):
                return ''.join(new_word)
            step = 2
        previous_letter_i = letter_i
        letter_i = next_letter_i
        next_letter_i = next_letter_i + step
 if __name__ == '__main__':
    # Guarded so that importing this module (e.g. ``from text2SAMPA import *``)
    # only defines the functions instead of reading sys.argv and printing.
    if len(sys.argv) < 2:
        sys.exit('usage: python text2SAMPA.py WORD')
    result = convert_to_SAMPA(sys.argv[1])
    # The stress mark is printed as an apostrophe instead of a double quote
    # (presumably the form the speech synthesiser expects; confirm).
    final_result = result.replace('"', "'")
    print(final_result)
@@ -0,0 +1,5 @@
 #!/bin/sh
 # Quote the argument and the result so that spaces and glob characters reach
 # the converter and espeak unchanged.
 SAMPA=$(python text2SAMPA.py "$1")
 printf '%s\n' "$SAMPA"
 espeak -v en "[[$SAMPA]]"