# -*- coding: utf-8 -*-
"""Count the accent features stored in the generated Sloleks XML file.

Run as a script, this streams ``data/new_sloleks/final_sloleks.xml`` one
``LexicalEntry`` at a time and prints three numbers, one per line:

1. how many ``naglasna_mesta_besede`` features were found,
2. how many ``naglašena_beseda`` features were found,
3. how many of those features have a newline inside their ``val``.
"""
from __future__ import unicode_literals

# Figures noted in the original script (presumably from an earlier run):
# Words processed: 650250
# Word index: 50023
# Word number: 50023

XML_PATH = 'data/new_sloleks/final_sloleks.xml'
ACCENT_PLACES_ATT = 'naglasna_mesta_besede'
ACCENTED_WORD_ATT = 'naglašena_beseda'


def count_accent_features(entry):
    """Count the accent features below one ``LexicalEntry`` element.

    Only children of ``FormRepresentation`` elements that sit two levels
    below *entry* are inspected.  Children without an ``att`` attribute, or
    with an unrelated one, are skipped instead of raising ``KeyError``.

    Args:
        entry: an element-tree element (lxml or ``xml.etree``).

    Returns:
        A tuple ``(accented_places, accented_words, enters)`` where
        ``enters`` is the number of counted features whose ``val``
        contains a newline character.

    Raises:
        KeyError: if a counted accent feature has no ``val`` attribute.
    """
    accented_places = 0
    accented_words = 0
    enters = 0
    for child in entry:
        for form in child:
            if form.tag != 'FormRepresentation':
                continue
            for feature in form:
                att = feature.attrib.get('att')
                if att == ACCENT_PLACES_ATT:
                    accented_places += 1
                elif att == ACCENTED_WORD_ATT:
                    accented_words += 1
                else:
                    continue
                # A newline inside the value is what this check looks for.
                if '\n' in feature.attrib['val']:
                    enters += 1
    return accented_places, accented_words, enters


def main():
    """Stream the XML file and print the three totals."""
    # Imported here so count_accent_features() stays importable without lxml.
    from lxml import etree

    accented_places = 0
    accented_words = 0
    enters = 0
    for _event, element in etree.iterparse(XML_PATH, tag="LexicalEntry",
                                           encoding="UTF-8",
                                           remove_blank_text=True):
        places, words, newlines = count_accent_features(element)
        accented_places += places
        accented_words += words
        enters += newlines
        # Drop the entry's children once counted to keep memory use flat.
        element.clear()

    print(accented_places)
    print(accented_words)
    print(enters)


if __name__ == '__main__':
    main()
"""Convert accent-marked words to a SAMPA-style transcription.

Import the module for :func:`convert_to_SAMPA`, or run it as a script::

    python text2SAMPA.py WORD

which prints the transcription of ``WORD`` with every ``"`` replaced by
``'``.
"""
from copy import copy
import sys


# Characters treated as syllable nuclei.  'O' and 'E' are included because
# convert_to_SAMPA() consults this list after 'o'/'e' have been mapped.
vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü',
          'a', 'e', 'i', 'o', 'u', 'O', 'E']

# Accented letters whose presence marks a syllable as stressed.
_STRESSED_LETTERS = ('ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö',
                     'ú', 'ü')

# Letter -> SAMPA symbol(s); letters that are not listed are kept as they are.
_SAMPA_BY_LETTER = {
    'e': 'E', 'o': 'O', 'š': 'S', 'ž': 'Z', 'h': 'x', 'c': 'ts', 'č': 'tS',
    'á': 'a:', 'ä': 'a', 'é': 'e:', 'ë': 'E', 'ě': 'E:', 'í': 'i:', 'î': 'i',
    'ó': 'o:', 'ô': 'O:', 'ö': 'O', 'ú': 'u:', 'ü': 'u', 'ŕ': '@r',
}

# (symbol, following symbol) -> replacement for the first symbol.
_PAIR_REPLACEMENTS = {
    ('m', 'f'): 'F', ('m', 'v'): 'F',
    ('n', 'k'): 'N', ('n', 'g'): 'N', ('n', 'x'): 'N',
    ('n', 'f'): 'F', ('n', 'v'): 'F',
    ('p', 'm'): 'p_n', ('p', 'f'): 'p_f', ('p', 'v'): 'p_f',
    ('b', 'm'): 'b_n', ('b', 'f'): 'b_f', ('b', 'v'): 'b_f',
    ('t', 'l'): 't_l', ('t', 'n'): 't_n',
    ('d', 'l'): 'd_l', ('d', 'n'): 'd_n',
}


def syllable_stressed(syllable):
    """Return True if *syllable* contains one of the stressed accented letters."""
    return any(letter in _STRESSED_LETTERS for letter in syllable)


def is_vowel(word_list, position, vowels):
    """Return True if ``word_list[position]`` acts as a syllable nucleus.

    That is the case for every member of *vowels*, and for ``r``/``R`` when
    neither neighbour (where one exists) is a member of *vowels*.
    """
    letter = word_list[position]
    if letter in vowels:
        return True
    if letter not in ('r', 'R'):
        return False
    no_vowel_before = (position - 1 < 0
                       or word_list[position - 1] not in vowels)
    no_vowel_after = (position + 1 >= len(word_list)
                      or word_list[position + 1] not in vowels)
    return no_vowel_before and no_vowel_after


def get_voiced_consonants():
    """Return a fresh list of the symbols this module calls voiced consonants."""
    return ['m', 'n', 'v', 'l', 'r', 'j', 'y', 'w', 'F', 'N']


def get_resonant_silent_consonants():
    """Return a fresh list of the symbols this module calls resonant silent consonants."""
    return ['b', 'd', 'z', 'ž', 'g']


def get_nonresonant_silent_consonants():
    """Return a fresh list of the symbols this module calls non-resonant silent consonants."""
    return ['p', 't', 's', 'š', 'č', 'k', 'f', 'h', 'c', 'x']


def split_consonants(consonants):
    """Split a consonant cluster between two syllable nuclei.

    Args:
        consonants: list of the letters found between two nuclei.

    Returns:
        A tuple ``(left, right)``: *left* is appended to the previous
        syllable and *right* opens the next one.  With fewer than two
        consonants, or when no rule matches, *left* is ``['']`` and *right*
        is the whole cluster (``['']`` for an empty cluster).
    """
    if len(consonants) == 0:
        return [''], ['']
    if len(consonants) == 1:
        return [''], consonants

    voiced = get_voiced_consonants()
    resonant = get_resonant_silent_consonants()
    nonresonant = get_nonresonant_silent_consonants()

    # Each candidate is (index, rank); the lowest rank wins and ties go to
    # the leftmost candidate.
    split_options = []
    for i in range(len(consonants) - 1):
        current = consonants[i]
        following = consonants[i + 1]
        rank = None
        if current in ('-', '_'):
            rank = -1
        elif current == following:
            rank = 0
        elif current in voiced:
            if following in resonant or following in nonresonant:
                rank = 2
        elif current in resonant:
            if following in resonant:
                rank = 1
            elif following in nonresonant:
                rank = 3
        elif current in nonresonant:
            if following in resonant:
                rank = 4
        if rank is not None:
            split_options.append((i, rank))

    if not split_options:
        return [''], consonants
    cut = min(split_options, key=lambda option: option[1])[0] + 1
    return consonants[:cut], consonants[cut:]


def create_syllables(word, vowels):
    """Split *word* into syllables around the nuclei found by is_vowel().

    Consonants between two nuclei are lower-cased and distributed with
    split_consonants(); consonants before the first nucleus and after the
    last one stay with that syllable.

    Returns:
        A list of syllable strings.  When *word* has no nucleus at all the
        word itself is returned unchanged (a string rather than a list), so
        a caller iterating over the result then sees single characters.
    """
    word_list = list(word)
    pending = []
    syllables = []
    for i, letter in enumerate(word_list):
        if not is_vowel(word_list, i, vowels):
            pending.append(letter)
            continue
        if not syllables:
            pending.append(letter)
            syllables.append(''.join(pending))
        else:
            left, right = split_consonants(list(''.join(pending).lower()))
            syllables[-1] += ''.join(left)
            right.append(letter)
            syllables.append(''.join(right))
        pending = []
    if not syllables:
        return word
    syllables[-1] += ''.join(pending)
    return syllables


def convert_to_SAMPA(word):
    """Return the SAMPA-style transcription of an accent-marked *word*.

    The conversion runs in three passes:

    1. every letter is replaced through the letter -> symbol table;
    2. ``"`` is inserted in front of the first letter of each run of
       letters that belong to stressed syllables;
    3. neighbouring symbols are compared pairwise and the first of a pair
       is replaced where a rule applies (for example ``n`` before ``k``
       becomes ``N``).

    Args:
        word: the word as a string; may be empty.

    Returns:
        The transcription as a string.  An empty word yields ``''``, and a
        word too short to contain a pair of symbols is returned after
        pass 2 (both cases used to raise ``IndexError``).
    """
    if not word:
        return ''

    syllables = create_syllables(word, vowels)
    in_stressed = [False] * len(word)
    offset = 0
    for syllable in syllables:
        if syllable_stressed(syllable):
            for i in range(len(syllable)):
                in_stressed[offset + i] = True
        offset += len(syllable)

    symbols = [_SAMPA_BY_LETTER.get(letter, letter) for letter in word]
    for i, stressed in enumerate(in_stressed):
        if stressed and (i == 0 or not in_stressed[i - 1]):
            symbols[i] = '"' + symbols[i]

    # Work on single characters from here on; multi-character symbols such
    # as 'ts' or 'a:' are flattened.
    chars = list(''.join(symbols))

    # Position the two cursors on the first two symbols, stepping over a
    # leading '"' and a ':' that directly follows the first symbol.
    previous_letter_i = -1
    letter_i = 0
    next_letter_i = 1
    if chars[0] == '"':
        letter_i = 1
        if len(chars) > 2 and chars[2] == ':':
            if len(chars) <= 3:
                return ''.join(chars)
            next_letter_i = 3
        else:
            next_letter_i = 2
    elif len(chars) > 1 and chars[1] == '"':
        next_letter_i = 2
    if next_letter_i >= len(chars):
        # Nothing follows the first symbol, so there is no pair to inspect.
        return ''.join(chars)

    voiced = get_voiced_consonants()
    nonresonant = get_nonresonant_silent_consonants()
    # Rules read the unmodified characters and write into a copy so that an
    # earlier replacement never influences a later comparison.
    new_chars = copy(chars)
    while True:
        current = chars[letter_i]
        following = chars[next_letter_i]
        replacement = _PAIR_REPLACEMENTS.get((current, following))
        if replacement is None:
            if current in ('n', 'l'):
                # Only applies to the second-to-last character of the word.
                if following not in vowels and letter_i == len(chars) - 2:
                    replacement = current + '\''
            elif (current == 'v' and previous_letter_i >= 0
                    and chars[previous_letter_i] not in vowels):
                if following in voiced:
                    replacement = 'w'
                elif following in nonresonant:
                    replacement = 'W'
        if replacement is not None:
            new_chars[letter_i] = replacement

        # Advance both cursors, skipping one ':' or '"' after the next symbol.
        if len(chars) <= next_letter_i + 1:
            return ''.join(new_chars)
        if chars[next_letter_i + 1] in (':', '"'):
            if len(chars) <= next_letter_i + 2:
                return ''.join(new_chars)
            step = 2
        else:
            step = 1
        previous_letter_i = letter_i
        letter_i = next_letter_i
        next_letter_i += step


if __name__ == '__main__':
    # Guarded so that importing the module does not read sys.argv.
    if len(sys.argv) < 2:
        sys.exit('usage: python text2SAMPA.py WORD')
    result = convert_to_SAMPA(sys.argv[1])
    final_result = result.replace('"', '\'')
    print(final_result)