From d09b5a8293d08647649cc950617ffdf9c92e3f55 Mon Sep 17 00:00:00 2001 From: Luka Date: Wed, 22 Aug 2018 08:46:51 +0200 Subject: [PATCH] Added fix for lacking stressed data in tab form --- sloleks_accentuation2.py | 2 +- sloleks_accentuation2_tab2xml.py | 2 ++ text2SAMPA.py | 13 +++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sloleks_accentuation2.py b/sloleks_accentuation2.py index cd00c6d..ebf3f9a 100755 --- a/sloleks_accentuation2.py +++ b/sloleks_accentuation2.py @@ -47,7 +47,7 @@ print('Commencing accentuator!') rate = 100000 start_timer = time.time() with open("data/new_sloleks/new_sloleks2.tab", "a") as myfile: - for index in range(300000, len(new_content), rate): + for index in range(0, len(new_content), rate): if index+rate >= len(new_content): words = [[el[0], '', el[2], el[0]] for el in new_content][index:len(new_content)] else: diff --git a/sloleks_accentuation2_tab2xml.py b/sloleks_accentuation2_tab2xml.py index 7ae8ab8..36a8612 100755 --- a/sloleks_accentuation2_tab2xml.py +++ b/sloleks_accentuation2_tab2xml.py @@ -136,6 +136,8 @@ with open("data/new_sloleks/final_sloleks2.xml", "ab") as myfile: new_element = etree.Element('feat') new_element.attrib['att'] = 'SAMPA' print(accentuated_word) + if lemma == 'Barrymore': + print("HERE!") new_element.attrib['val'] = convert_to_SAMPA(accentuated_word) wf.append(new_element) diff --git a/text2SAMPA.py b/text2SAMPA.py index 1a3000a..1d18030 100755 --- a/text2SAMPA.py +++ b/text2SAMPA.py @@ -5,7 +5,8 @@ import sys vowels = ['à', 'á', 'ä', 'é', 'ë', 'ì', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü', 'a', 'e', 'i', 'o', 'u', 'O', 'E'] def syllable_stressed(syllable): - stressed_letters = [u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü'] + # stressed_letters = [u'ŕ', u'á', u'ä', u'é', u'ë', u'ě', u'í', u'î', u'ó', u'ô', u'ö', u'ú', u'ü'] + stressed_letters = [u'ŕ', u'á', u'à', u'é', u'è', u'ê', u'í', u'ì', u'ó', u'ô', u'ò', u'ú', u'ù'] for letter in syllable: if letter in stressed_letters: return True @@ -116,13 +117,13 @@ def convert_to_SAMPA(word): word[i] = 'tS' elif word[i] == 'á': word[i] = 'a:' - elif word[i] == 'ä': + elif word[i] == 'à': word[i] = 'a' elif word[i] == 'é': word[i] = 'e:' - elif word[i] == 'ë': + elif word[i] == 'è': word[i] = 'E' - elif word[i] == 'ě': + elif word[i] == 'ê': word[i] = 'E:' elif word[i] == 'í': word[i] = 'i:' @@ -132,11 +133,11 @@ def convert_to_SAMPA(word): word[i] = 'o:' elif word[i] == 'ô': word[i] = 'O:' - elif word[i] == 'ö': + elif word[i] == 'ò': word[i] = 'O' elif word[i] == 'ú': word[i] = 'u:' - elif word[i] == 'ü': + elif word[i] == 'ù': word[i] = 'u' elif word[i] == 'ŕ': word[i] = '@r'