From ad7d10563ec2edccf8ec253244212140dcbfd55e Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 27 Sep 2018 14:41:27 +0200 Subject: [PATCH] A couple of fixes --- .gitignore | 9 ++++++++ postprocessing/assign_stress2lemmas.py | 29 ++++++++++++++++++++++++-- prepare_data.py | 2 +- sloleks_accentuation2_tab2xml.py | 3 +-- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index d327e6a..d0a5416 100755 --- a/.gitignore +++ b/.gitignore @@ -98,5 +98,14 @@ grid_results/ .idea/ cnn/word_accetuation/svm/data/ postprocessing/data_merge.ipynb +data_merge.ipynb postprocessing/data_merge.py +data_merge.py postprocessing/sp_data_merge.py +sp_data_merge.py +postprocessing/data_merge_tab2xml.py +data_merge_tab2xml.py +postprocessing/data_merge_analysis.py +data_merge_analysis.py +postprocessing/sp_sloleks_data_merge.py +sp_sloleks_data_merge.py diff --git a/postprocessing/assign_stress2lemmas.py b/postprocessing/assign_stress2lemmas.py index 3bdbcb0..66d579b 100755 --- a/postprocessing/assign_stress2lemmas.py +++ b/postprocessing/assign_stress2lemmas.py @@ -140,9 +140,34 @@ with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "a or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \ or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\ or msd == "": + + # when lemma does not equal unstressed version of what is supposed to be lemma, try to find parts of the + # word that are equal and transfer stress to lemma (if possible) if lemma != stressed2unstressed(stressed_lemma): - print(lemma + " : " + stressed_lemma + " - " + msd) - pass + identical_length = 0 + # if lemma == 'Latkov': + # print('HERE') + for i in range(min(len(lemma), len(stressed2unstressed(stressed_lemma)))): + # a = list(lemma) + # b = list(stressed2unstressed(stressed_lemma)) + identical_length += 1 + if list(lemma)[i] != list(stressed2unstressed(stressed_lemma))[i]: + break + + + for l in list(stressed_lemma[identical_length:]): + if l in accented_vowels: + # print(lemma) + # print(stressed2unstressed(stressed_lemma)) + # print(stressed_lemma[identical_length:]) + print(lemma + " : " + stressed_lemma + " - " + msd) + stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:] + + + + # pass + # if lemma != stressed2unstressed(stressed_lemma): + # print(lemma + " : " + stressed_lemma + " - " + msd) else: # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma) # print(lemma + " - " + msd) diff --git a/prepare_data.py b/prepare_data.py index a626f38..50f640a 100755 --- a/prepare_data.py +++ b/prepare_data.py @@ -1771,7 +1771,7 @@ def convert_to_correct_stress(w): w = w.replace('à', 'ŕ') w = w.replace('ä', 'à') w = w.replace('ë', 'è') - # cor_content[i][3] = cor_content[i][3].replace('ě', 'ê') + w = w.replace('ě', 'ê') w = w.replace('î', 'ì') w = w.replace('ö', 'ò') w = w.replace('ü', 'ù') diff --git a/sloleks_accentuation2_tab2xml.py b/sloleks_accentuation2_tab2xml.py index 36a8612..64d96de 100755 --- a/sloleks_accentuation2_tab2xml.py +++ b/sloleks_accentuation2_tab2xml.py @@ -136,8 +136,7 @@ with open("data/new_sloleks/final_sloleks2.xml", "ab") as myfile: new_element = etree.Element('feat') new_element.attrib['att'] = 'SAMPA' print(accentuated_word) - if lemma == 'Barrymore': - print("HERE!") + new_element.attrib['val'] = convert_to_SAMPA(accentuated_word) wf.append(new_element)