From ad7d10563ec2edccf8ec253244212140dcbfd55e Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Thu, 27 Sep 2018 14:41:27 +0200
Subject: [PATCH] Transfer stress to mismatched lemmas, enable e-caron conversion, drop debug print

---
 .gitignore                             |  9 ++++++++
 postprocessing/assign_stress2lemmas.py | 29 ++++++++++++++++++++++++--
 prepare_data.py                        |  2 +-
 sloleks_accentuation2_tab2xml.py       |  3 +--
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index d327e6a..d0a5416 100755
--- a/.gitignore
+++ b/.gitignore
@@ -98,5 +98,14 @@ grid_results/
 .idea/
 cnn/word_accetuation/svm/data/
 postprocessing/data_merge.ipynb
+data_merge.ipynb
 postprocessing/data_merge.py
+data_merge.py
 postprocessing/sp_data_merge.py
+sp_data_merge.py
+postprocessing/data_merge_tab2xml.py
+data_merge_tab2xml.py
+postprocessing/data_merge_analysis.py
+data_merge_analysis.py
+postprocessing/sp_sloleks_data_merge.py
+sp_sloleks_data_merge.py
diff --git a/postprocessing/assign_stress2lemmas.py b/postprocessing/assign_stress2lemmas.py
index 3bdbcb0..66d579b 100755
--- a/postprocessing/assign_stress2lemmas.py
+++ b/postprocessing/assign_stress2lemmas.py
@@ -140,9 +140,34 @@ with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "a
                 or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \
                 or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\
                 or msd == "":
+
+            # When lemma differs from the unstressed form of its candidate stressed lemma, keep the stress marks of the
+            # prefix the two words share and take the rest of the word from lemma (transfers stress where possible).
             if lemma != stressed2unstressed(stressed_lemma):
-                print(lemma + " : " + stressed_lemma + " - " + msd)
-            pass
+                unstressed_lemma = stressed2unstressed(stressed_lemma)
+                # Length of the prefix on which lemma and the unstressed candidate agree. The counter is advanced
+                # only after a successful comparison, so the first differing character is not counted.
+                identical_length = 0
+                for lemma_char, unstressed_char in zip(lemma, unstressed_lemma):
+                    if lemma_char != unstressed_char:
+                        break
+                    identical_length += 1
+
+                # Report words whose dropped tail carried an accent, because that stress mark is lost below.
+                for char in stressed_lemma[identical_length:]:
+                    if char in accented_vowels:
+                        print(lemma + " : " + stressed_lemma + " - " + msd)
+
+                # Keep the stressed prefix and complete the word with the remaining characters of lemma.
+                # NOTE(review): slicing stressed_lemma by this index assumes stressed2unstressed() maps characters
+                # one-to-one (same length and positions) - confirm.
+                stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:]
+
+
+
+            # pass
+            # if lemma != stressed2unstressed(stressed_lemma):
+            #     print(lemma + " : " + stressed_lemma + " - " + msd)
         else:
             # print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma)
             # print(lemma + " - " + msd)
diff --git a/prepare_data.py b/prepare_data.py
index a626f38..50f640a 100755
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -1771,7 +1771,7 @@ def convert_to_correct_stress(w):
     w = w.replace('à', 'ŕ')
     w = w.replace('ä', 'à')
     w = w.replace('ë', 'è')
-    # cor_content[i][3] = cor_content[i][3].replace('ě', 'ê')
+    w = w.replace('ě', 'ê')
     w = w.replace('î', 'ì')
     w = w.replace('ö', 'ò')
     w = w.replace('ü', 'ù')
diff --git a/sloleks_accentuation2_tab2xml.py b/sloleks_accentuation2_tab2xml.py
index 36a8612..64d96de 100755
--- a/sloleks_accentuation2_tab2xml.py
+++ b/sloleks_accentuation2_tab2xml.py
@@ -136,8 +136,7 @@ with open("data/new_sloleks/final_sloleks2.xml", "ab") as myfile:
                         new_element = etree.Element('feat')
                         new_element.attrib['att'] = 'SAMPA'
                         print(accentuated_word)
-                        if lemma == 'Barrymore':
-                            print("HERE!")
+
                         new_element.attrib['val'] = convert_to_SAMPA(accentuated_word)
                         wf.append(new_element)