A couple of fixes

This commit is contained in:
2018-09-27 14:41:27 +02:00
parent d4ea584fc4
commit ad7d10563e
4 changed files with 38 additions and 5 deletions

View File

@@ -140,9 +140,34 @@ with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "a
or re.match(r'K.*', msd) or re.match(r'D.', msd) or re.match(r'L', msd) or re.match(r'M', msd) \
or re.match(r'O', msd) or re.match(r'Z.*', msd) or re.match(r'V.', msd) or re.match(r'Rsr.', msd)\
or msd == "":
# When the lemma does not equal the unstressed version of what is supposed to be the lemma, find the leading
# part of the two words that is identical and transfer the stress from that part to the lemma (if possible).
if lemma != stressed2unstressed(stressed_lemma):
print(lemma + " : " + stressed_lemma + " - " + msd)
pass
identical_length = 0
# if lemma == 'Latkov':
# print('HERE')
for i in range(min(len(lemma), len(stressed2unstressed(stressed_lemma)))):
# a = list(lemma)
# b = list(stressed2unstressed(stressed_lemma))
identical_length += 1
if list(lemma)[i] != list(stressed2unstressed(stressed_lemma))[i]:
break
for l in list(stressed_lemma[identical_length:]):
if l in accented_vowels:
# print(lemma)
# print(stressed2unstressed(stressed_lemma))
# print(stressed_lemma[identical_length:])
print(lemma + " : " + stressed_lemma + " - " + msd)
stressed_lemma = stressed_lemma[:identical_length] + lemma[identical_length:]
# pass
# if lemma != stressed2unstressed(stressed_lemma):
# print(lemma + " : " + stressed_lemma + " - " + msd)
else:
# print("Error2 - " + msd + " " + lemma + " - " + stressed_lemma)
# print(lemma + " - " + msd)