|
|
|
@ -5,7 +5,7 @@ import re
|
|
|
|
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
import time
|
|
|
|
|
from prepare_data import *
|
|
|
|
|
# from prepare_data import *
|
|
|
|
|
|
|
|
|
|
# Accent-marked characters referenced by this script. Besides accented forms of
# a, e, i, o and u, the list also contains 'ŕ' (an accented r).
# NOTE(review): confirm 'ŕ' is intentional and not an encoding artefact of 'à'.
accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù']
|
|
|
|
|
|
|
|
|
@ -77,15 +77,15 @@ word_index = 0
|
|
|
|
|
# iter_index = 31
|
|
|
|
|
|
|
|
|
|
# done_lexical_entries = 33522
|
|
|
|
|
# Instantiate the project's Data helper with input shuffling switched off.
# NOTE(review): Data is presumably provided by the star import from
# prepare_data; this diff also shows both that import and this line in
# commented-out form, so verify which variant is current before relying on it.
data = Data('s', shuffle_all_inputs=False)
|
|
|
|
|
# data = Data('s', shuffle_all_inputs=False)
|
|
|
|
|
# accentuated_content = data._read_content('data/new_sloleks/final_sloleks2.tab')
|
|
|
|
|
|
|
|
|
|
# Wall-clock timestamp captured before the main processing begins.
start_timer = time.time()
|
|
|
|
|
# Counter initialised to zero.
# NOTE(review): presumably counts processed lemmas; the increment is not visible here.
lemmas = 0
|
|
|
|
|
# Progress message emitted once the setup above has finished.
print('Copy initialization complete')
|
|
|
|
|
# Output file is opened in binary append mode ("ab"): existing content is kept
# and new bytes are added at the end, so re-running the script without removing
# the file first will append to the earlier output.
# NOTE(review): this diff shows two variants of the statement that differ only
# in the output path; confirm which one is live.
with open("data/contextual_changes/accented_lemmas_final_sloleks2_small.xml", "ab") as myfile:
|
|
|
|
|
# Second variant of the same statement, targeting stressed_lemmas_sloleks2.xml.
with open("data/contextual_changes/stressed_lemmas_sloleks2.xml", "ab") as myfile:
|
|
|
|
|
# myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')
|
|
|
|
|
# Incrementally parse the lexicon XML with lxml, yielding only elements whose
# tag is "LexicalEntry"; remove_blank_text=True asks the parser to discard
# whitespace-only text between elements.
# NOTE(review): this diff shows two variants of the loop header that differ
# only in the input path; confirm which one is live.
for event, element in etree.iterparse('data/new_sloleks/final_sloleks2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
|
|
|
|
# Second variant of the same loop header, reading final_sloleks2_inhouse2S.xml.
for event, element in etree.iterparse('data/contextual_changes/final_sloleks2_inhouse2S.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
|
|
|
|
# for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True):
|
|
|
|
|
# if word_glob_num >= word_limit:
|
|
|
|
|
# myfile2.close()
|