Added tab2xml conversion modifications
This commit is contained in:
parent
524ceeb4b6
commit
43a7866636
|
@ -30,8 +30,8 @@ from prepare_data import *
|
|||
#
|
||||
# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
|
||||
word_glob_num = 0
|
||||
word_limit = 1000
|
||||
iter_num = 1000
|
||||
word_limit = 50000
|
||||
iter_num = 50000
|
||||
word_index = 0
|
||||
|
||||
# iter_index = 0
|
||||
|
@ -107,7 +107,15 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
|||
break
|
||||
word_index = (word_index + 1) % len(accentuated_content)
|
||||
|
||||
if word_index == word_index_sp:
|
||||
error = word_index == word_index_sp
|
||||
if word_index == word_index_sp and word == accentuated_content[word_index][0] and msd == accentuated_content[word_index][2] \
|
||||
and lemma == accentuated_content[word_index][1]:
|
||||
accentuated_word_location = accentuated_content[word_index][4]
|
||||
accentuated_word = accentuated_content[word_index][5][:-1]
|
||||
error = False
|
||||
del(accentuated_content[word_index])
|
||||
|
||||
if error:
|
||||
print('ERROR IN ' + word + ' : ' + lemma + ' : ' + msd)
|
||||
# print('ERROR IN ' + word + ' : ' + accentuated_content[word_index][0] + ' OR ' + msd + ' : '
|
||||
# + accentuated_content[word_index][2])
|
||||
|
@ -128,10 +136,9 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
|||
# print(etree.tostring(element, encoding="UTF-8"))
|
||||
# myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
||||
if word_glob_num > word_limit:
|
||||
print('Proccessed ' + str(word_glob_num) + ' words')
|
||||
# print('Proccessed ' + str(word_glob_num) + ' words')
|
||||
end_timer = time.time()
|
||||
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
|
||||
# print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
|
||||
word_limit += iter_num
|
||||
break
|
||||
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
||||
element.clear()
|
||||
|
|
Loading…
Reference in New Issue
Block a user