Added tab2xml conversion modifications
This commit is contained in:
parent
524ceeb4b6
commit
43a7866636
|
@ -30,8 +30,8 @@ from prepare_data import *
|
||||||
#
|
#
|
||||||
# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
|
# gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')
|
||||||
word_glob_num = 0
|
word_glob_num = 0
|
||||||
word_limit = 1000
|
word_limit = 50000
|
||||||
iter_num = 1000
|
iter_num = 50000
|
||||||
word_index = 0
|
word_index = 0
|
||||||
|
|
||||||
# iter_index = 0
|
# iter_index = 0
|
||||||
|
@ -107,7 +107,15 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
||||||
break
|
break
|
||||||
word_index = (word_index + 1) % len(accentuated_content)
|
word_index = (word_index + 1) % len(accentuated_content)
|
||||||
|
|
||||||
if word_index == word_index_sp:
|
error = word_index == word_index_sp
|
||||||
|
if word_index == word_index_sp and word == accentuated_content[word_index][0] and msd == accentuated_content[word_index][2] \
|
||||||
|
and lemma == accentuated_content[word_index][1]:
|
||||||
|
accentuated_word_location = accentuated_content[word_index][4]
|
||||||
|
accentuated_word = accentuated_content[word_index][5][:-1]
|
||||||
|
error = False
|
||||||
|
del(accentuated_content[word_index])
|
||||||
|
|
||||||
|
if error:
|
||||||
print('ERROR IN ' + word + ' : ' + lemma + ' : ' + msd)
|
print('ERROR IN ' + word + ' : ' + lemma + ' : ' + msd)
|
||||||
# print('ERROR IN ' + word + ' : ' + accentuated_content[word_index][0] + ' OR ' + msd + ' : '
|
# print('ERROR IN ' + word + ' : ' + accentuated_content[word_index][0] + ' OR ' + msd + ' : '
|
||||||
# + accentuated_content[word_index][2])
|
# + accentuated_content[word_index][2])
|
||||||
|
@ -128,10 +136,9 @@ with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile:
|
||||||
# print(etree.tostring(element, encoding="UTF-8"))
|
# print(etree.tostring(element, encoding="UTF-8"))
|
||||||
# myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
# myfile2.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
||||||
if word_glob_num > word_limit:
|
if word_glob_num > word_limit:
|
||||||
print('Proccessed ' + str(word_glob_num) + ' words')
|
# print('Proccessed ' + str(word_glob_num) + ' words')
|
||||||
end_timer = time.time()
|
end_timer = time.time()
|
||||||
print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
|
# print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer) / 60.0) + " minutes")
|
||||||
word_limit += iter_num
|
word_limit += iter_num
|
||||||
break
|
|
||||||
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
myfile.write(etree.tostring(element, encoding="UTF-8", pretty_print=True))
|
||||||
element.clear()
|
element.clear()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user