UTF-8 encoding fix

This commit is contained in:
Luka Dragar 2025-11-28 16:55:04 +01:00
parent f43ea39f1b
commit b711fae3b5
2 changed files with 6 additions and 7 deletions

View File

@@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r')
input_file = open(input_file_name, 'r', encoding='utf-8')
root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -332,7 +332,7 @@ if __name__ == '__main__':
args = parser.parse_args()
if args.out:
f_out = open(args.out, 'w')
f_out = open(args.out, 'w', encoding='utf-8')
else:
f_out = sys.stdout
@@ -341,7 +341,7 @@ if __name__ == '__main__':
for arg in args.files:
filelist = glob(arg)
for f in filelist:
with open(f, 'r') as conllu_f:
with open(f, 'r', encoding='utf-8') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())

View File

@@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
"""
import argparse
import codecs
import lxml.etree as lxml
from importlib_resources import files
@@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map():
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
dict_file = codecs.open(dict_file_name, 'r')
dict_file = open(dict_file_name, 'r', encoding='utf-8')
root = lxml.parse(dict_file).getroot()
dict_file.close()
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
@@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map()
output_file = codecs.open(output_file_name, 'w')
input_file = codecs.open(input_file_name, 'r')
output_file = open(output_file_name, 'w', encoding='utf-8')
input_file = open(input_file_name, 'r', encoding='utf-8')
converter = Converter()