UTF-8 encoding fix

This commit is contained in:
Luka Dragar 2025-11-28 16:55:04 +01:00
parent f43ea39f1b
commit b711fae3b5
2 changed files with 6 additions and 7 deletions

View File

@@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
def convert_file(input_file_name, output_file_name): def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
root = construct_tei_etrees(input_file)[0] root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root) tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True) tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -332,7 +332,7 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
if args.out: if args.out:
f_out = open(args.out, 'w') f_out = open(args.out, 'w', encoding='utf-8')
else: else:
f_out = sys.stdout f_out = sys.stdout
@@ -341,7 +341,7 @@ if __name__ == '__main__':
for arg in args.files: for arg in args.files:
filelist = glob(arg) filelist = glob(arg)
for f in filelist: for f in filelist:
with open(f, 'r') as conllu_f: with open(f, 'r', encoding='utf-8') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f) tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees: for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode()) f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())

View File

@@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
""" """
import argparse import argparse
import codecs
import lxml.etree as lxml import lxml.etree as lxml
from importlib_resources import files from importlib_resources import files
@@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map(): def get_syn_map():
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml') dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
dict_file = codecs.open(dict_file_name, 'r') dict_file = open(dict_file_name, 'r', encoding='utf-8')
root = lxml.parse(dict_file).getroot() root = lxml.parse(dict_file).getroot()
dict_file.close() dict_file.close()
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')} return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
@@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map() syn_map = get_syn_map()
output_file = codecs.open(output_file_name, 'w') output_file = open(output_file_name, 'w', encoding='utf-8')
input_file = codecs.open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
converter = Converter() converter = Converter()