UTF-8 encoding fix
This commit is contained in:
parent
f43ea39f1b
commit
b711fae3b5
@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
|
|||||||
|
|
||||||
|
|
||||||
def convert_file(input_file_name, output_file_name):
|
def convert_file(input_file_name, output_file_name):
|
||||||
input_file = open(input_file_name, 'r')
|
input_file = open(input_file_name, 'r', encoding='utf-8')
|
||||||
root = construct_tei_etrees(input_file)[0]
|
root = construct_tei_etrees(input_file)[0]
|
||||||
tree = etree.ElementTree(root)
|
tree = etree.ElementTree(root)
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
@ -332,7 +332,7 @@ if __name__ == '__main__':
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.out:
|
if args.out:
|
||||||
f_out = open(args.out, 'w')
|
f_out = open(args.out, 'w', encoding='utf-8')
|
||||||
else:
|
else:
|
||||||
f_out = sys.stdout
|
f_out = sys.stdout
|
||||||
|
|
||||||
@ -341,7 +341,7 @@ if __name__ == '__main__':
|
|||||||
for arg in args.files:
|
for arg in args.files:
|
||||||
filelist = glob(arg)
|
filelist = glob(arg)
|
||||||
for f in filelist:
|
for f in filelist:
|
||||||
with open(f, 'r') as conllu_f:
|
with open(f, 'r', encoding='utf-8') as conllu_f:
|
||||||
tei_etrees = construct_tei_etrees(conllu_f)
|
tei_etrees = construct_tei_etrees(conllu_f)
|
||||||
for tei_etree in tei_etrees:
|
for tei_etree in tei_etrees:
|
||||||
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
|
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
|
||||||
|
|||||||
@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import codecs
|
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
from importlib_resources import files
|
from importlib_resources import files
|
||||||
|
|
||||||
@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
|
|||||||
|
|
||||||
def get_syn_map():
|
def get_syn_map():
|
||||||
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
||||||
dict_file = codecs.open(dict_file_name, 'r')
|
dict_file = open(dict_file_name, 'r', encoding='utf-8')
|
||||||
root = lxml.parse(dict_file).getroot()
|
root = lxml.parse(dict_file).getroot()
|
||||||
dict_file.close()
|
dict_file.close()
|
||||||
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
||||||
@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
|
|||||||
|
|
||||||
syn_map = get_syn_map()
|
syn_map = get_syn_map()
|
||||||
|
|
||||||
output_file = codecs.open(output_file_name, 'w')
|
output_file = open(output_file_name, 'w', encoding='utf-8')
|
||||||
input_file = codecs.open(input_file_name, 'r')
|
input_file = open(input_file_name, 'r', encoding='utf-8')
|
||||||
|
|
||||||
converter = Converter()
|
converter = Converter()
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user