IssueID #1835: made encoding-related improvements

This commit is contained in:
Cyprian Laskowski 2021-06-29 17:44:46 +02:00
parent c43c90352d
commit 86e2b12782
2 changed files with 12 additions and 5 deletions

View File

@@ -1,3 +1,4 @@
import codecs
import os
import shutil
import tempfile
@@ -24,8 +25,9 @@ def strings_to_parse():
if (request.method == 'GET'):
string = request.args.get('string')
with open(string_file_name, 'w') as string_file:
string_file.write(string + '\n')
string_file = codecs.open(string_file_name, 'w', 'UTF-8')
string_file.write(string + '\n')
string_file.close()
elif (request.method == 'POST'):
file_data = request.files['file']
file_data.save(string_file_name)
@@ -33,7 +35,7 @@ def strings_to_parse():
try:
runner.strings_to_parse(string_file_name, parsed_file_name)
root = lxml.parse(parsed_file_name).getroot()
message = lxml.tostring(root, encoding='UTF-8', pretty_print=True).decode()
message = lxml.tostring(root, encoding='UTF-8', pretty_print=True)
shutil.rmtree(tmp_directory)
except Exception as e:
message = '<error>' + str(e) + '</error>'

View File

@@ -4,7 +4,6 @@ import tempfile
from types import SimpleNamespace
import lxml.etree as lxml
import obeliks
import classla
from structure_assignment.constants import *
@@ -105,6 +104,7 @@ class Pipeline:
import sys
sys.path.insert(0, self.tmp_directory)
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
self.classla_directory = resource_directory + '/classla'
def import_file(self, file_name, file_key):
shutil.copyfile(file_name, self.file_map[file_key])
@@ -113,7 +113,12 @@ class Pipeline:
print('Tokenising with obeliks ...')
input_file_name = self.file_map['strings-list']
output_file_name = self.file_map['obeliks-tokenised']
obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
with open(input_file_name, 'r') as input_file:
input_conllu = input_file.read()
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
output_conllu = tokeniser(input_conllu).to_conll()
with open(output_file_name, 'w') as output_file:
output_file.write(output_conllu)
def do_tweak_conllu(self):
print('Tweaking conllu ...')