diff --git a/package/structure_assignment/api.py b/package/structure_assignment/api.py
index 58686b1..7b1e505 100644
--- a/package/structure_assignment/api.py
+++ b/package/structure_assignment/api.py
@@ -1,3 +1,4 @@
+import codecs
 import os
 import shutil
 import tempfile
@@ -24,8 +25,9 @@ def strings_to_parse():
 
     if (request.method == 'GET'):
         string = request.args.get('string')
-        with open(string_file_name, 'w') as string_file:
-            string_file.write(string + '\n')
+        string_file = codecs.open(string_file_name, 'w', 'UTF-8')
+        string_file.write(string + '\n')
+        string_file.close()
     elif (request.method == 'POST'):
         file_data = request.files['file']
         file_data.save(string_file_name)
@@ -33,7 +35,7 @@ def strings_to_parse():
     try:
         runner.strings_to_parse(string_file_name, parsed_file_name)
         root = lxml.parse(parsed_file_name).getroot()
-        message = lxml.tostring(root, encoding='UTF-8', pretty_print=True).decode()
+        message = lxml.tostring(root, encoding='UTF-8', pretty_print=True)
         shutil.rmtree(tmp_directory)
     except Exception as e:
         message = '' + str(e) + ''
diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index f8b96c0..fa08597 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -4,7 +4,6 @@ import tempfile
 from types import SimpleNamespace
 
 import lxml.etree as lxml
-import obeliks
 import classla
 
 from structure_assignment.constants import *
@@ -105,6 +104,7 @@ class Pipeline:
         import sys
         sys.path.insert(0, self.tmp_directory)
         self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
+        self.classla_directory = resource_directory + '/classla'
 
     def import_file(self, file_name, file_key):
         shutil.copyfile(file_name, self.file_map[file_key])
@@ -113,7 +113,12 @@
         print('Tokenising with obeliks ...')
         input_file_name = self.file_map['strings-list']
         output_file_name = self.file_map['obeliks-tokenised']
-        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
+        with open(input_file_name, 'r') as input_file:
+            input_conllu = input_file.read()
+        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
+        output_conllu = tokeniser(input_conllu).to_conll()
+        with open(output_file_name, 'w') as output_file:
+            output_file.write(output_conllu)
 
     def do_tweak_conllu(self):
         print('Tweaking conllu ...')
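
Note for reviewers: the tokenisation step now goes through classla's tokenize processor instead of obeliks. A minimal sketch of the new path in isolation, assuming the Slovenian models are downloaded into a local directory first (the '/tmp/classla' path and the sample sentence are placeholders, not values from this repository):

import classla

# One-off download of the Slovenian models into the given directory
# (the pipeline itself points at resource_directory + '/classla').
classla.download('sl', dir='/tmp/classla')

# Build a tokenise-only pipeline: no tagging, lemmatisation, or parsing is run.
tokeniser = classla.Pipeline('sl', processors='tokenize', dir='/tmp/classla')

# do_tokenise() above reads the whole strings file into memory and passes it
# in as one raw-text document; the result is serialised back to CoNLL-U.
doc = tokeniser('To je poskusni stavek.')
print(doc.to_conll())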