|
|
|
@@ -4,7 +4,6 @@ import tempfile
|
|
|
|
|
from types import SimpleNamespace
|
|
|
|
|
import lxml.etree as lxml
|
|
|
|
|
|
|
|
|
|
import obeliks
|
|
|
|
|
import classla
|
|
|
|
|
|
|
|
|
|
from structure_assignment.constants import *
|
|
|
|
@@ -105,6 +104,7 @@ class Pipeline:
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.insert(0, self.tmp_directory)
|
|
|
|
|
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
|
|
|
|
|
self.classla_directory = resource_directory + '/classla'
|
|
|
|
|
|
|
|
|
|
def import_file(self, file_name, file_key):
    """Stage an external file into the pipeline's working directory.

    Copies *file_name* to the temp-directory path registered for
    *file_key* in ``self.file_map`` (built in ``__init__`` from FILE_MAP),
    so later pipeline stages can find it under the canonical name.
    """
    destination = self.file_map[file_key]
    shutil.copyfile(file_name, destination)
|
|
|
|
@@ -113,7 +113,12 @@ class Pipeline:
|
|
|
|
|
print('Tokenising with obeliks ...')
|
|
|
|
|
input_file_name = self.file_map['strings-list']
|
|
|
|
|
output_file_name = self.file_map['obeliks-tokenised']
|
|
|
|
|
obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
|
|
|
|
|
with open(input_file_name, 'r') as input_file:
|
|
|
|
|
input_conllu = input_file.read()
|
|
|
|
|
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
|
|
|
|
|
output_conllu = tokeniser(input_conllu).to_conll()
|
|
|
|
|
with open(output_file_name, 'w') as output_file:
|
|
|
|
|
output_file.write(output_conllu)
|
|
|
|
|
|
|
|
|
|
def do_tweak_conllu(self):
|
|
|
|
|
print('Tweaking conllu ...')
|
|
|
|
|