You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
2.5 KiB

import codecs
import shutil
import os
import tempfile
import obeliks
import classla
from classla import Document
from classla.models.common.conll import CoNLLFile
from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
# Keyword arguments handed to classla.Pipeline() by load_classla_models().
# 'models_dir' starts out as None and is filled in by initialise().
NLP_CONFIG_MAP = {
    'treebank': 'sl_ssj_jos',
    'processors': 'tokenize,pos,lemma,depparse',
    'tokenize_pretokenized': True,
    'models_dir': None,
}

# NOTE(review): not referenced in the visible part of this module.
XML_ID_PREFIX = 's'

# Working directory for all intermediate files; created as soon as the module
# is imported and removed again by cleanup().
tmp_directory = tempfile.mkdtemp()

# Module state populated later: resource_directory by initialise(),
# nlp by load_classla_models().
resource_directory = None
nlp = None
def __get_tmp_file_name(file_key):
    """Return the path of the temporary file registered under *file_key*.

    The file name is looked up in ``FILE_NAME_MAP`` (not defined in this
    module; presumably supplied by the ``structure_assignment.constants``
    star import) and appended to ``tmp_directory`` with a ``/`` separator.
    An unknown key raises ``KeyError``.
    """
    path_parts = (tmp_directory, FILE_NAME_MAP[file_key])
    return '/'.join(path_parts)
def initialise(**argument_map):
    """Store the resource directory and point the classla config at its models.

    Reads the ``resource_dir`` keyword argument (``KeyError`` if it is
    missing), saves it in the module-level ``resource_directory`` and sets
    ``NLP_CONFIG_MAP['models_dir']`` to ``<resource_dir>/classla``.
    """
    global resource_directory
    resource_dir = argument_map['resource_dir']
    resource_directory = resource_dir
    NLP_CONFIG_MAP['models_dir'] = resource_dir + '/classla'
def import_file(file_name, file_key):
    """Copy *file_name* into the temporary directory as the file for *file_key*."""
    destination = __get_tmp_file_name(file_key)
    shutil.copyfile(file_name, destination)
def do_tokenise():
    """Run obeliks on the 'strings-list' temp file.

    The output is requested in CoNLL-U form (``conllu=True``) and written to
    the 'obeliks-tokenised' temp file.
    """
    source_path = __get_tmp_file_name('strings-list')
    target_path = __get_tmp_file_name('obeliks-tokenised')
    obeliks.run(in_file=source_path, out_file=target_path, conllu=True)
def do_tweak_conllu():
    """Apply ``tweak_conllu`` to the 'obeliks-tokenised' temp file.

    The result is written to the 'obeliks-tweaked' temp file.
    """
    source_path = __get_tmp_file_name('obeliks-tokenised')
    target_path = __get_tmp_file_name('obeliks-tweaked')
    tweak_conllu(source_path, target_path)
def load_classla_models():
    """Build the classla pipeline and keep it in the module-level ``nlp``.

    The pipeline is created for language 'sl' with the settings currently
    held in ``NLP_CONFIG_MAP``.
    """
    global nlp
    pipeline = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    nlp = pipeline
def do_parse():
    """Run the loaded classla pipeline over the 'obeliks-tweaked' temp file.

    The input CoNLL file is attached to an empty ``Document`` and passed
    through the module-level ``nlp`` pipeline. The resulting CoNLL data is
    written to the 'classla-parsed' temp file.

    Raises:
        RuntimeError: if the pipeline has not been created yet, i.e.
            ``load_classla_models()`` was not called beforehand.
    """
    # Fail early with an actionable message instead of the opaque
    # "'NoneType' object is not callable" raised after the input was read.
    if nlp is None:
        raise RuntimeError(
            'classla pipeline is not loaded; call load_classla_models() first')

    input_file_name = __get_tmp_file_name('obeliks-tweaked')
    output_file_name = __get_tmp_file_name('classla-parsed')

    # The document carries no raw text; its content comes from the CoNLL file.
    doc = Document(text=None)
    doc.conll_file = CoNLLFile(filename=input_file_name)

    result = nlp(doc)
    result.conll_file.write_conll(output_file_name)
def do_translate_jos():
    """Apply ``translate_jos`` to the 'classla-parsed' temp file.

    Uses ``<resource_directory>/dict.xml`` as the dictionary file and writes
    the result to the 'classla-translated' temp file.
    """
    source_path = __get_tmp_file_name('classla-parsed')
    dictionary_path = resource_directory + '/dict.xml'
    target_path = __get_tmp_file_name('classla-translated')
    translate_jos(source_path, dictionary_path, target_path)
def do_conllu_to_tei():
    """Apply ``conllu_to_tei`` to the 'classla-translated' temp file.

    The result is written to the 'tei-initial' temp file.
    """
    source_path = __get_tmp_file_name('classla-translated')
    target_path = __get_tmp_file_name('tei-initial')
    conllu_to_tei(source_path, target_path)
def export_file(file_name, file_key):
    """Copy the temporary file registered under *file_key* out to *file_name*."""
    source = __get_tmp_file_name(file_key)
    shutil.copyfile(source, file_name)
def cleanup():
    """Remove the temporary working directory, ignoring any errors."""
    shutil.rmtree(tmp_directory, ignore_errors=True)