import codecs
import shutil
import os
import tempfile

import obeliks

import classla
from classla import Document
from classla.models.common.conll import CoNLLFile

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei

# Keyword arguments forwarded to classla.Pipeline by load_classla_models().
# 'models_dir' starts as None and is filled in by initialise().
NLP_CONFIG_MAP = {
    'treebank': 'sl_ssj_jos',
    'processors': 'tokenize,pos,lemma,depparse',
    'tokenize_pretokenized': True,
    'models_dir': None,
}

# NOTE(review): not referenced by any function below; presumably used by
# importers of this module -- confirm before removing.
XML_ID_PREFIX = 's'

# Scratch directory holding every intermediate file; created at import time
# and removed by cleanup().
tmp_directory = tempfile.mkdtemp()

# Set by initialise(); read by do_translate_jos().
resource_directory = None

# classla pipeline instance; set by load_classla_models().
nlp = None


def __get_tmp_file_name(file_key):
    """Return the scratch-directory path for the file registered as *file_key*.

    The file name is looked up in FILE_NAME_MAP (supplied by the wildcard
    import from structure_assignment.constants); an unknown key raises
    KeyError.
    """
    return tmp_directory + '/' + FILE_NAME_MAP[file_key]


def initialise(**argument_map):
    """Record the resource directory and point classla at its models.

    Requires the keyword argument 'resource_dir' (KeyError if missing).
    Mutates the module-level NLP_CONFIG_MAP so that a later
    load_classla_models() uses '<resource_dir>/classla' as models_dir.
    """
    global resource_directory
    resource_directory = argument_map['resource_dir']
    NLP_CONFIG_MAP['models_dir'] = resource_directory + '/classla'


def import_file(file_name, file_key):
    """Copy the external file *file_name* into the scratch slot *file_key*."""
    shutil.copyfile(file_name, __get_tmp_file_name(file_key))


def do_tokenise():
    """Run obeliks on 'strings-list', writing 'obeliks-tokenised'.

    obeliks is invoked with conllu=True.
    """
    input_file_name = __get_tmp_file_name('strings-list')
    output_file_name = __get_tmp_file_name('obeliks-tokenised')
    obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)


def do_tweak_conllu():
    """Apply tweak_conllu to 'obeliks-tokenised', writing 'obeliks-tweaked'."""
    input_file_name = __get_tmp_file_name('obeliks-tokenised')
    output_file_name = __get_tmp_file_name('obeliks-tweaked')
    tweak_conllu(input_file_name, output_file_name)


def load_classla_models():
    """Build the classla pipeline from NLP_CONFIG_MAP and store it in `nlp`."""
    global nlp
    nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)


def do_parse():
    """Run the classla pipeline on 'obeliks-tweaked', writing 'classla-parsed'.

    If the pipeline has not been created yet it is loaded on demand, so that
    calling this step before load_classla_models() no longer fails with
    "'NoneType' object is not callable". Callers that load the models
    explicitly beforehand are unaffected.
    """
    if nlp is None:
        load_classla_models()

    input_file_name = __get_tmp_file_name('obeliks-tweaked')
    output_file_name = __get_tmp_file_name('classla-parsed')

    # The document carries no raw text; its content comes from the CoNLL file
    # attached below.
    doc = Document(text=None)
    doc.conll_file = CoNLLFile(filename=input_file_name)

    result = nlp(doc)
    result.conll_file.write_conll(output_file_name)


def do_translate_jos():
    """Run translate_jos on 'classla-parsed', writing 'classla-translated'.

    Uses '<resource_directory>/dict.xml' as the dictionary, so initialise()
    must have been called first.
    """
    input_file_name = __get_tmp_file_name('classla-parsed')
    dictionary_file_name = resource_directory + '/dict.xml'
    output_file_name = __get_tmp_file_name('classla-translated')
    translate_jos(input_file_name, dictionary_file_name, output_file_name)


def do_conllu_to_tei():
    """Run conllu_to_tei on 'classla-translated', writing 'tei-initial'."""
    input_file_name = __get_tmp_file_name('classla-translated')
    output_file_name = __get_tmp_file_name('tei-initial')
    conllu_to_tei(input_file_name, output_file_name)


def export_file(file_name, file_key):
    """Copy the scratch file registered as *file_key* out to *file_name*."""
    shutil.copyfile(__get_tmp_file_name(file_key), file_name)


def cleanup():
    """Delete the scratch directory and its contents, ignoring any errors."""
    shutil.rmtree(tmp_directory, ignore_errors=True)