Review note: the hunks below are the original change with line breaks restored
and four defects fixed in the added lines.
 1. Pipeline.do_parse called the removed module global `nlp`; it now uses self.nlp.
 2. Pipeline.do_translate_jos read the removed module global `resource_directory`;
    the directory is now passed to Pipeline() and stored on the instance.
 3. create_nlp wrote into the shared NLP_CONFIG_MAP constant; it now fills in a
    deepcopy, which also puts the newly added `deepcopy` import to use.
 4. scripts/pipeline1.py called run_pipeline() without its new `nlp` argument.
Indentation was reconstructed from the collapsed text; the `index` hashes are
those of the original patch and do not describe the corrected post-images.

diff --git a/package/structure_assignment/api.py b/package/structure_assignment/api.py
index 5fe6381..453804d 100644
--- a/package/structure_assignment/api.py
+++ b/package/structure_assignment/api.py
@@ -5,23 +5,25 @@ import lxml.etree as lxml
 from flask import Flask, Response
 from flask_httpauth import HTTPBasicAuth
 
-import structure_assignment.pipeline as pipeline
+from structure_assignment.pipeline import Pipeline, create_nlp
 
 app = Flask(__name__)
 api_prefix = os.environ['API_PREFIX']
 resource_directory = os.environ['API_RESOURCE_DIR']
-pipeline.initialise(resource_dir=resource_directory)
-pipeline.load_classla_models()
+
+nlp = create_nlp(resource_directory)
 
 @app.route(api_prefix + '/test/', methods=['GET'])
 def test(string):
 
     string_file_name = '/tmp/string.txt'
     parse_file_name = '/tmp/parse.xml'
+
     with open(string_file_name, 'w') as string_file:
         string_file.write(string + '\n')
 
     try:
+        pipeline = Pipeline(nlp, resource_directory)
         pipeline.import_file(string_file_name, 'strings-list')
         pipeline.do_tokenise()
         pipeline.do_tweak_conllu()
@@ -29,7 +31,7 @@ def test(string):
         pipeline.do_translate_jos()
         pipeline.do_conllu_to_tei()
         pipeline.export_file(parse_file_name, 'tei-initial')
-        # pipeline.cleanup()
+        pipeline.cleanup()
         tei = lxml.parse(parse_file_name).getroot()
         message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
         ok = True
diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py
index 247a52d..41aa8b8 100644
--- a/package/structure_assignment/constants.py
+++ b/package/structure_assignment/constants.py
@@ -14,22 +14,29 @@ STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
 DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
 
 # temporary outputs
-FILE_NAME_MAP = {'strings-list': 'strings.txt',
-                 'obeliks-tokenised': 'obeliks_raw.conllu',
-                 'obeliks-tweaked': 'obeliks_tweaked.conllu',
-                 'classla-parsed': 'classla_raw.conllu',
-                 'classla-translated': 'classla_translated.conllu',
-                 'tei-initial': 'tei_initial.xml',
-                 'tei-single': 'tei_single.xml',
-                 'tei-single-ids': 'tei_single_with_ids.xml',
-                 'tei-multiple': 'tei_multiple.xml',
-                 'tei-multiple-ids-1': 'tei_multiple_with_ids1.xml',
-                 'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
-                 'mwes-1': 'mwes1.csv',
-                 'mwes-2': 'mwes2.csv',
-                 'structures-old': 'structures_old.xml',
-                 'structures-new': 'structures_new.xml',
-                 'dictionary-single': 'dictionary_single.xml',
-                 'dictionary-multiple': 'dictionary_multiple.xml',
-                 'dictionary': 'dictionary.xml'
+FILE_MAP = {'strings-list': 'strings.txt',
+            'obeliks-tokenised': 'obeliks_raw.conllu',
+            'obeliks-tweaked': 'obeliks_tweaked.conllu',
+            'classla-parsed': 'classla_raw.conllu',
+            'classla-translated': 'classla_translated.conllu',
+            'tei-initial': 'tei_initial.xml',
+            'tei-single': 'tei_single.xml',
+            'tei-single-ids': 'tei_single_with_ids.xml',
+            'tei-multiple': 'tei_multiple.xml',
+            'tei-multiple-ids-1': 'tei_multiple_with_ids1.xml',
+            'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
+            'mwes-1': 'mwes1.csv',
+            'mwes-2': 'mwes2.csv',
+            'structures-old': 'structures_old.xml',
+            'structures-new': 'structures_new.xml',
+            'dictionary-single': 'dictionary_single.xml',
+            'dictionary-multiple': 'dictionary_multiple.xml',
+            'dictionary': 'dictionary.xml'
+}
+
+NLP_CONFIG_MAP = {
+    'type': 'standard_jos',
+    'processors': 'tokenize,pos,lemma,depparse',
+    'tokenize_pretokenized': True,
+    'pos_use_lexicon': True,
 }
diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index 6088c4b..69a739e 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -2,6 +2,7 @@ import codecs
 import shutil
 import os
 import tempfile
+from copy import deepcopy
 
 import obeliks
 
@@ -14,67 +15,54 @@ from structure_assignment.tweak_conllu import tweak as tweak_conllu
 from nova_slovnica.translate_jos import translate as translate_jos
 from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
 
-NLP_CONFIG_MAP = {
-    'type': 'standard_jos',
-    'processors': 'tokenize,pos,lemma,depparse',
-    'tokenize_pretokenized': True,
-    'pos_use_lexicon': True,
-    'models_dir': None
-}
-
-XML_ID_PREFIX = 's'
-
-tmp_directory = tempfile.mkdtemp()
-resource_directory = None
-nlp = None
-
-def __get_tmp_file_name(file_key):
-    return tmp_directory + '/' + FILE_NAME_MAP[file_key]
-
-def initialise(**argument_map):
-    global resource_directory
-    resource_directory = argument_map['resource_dir']
-    NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
+def create_nlp(resource_directory):
+    nlp_config_map = deepcopy(NLP_CONFIG_MAP)
+    nlp_config_map['dir'] = resource_directory + '/classla'
+    return classla.Pipeline('sl', **nlp_config_map)
 
-def import_file(file_name, file_key):
-    shutil.copyfile(file_name, __get_tmp_file_name(file_key))
+class Pipeline:
 
-def do_tokenise():
-    input_file_name = __get_tmp_file_name('strings-list')
-    output_file_name = __get_tmp_file_name('obeliks-tokenised')
-    obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
+    def __init__(self, nlp, resource_directory):
+        self.nlp = nlp
+        self.resource_directory = resource_directory
+        self.tmp_directory = tempfile.mkdtemp()
+        self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
 
-def do_tweak_conllu():
-    input_file_name = __get_tmp_file_name('obeliks-tokenised')
-    output_file_name = __get_tmp_file_name('obeliks-tweaked')
-    tweak_conllu(input_file_name, output_file_name)
+    def import_file(self, file_name, file_key):
+        shutil.copyfile(file_name, self.file_map[file_key])
 
-def load_classla_models():
-    global nlp
-    nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+    def do_tokenise(self):
+        input_file_name = self.file_map['strings-list']
+        output_file_name = self.file_map['obeliks-tokenised']
+        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
 
-def do_parse():
-    input_file_name = __get_tmp_file_name('obeliks-tweaked')
-    output_file_name = __get_tmp_file_name('classla-parsed')
-    doc = Document(text=None)
-    conll_file = CoNLLFile(filename=input_file_name)
-    doc.conll_file = conll_file
-    result = nlp(doc)
-    result.conll_file.write_conll(output_file_name)
+    def do_tweak_conllu(self):
+        input_file_name = self.file_map['obeliks-tokenised']
+        output_file_name = self.file_map['obeliks-tweaked']
+        tweak_conllu(input_file_name, output_file_name)
 
-def do_translate_jos():
-    input_file_name = __get_tmp_file_name('classla-parsed')
-    dictionary_file_name = resource_directory + '/dict.xml'
-    output_file_name = __get_tmp_file_name('classla-translated')
-    translate_jos(input_file_name, dictionary_file_name, output_file_name)
+    def do_parse(self):
+        input_file_name = self.file_map['obeliks-tweaked']
+        output_file_name = self.file_map['classla-parsed']
+        doc = Document(text=None)
+        conll_file = CoNLLFile(filename=input_file_name)
+        doc.conll_file = conll_file
+        result = self.nlp(doc)
+        result.conll_file.write_conll(output_file_name)
 
-def do_conllu_to_tei():
-    input_file_name = __get_tmp_file_name('classla-translated')
-    output_file_name = __get_tmp_file_name('tei-initial')
-    conllu_to_tei(input_file_name, output_file_name)
+    def do_translate_jos(self):
+        input_file_name = self.file_map['classla-parsed']
+        dictionary_file_name = self.resource_directory + '/dict.xml'
+        output_file_name = self.file_map['classla-translated']
+        translate_jos(input_file_name, dictionary_file_name, output_file_name)
 
-def export_file(file_name, file_key):
-    shutil.copyfile(__get_tmp_file_name(file_key), file_name)
+    def do_conllu_to_tei(self):
+        input_file_name = self.file_map['classla-translated']
+        output_file_name = self.file_map['tei-initial']
+        conllu_to_tei(input_file_name, output_file_name)
 
-def cleanup():
-    shutil.rmtree(tmp_directory, True)
+    def export_file(self, file_name, file_key):
+        shutil.copyfile(self.file_map[file_key], file_name)
+
+    def cleanup(self):
+        shutil.rmtree(self.tmp_directory, True)
diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py
index 79e6d86..31e11a3 100644
--- a/scripts/pipeline1.py
+++ b/scripts/pipeline1.py
@@ -1,28 +1,26 @@
 import argparse
 
-import classla
-from classla import Document
-from classla.models.common.conll import CoNLLFile
+from structure_assignment.pipeline import Pipeline, create_nlp
 
-import structure_assignment.pipeline as pipeline
-
-arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.')
-arg_parser.add_argument('-inlist', type=str, help='Input list file')
-arg_parser.add_argument('-outtei', type=str, help='Output TEI file')
-arguments = arg_parser.parse_args()
-input_file_name = arguments.inlist
-output_file_name = arguments.outtei
-
-def run_pipeline(input_file_name, output_file_name):
-    pipeline.initialise(temp_dir='/tmp/structure_assignment_pipeline1', resource_dir='../resources')
+def run_pipeline(nlp, input_file_name, output_file_name):
+    pipeline = Pipeline(nlp, '../resources')
     pipeline.import_file(input_file_name, 'strings-list')
     pipeline.do_tokenise()
     pipeline.do_tweak_conllu()
-    pipeline.load_classla_models()
     pipeline.do_parse()
     pipeline.do_translate_jos()
     pipeline.do_conllu_to_tei()
     pipeline.export_file(output_file_name, 'tei-initial')
+    pipeline.cleanup()
 
 if (__name__ == '__main__'):
-    run_pipeline(input_file_name, output_file_name)
+
+    arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.')
+    arg_parser.add_argument('-inlist', type=str, help='Input list file')
+    arg_parser.add_argument('-outtei', type=str, help='Output TEI file')
+    arguments = arg_parser.parse_args()
+    input_file_name = arguments.inlist
+    output_file_name = arguments.outtei
+
+    nlp = create_nlp('../resources')
+    run_pipeline(nlp, input_file_name, output_file_name)