From 5f25682036afdcd1d4b56bb8075926c27a9dcf1a Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski
Date: Mon, 1 Mar 2021 15:00:25 +0100
Subject: [PATCH] IssueID #1835: moved pipeline1 components into package

---
 package/structure_assignment/api.py          | 33 +++++++--
 package/structure_assignment/constants.py    | 36 ++++++++++
 package/structure_assignment/pipeline.py     | 74 ++++++++++++++++++++
 package/structure_assignment/tweak_conllu.py | 54 ++++++++++++++
 scripts/constants.py                         | 41 -----------
 scripts/pipeline1.py                         | 67 ++++--------------
 scripts/tweak_conllu.py                      | 47 -------------
 7 files changed, 205 insertions(+), 147 deletions(-)
 create mode 100644 package/structure_assignment/constants.py
 create mode 100644 package/structure_assignment/pipeline.py
 create mode 100644 package/structure_assignment/tweak_conllu.py
 delete mode 100644 scripts/constants.py
 delete mode 100644 scripts/tweak_conllu.py

diff --git a/package/structure_assignment/api.py b/package/structure_assignment/api.py
index 98ac595..66de9fc 100644
--- a/package/structure_assignment/api.py
+++ b/package/structure_assignment/api.py
@@ -1,15 +1,40 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
 import os
+
+import lxml.etree as lxml
 
 from flask import Flask, jsonify, make_response, request
 from flask_httpauth import HTTPBasicAuth
 
+import structure_assignment.pipeline as pipeline
+
 app = Flask(__name__)
 api_prefix = os.environ['API_PREFIX']
+resource_directory = os.environ['API_RESOURCE_DIR']
+tmp_directory = os.environ['API_TMP']
 
 @app.route(api_prefix + '/test/<string>', methods=['GET'])
 def test(string):
-    results = {'input':string}
+
+    string_file_name = '/tmp/string.txt'
+    parse_file_name = '/tmp/parse.xml'
+    with open(string_file_name, 'w') as string_file:
+        string_file.write(string + '\n')
+
+    try:
+        pipeline.initialise(temp_dir=tmp_directory, resource_dir=resource_directory)
+        pipeline.import_string_file(string_file_name)
+        pipeline.do_tokenise()
+        pipeline.do_tweak_conllu()
+        pipeline.do_parse()
+        pipeline.do_translate_jos()
+        pipeline.do_conllu_to_tei()
+        pipeline.export_parsed_file(parse_file_name)
+        tei = lxml.parse(parse_file_name).getroot()
+        message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
+        ok = True
+    except Exception as e:
+        message = str(e)
+        ok = False
+
+    results = {'ok':ok, 'message':message}
     return jsonify(results)
diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py
new file mode 100644
index 0000000..6b0df50
--- /dev/null
+++ b/package/structure_assignment/constants.py
@@ -0,0 +1,36 @@
+# scripts
+TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
+CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
+TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
+CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
+MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
+STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
+STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
+STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
+TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
+DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
+
+# resources
+TRANSLATION_FILE_NAME = '../resources/dict.xml'
+STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
+DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
+
+# temporary outputs
+STRING_LIST_FILE_NAME = 'strings.txt'
+OBELIKS_RAW_FILE_NAME = 'obeliks_raw.conllu'
+OBELIKS_TWEAKED_FILE_NAME = 'obeliks_tweaked.conllu'
+CLASSLA_OUTPUT_FILE_NAME = 'classla_raw.conllu'
+CLASSLA_TRANSLATED_FILE_NAME = 'classla_translated.conllu'
+TEI_INIT_FILE_NAME = 'tei_initial.xml'
+TEI_SINGLE_FILE_NAME = 'tei_single.xml'
+TEI_SINGLE_STRUCTURE_FILE_NAME = 'tei_single_with_ids.xml'
+TEI_MULTIPLE_FILE_NAME = 'tei_multiple.xml'
+TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = 'tei_multiple_with_ids1.xml'
+TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = 'tei_multiple_with_ids2.xml'
+MWE_CSV_1_FILE_NAME = 'mwes1.csv'
+MWE_CSV_2_FILE_NAME = 'mwes2.csv'
+STRUCTURE_OLD_FILE_NAME = 'structures_old.xml'
+STRUCTURE_NEW_FILE_NAME = 'structures_new.xml'
+DICTIONARY_SINGLE_FILE_NAME = 'dictionary_single.xml'
+DICTIONARY_MULTIPLE_FILE_NAME = 'dictionary_multiple.xml'
+DICTIONARY_FILE_NAME = 'dictionary.xml'
diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
new file mode 100644
index 0000000..762c1db
--- /dev/null
+++ b/package/structure_assignment/pipeline.py
@@ -0,0 +1,74 @@
+import codecs
+import shutil
+import os
+
+import obeliks
+
+import classla
+from classla import Document
+from classla.models.common.conll import CoNLLFile
+
+from structure_assignment.constants import *
+from structure_assignment.tweak_conllu import tweak as tweak_conllu
+from nova_slovnica.translate_jos import translate as translate_jos
+from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
+
+NLP_CONFIG_MAP = {
+    'treebank': 'sl_ssj_jos',
+    'processors': 'tokenize,pos,lemma,depparse',
+    'tokenize_pretokenized': True,
+    'models_dir': None
+}
+
+XML_ID_PREFIX = 's'
+
+resource_directory = None
+tmp_directory = None
+
+def __get_tmp_file_name(file_name):
+    return tmp_directory + '/' + file_name
+
+def initialise(**argument_map):
+    global tmp_directory, resource_directory
+    tmp_directory = argument_map['temp_dir']
+    resource_directory = argument_map['resource_dir']
+    shutil.rmtree(tmp_directory, True)
+    os.makedirs(tmp_directory, exist_ok=True)
+
+def import_string_file(file_name):
+    shutil.copyfile(file_name, __get_tmp_file_name(STRING_LIST_FILE_NAME))
+
+def do_tokenise():
+    input_file_name = __get_tmp_file_name(STRING_LIST_FILE_NAME)
+    output_file_name = __get_tmp_file_name(OBELIKS_RAW_FILE_NAME)
+    obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
+
+def do_tweak_conllu():
+    input_file_name = __get_tmp_file_name(OBELIKS_RAW_FILE_NAME)
+    output_file_name = __get_tmp_file_name(OBELIKS_TWEAKED_FILE_NAME)
+    tweak_conllu(input_file_name, output_file_name)
+
+def do_parse():
+    input_file_name = __get_tmp_file_name(OBELIKS_TWEAKED_FILE_NAME)
+    output_file_name = __get_tmp_file_name(CLASSLA_OUTPUT_FILE_NAME)
+    doc = Document(text=None)
+    conll_file = CoNLLFile(filename=input_file_name)
+    doc.conll_file = conll_file
+    NLP_CONFIG_MAP['models_dir'] = resource_directory + '/classla'
+    nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+    result = nlp(doc)
+    result.conll_file.write_conll(output_file_name)
+
+def do_translate_jos():
+    input_file_name = __get_tmp_file_name(CLASSLA_OUTPUT_FILE_NAME)
+    dictionary_file_name = resource_directory + '/dict.xml'
+    output_file_name = __get_tmp_file_name(CLASSLA_TRANSLATED_FILE_NAME)
+    translate_jos(input_file_name, dictionary_file_name, output_file_name)
+
+def do_conllu_to_tei():
+    input_file_name = __get_tmp_file_name(CLASSLA_TRANSLATED_FILE_NAME)
+    output_file_name = __get_tmp_file_name(TEI_INIT_FILE_NAME)
+    conllu_to_tei(input_file_name, output_file_name)
+
+def export_parsed_file(file_name):
+    shutil.copyfile(__get_tmp_file_name(TEI_INIT_FILE_NAME), file_name)
diff --git a/package/structure_assignment/tweak_conllu.py b/package/structure_assignment/tweak_conllu.py
new file mode 100644
index 0000000..0593146
--- /dev/null
+++ b/package/structure_assignment/tweak_conllu.py
@@ -0,0 +1,54 @@
+import argparse
+import codecs
+import re
+
+def write(output_file, line):
+    output_file.write(line + '\n')
+
+def write_paragraph(output_file, output_map):
+    if (output_map is not None):
+        write(output_file, output_map['paragraph'])
+        write(output_file, output_map['sentence'])
+        write(output_file, '# text = ' + ' '.join(output_map['texts']))
+        for (index, token_line) in enumerate(output_map['tokens'], start=1):
+            write(output_file, '\t'.join([str(index)] + token_line.split('\t')[1:]))
+        write(output_file, '')
+
+def tweak(input_file_name, output_file_name):
+
+    output_file = codecs.open(output_file_name, 'w')
+    input_file = codecs.open(input_file_name, 'r')
+
+    output_map = None
+    for line in input_file:
+        if (line[0].isdigit()):
+            output_map['tokens'].append(line.strip())
+        else:
+            match = re.search(r'^# (.+?) = (.+)$', line)
+            if (match):
+                (name, value) = match.groups()
+                if (name == 'newpar id'):
+                    write_paragraph(output_file, output_map)
+                    paragraph_line = re.sub(r'^(# newpar id = )(\d+)$', r'\1p\2', line.strip())
+                    output_map = {'paragraph': paragraph_line, 'sentence':None, 'texts':[], 'tokens':[]}
+                elif (name == 'sent_id'):
+                    if (value.endswith('.1')):
+                        output_map['sentence'] = re.sub(r'^(# sent_id = )(\d+\.1)$', r'\1s\2', line.strip())
+                elif (name == 'text'):
+                    output_map['texts'].append(value)
+    write_paragraph(output_file, output_map)
+
+    input_file.close()
+    output_file.close()
+
+
+if (__name__ == '__main__'):
+
+    arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
+    arg_parser.add_argument('-infile', type=str, help='Input file')
+    arg_parser.add_argument('-outfile', type=str, help='Output file')
+    arguments = arg_parser.parse_args()
+    input_file_name = arguments.infile
+    output_file_name = arguments.outfile
+
+    tweak(input_file_name, output_file_name)
diff --git a/scripts/constants.py b/scripts/constants.py
deleted file mode 100644
index 042e768..0000000
--- a/scripts/constants.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# temporary directory
-TMP_DIRECTORY = '../tmp/structure_assignment'
-
-# scripts
-TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
-CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
-TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
-CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
-MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
-STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
-STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
-STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
-STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
-TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
-DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
-
-# resources
-TRANSLATION_FILE_NAME = '../resources/dict.xml'
-CLASSLA_MODELS_DIRECTORY = '../resources/classla'
-STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
-DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
-
-# temporary outputs
-STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt'
-OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu'
-OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu'
-CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu'
-CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu'
-TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml'
-TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml'
-TEI_SINGLE_STRUCTURE_FILE_NAME = TMP_DIRECTORY + '/tei_single_with_ids.xml'
-TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml'
-TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xml'
-TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml'
-MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
-MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
-STRUCTURE_OLD_FILE_NAME = TMP_DIRECTORY + '/structures_old.xml'
-STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
-DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml'
-DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml'
-DICTIONARY_FILE_NAME = TMP_DIRECTORY + '/dictionary.xml'
diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py
index 956b433..1da3a16 100644
--- a/scripts/pipeline1.py
+++ b/scripts/pipeline1.py
@@ -1,13 +1,10 @@
 import argparse
-import os
-import shutil
-import codecs
 
 import classla
 from classla import Document
 from classla.models.common.conll import CoNLLFile
 
-from constants import *
+import structure_assignment.pipeline as pipeline
 
 arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.')
 arg_parser.add_argument('-inlist', type=str, help='Input list file')
@@ -16,55 +13,15 @@
 arguments = arg_parser.parse_args()
 input_file_name = arguments.inlist
 output_file_name = arguments.outtei
-NLP_CONFIG_MAP = {
-    'treebank': 'sl_ssj_jos',
-    'processors': 'tokenize,pos,lemma,depparse',
-    'tokenize_pretokenized': True,
-    'models_dir': CLASSLA_MODELS_DIRECTORY
-}
-
-XML_ID_PREFIX = 's'
-
 def run_pipeline(input_file_name, output_file_name):
-    shutil.rmtree(TMP_DIRECTORY, True)
-    os.makedirs(TMP_DIRECTORY, exist_ok=True)
-    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
-    run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
-    tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
-    run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_OUTPUT_FILE_NAME)
-    run_jos_translation(CLASSLA_OUTPUT_FILE_NAME, CLASSLA_TRANSLATED_FILE_NAME)
-    run_tei_conversion(CLASSLA_TRANSLATED_FILE_NAME, TEI_INIT_FILE_NAME)
-    shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
-
-def run_obeliks(list_file_name, conllu_file_name):
-    print('Running obeliks ...')
-    obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name])
-    os.system(obeliks_command)
-
-def tweak_conllu(input_file_name, output_file_name):
-    print('Tweaking conllu results ...')
-    tweak_command = ' '.join(['python', CONLLU_TWEAK_SCRIPT_NAME, '-infile', input_file_name, '-outfile', output_file_name])
-    os.system(tweak_command)
-
-def run_classla(obeliks_file_name, classla_file_name):
-    print('Running classla ...')
-    doc = Document(text=None)
-    conll_file = CoNLLFile(filename=obeliks_file_name)
-    doc.conll_file = conll_file
-    nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
-    result = nlp(doc)
-    result.conll_file.write_conll(classla_file_name)
-
-def run_jos_translation(input_file_name, output_file_name):
-    print('Translating JOS ...')
-    translate_command = ' '.join(['python', TRANSLATION_SCRIPT_NAME, '-infile', input_file_name, '-dict', TRANSLATION_FILE_NAME, '-outfile', output_file_name])
-    print(translate_command)
-    os.system(translate_command)
-
-def run_tei_conversion(classla_file_name, tei_file_name):
-    print('Converting to tei ...')
-    convert_command = ' '.join(['python', CONLLU_TEI_SCRIPT_NAME, '-o', tei_file_name, classla_file_name])
-    print(convert_command)
-    os.system(convert_command)
-
-run_pipeline(input_file_name, output_file_name)
+    pipeline.initialise(temp_dir='/tmp/structure_assignment_pipeline1', resource_dir='../resources')
+    pipeline.import_string_file(input_file_name)
+    pipeline.do_tokenise()
+    pipeline.do_tweak_conllu()
+    pipeline.do_parse()
+    pipeline.do_translate_jos()
+    pipeline.do_conllu_to_tei()
+    pipeline.export_parsed_file(output_file_name)
+
+if (__name__ == '__main__'):
+    run_pipeline(input_file_name, output_file_name)
diff --git a/scripts/tweak_conllu.py b/scripts/tweak_conllu.py
deleted file mode 100644
index 53ad6d2..0000000
--- a/scripts/tweak_conllu.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import argparse
-import codecs
-import re
-
-arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
-arg_parser.add_argument('-infile', type=str, help='Input file')
-arg_parser.add_argument('-outfile', type=str, help='Output file')
-arguments = arg_parser.parse_args()
-input_file_name = arguments.infile
-output_file_name = arguments.outfile
-
-output_file = codecs.open(output_file_name, 'w')
-input_file = codecs.open(input_file_name, 'r')
-
-def write(output_file, line):
-    output_file.write(line + '\n')
-
-def write_paragraph(output_file, output_map):
-    if (output_map is not None):
-        write(output_file, output_map['paragraph'])
-        write(output_file, output_map['sentence'])
-        write(output_file, '# text = ' + ' '.join(output_map['texts']))
-        for (index, token_line) in enumerate(output_map['tokens'], start=1):
-            write(output_file, '\t'.join([str(index)] + token_line.split('\t')[1:]))
-        write(output_file, '')
-
-output_map = None
-for line in input_file:
-    if (line[0].isdigit()):
-        output_map['tokens'].append(line.strip())
-    else:
-        match = re.search('^# (.+?) = (.+)$', line)
-        if (match):
-            (name, value) = match.groups()
-            if (name == 'newpar id'):
-                write_paragraph(output_file, output_map)
-                paragraph_line = re.sub('^(# newpar id = )(\d+)$', r'\1p\2', line.strip())
-                output_map = {'paragraph': paragraph_line, 'sentence':None, 'texts':[], 'tokens':[]}
-            elif (name == 'sent_id'):
-                if (value.endswith('.1')):
-                    output_map['sentence'] = re.sub('^(# sent_id = )(\d+\.1)$', r'\1s\2', line.strip())
-            elif (name == 'text'):
-                output_map['texts'].append(value)
-write_paragraph(output_file, output_map)
-
-input_file.close()
-output_file.close()
\ No newline at end of file
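
Usage sketch (appended for review, not part of the commit): the relocated package can be driven end to end exactly as scripts/pipeline1.py now does. The two paths below are placeholders to adjust for your checkout.

    import structure_assignment.pipeline as pipeline

    # initialise() wipes and recreates temp_dir on every run (rmtree + makedirs);
    # resource_dir must contain dict.xml and the classla/ models directory.
    pipeline.initialise(temp_dir='/tmp/structure_assignment_demo', resource_dir='/path/to/resources')
    pipeline.import_string_file('strings.txt')  # one input string per line
    pipeline.do_tokenise()        # obeliks tokenisation -> obeliks_raw.conllu
    pipeline.do_tweak_conllu()    # repair ids -> obeliks_tweaked.conllu
    pipeline.do_parse()           # classla pos/lemma/depparse -> classla_raw.conllu
    pipeline.do_translate_jos()   # JOS tag translation -> classla_translated.conllu
    pipeline.do_conllu_to_tei()   # CoNLL-U to TEI -> tei_initial.xml
    pipeline.export_parsed_file('parse.xml')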
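
A minimal smoke test for the reworked endpoint, assuming the Flask app is served locally on port 5000 with API_PREFIX set to /api (both assumptions, not fixed by the commit):

    import requests

    # The route is API_PREFIX + '/test/<string>'; the handler returns
    # {'ok': True, 'message': <pretty-printed TEI XML>} on success and
    # {'ok': False, 'message': <exception text>} on failure.
    response = requests.get('http://localhost:5000/api/test/hiša')  # hypothetical URL
    body = response.json()
    print(body['ok'])
    print(body['message'])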
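
For reference, a hand-made illustration of what tweak_conllu.tweak does to obeliks output (token columns elided as '...'): it prefixes the numeric paragraph and sentence ids with 'p'/'s' so they become valid XML ids, keeps only each paragraph's first sent_id, joins the sentence texts, and renumbers the tokens consecutively across the paragraph.

    before:
        # newpar id = 1
        # sent_id = 1.1
        # text = Zelo dober
        1	Zelo	...
        2	dober	...
        # sent_id = 1.2
        # text = predlog
        1	predlog	...

    after:
        # newpar id = p1
        # sent_id = s1.1
        # text = Zelo dober predlog
        1	Zelo	...
        2	dober	...
        3	predlog	...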