IssueID #1835: moved pipeline1 components into package

This commit is contained in:
Cyprian Laskowski 2021-03-01 15:00:25 +01:00
parent d88809be8d
commit 5f25682036
7 changed files with 205 additions and 146 deletions

View File

@@ -1,15 +1,40 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os import os
import lxml.etree as lxml
from flask import Flask, jsonify, make_response, request from flask import Flask, jsonify, make_response, request
from flask_httpauth import HTTPBasicAuth from flask_httpauth import HTTPBasicAuth
import structure_assignment.pipeline as pipeline
app = Flask(__name__) app = Flask(__name__)
api_prefix = os.environ['API_PREFIX'] api_prefix = os.environ['API_PREFIX']
resource_directory = os.environ['API_RESOURCE_DIR']
tmp_directory = os.environ['API_TMP']
@app.route(api_prefix + '/test/<string:string>', methods=['GET']) @app.route(api_prefix + '/test/<string:string>', methods=['GET'])
def test(string): def test(string):
results = {'input':string}
string_file_name = '/tmp/string.txt'
parse_file_name = '/tmp/parse.xml'
with open(string_file_name, 'w') as string_file:
string_file.write(string + '\n')
try:
pipeline.initialise(temp_dir=tmp_directory, resource_dir=resource_directory)
pipeline.import_string_file(string_file_name)
pipeline.do_tokenise()
pipeline.do_tweak_conllu()
pipeline.do_parse()
pipeline.do_translate_jos()
pipeline.do_conllu_to_tei()
pipeline.export_parsed_file(parse_file_name)
tei = lxml.parse(parse_file_name).getroot()
message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
ok = True
except Exception as e:
message = str(e)
ok = False
results = {'ok':ok, 'message':message}
return jsonify(results) return jsonify(results)

View File

@@ -0,0 +1,37 @@
# Shared file-name constants for the structure-assignment pipeline.
# Script names are resolved by the callers; resource paths are relative to the
# package; temporary-output names are bare file names, joined onto the
# configured temp directory by the pipeline.

# scripts
TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'  # duplicate assignment removed
TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'

# resources
TRANSLATION_FILE_NAME = '../resources/dict.xml'
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'

# temporary outputs
STRING_LIST_FILE_NAME = 'strings.txt'
OBELIKS_RAW_FILE_NAME = 'obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = 'obeliks_tweaked.conllu'
CLASSLA_OUTPUT_FILE_NAME = 'classla_raw.conllu'
CLASSLA_TRANSLATED_FILE_NAME = 'classla_translated.conllu'
TEI_INIT_FILE_NAME = 'tei_initial.xml'
TEI_SINGLE_FILE_NAME = 'tei_single.xml'
TEI_SINGLE_STRUCTURE_FILE_NAME = 'tei_single_with_ids.xml'
TEI_MULTIPLE_FILE_NAME = 'tei_multiple.xml'
TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = 'tei_multiple_with_ids1.xml'
TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = 'tei_multiple_with_ids2.xml'
MWE_CSV_1_FILE_NAME = 'mwes1.csv'
MWE_CSV_2_FILE_NAME = 'mwes2.csv'
STRUCTURE_OLD_FILE_NAME = 'structures_old.xml'
STRUCTURE_NEW_FILE_NAME = 'structures_new.xml'
DICTIONARY_SINGLE_FILE_NAME = 'dictionary_single.xml'
DICTIONARY_MULTIPLE_FILE_NAME = 'dictionary_multiple.xml'
DICTIONARY_FILE_NAME = 'dictionary.xml'

View File

@@ -0,0 +1,74 @@
import codecs
import shutil
import os
import obeliks
import classla
from classla import Document
from classla.models.common.conll import CoNLLFile
from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
# Base classla pipeline configuration. 'models_dir' is a placeholder (None)
# until do_parse() fills it in from resource_directory at call time.
NLP_CONFIG_MAP = {
    'treebank': 'sl_ssj_jos',
    'processors': 'tokenize,pos,lemma,depparse',
    'tokenize_pretokenized': True,
    'models_dir': None
}

# Prefix for sentence XML ids.
# NOTE(review): not referenced anywhere in this module's visible code — confirm
# it is used by a caller before removing.
XML_ID_PREFIX = 's'

# Package-level working directories; both are set by initialise() and must be
# configured before any other pipeline step is called.
resource_directory = None
tmp_directory = None
def __get_tmp_file_name(file_name):
    """Return the path of *file_name* inside the configured temp directory.

    Requires initialise() to have set tmp_directory first.
    """
    # os.path.join is platform-aware and tolerates a trailing separator on
    # tmp_directory, unlike manual '/'-concatenation.
    return os.path.join(tmp_directory, file_name)
def initialise(*, temp_dir, resource_dir):
    """Configure the pipeline's working directories and reset the temp dir.

    Both arguments are keyword-only, matching how all callers invoke this
    function (initialise(temp_dir=..., resource_dir=...)). The previous
    **argument_map signature hid the required keys; this makes a missing
    argument fail with a clear TypeError instead of a KeyError.

    :param temp_dir: directory for intermediate pipeline files; wiped and
        recreated on every call so each run starts from a clean slate.
    :param resource_dir: directory holding models and dictionaries.
    """
    global tmp_directory, resource_directory
    tmp_directory = temp_dir
    resource_directory = resource_dir
    # ignore_errors=True: it is fine if the directory does not exist yet.
    shutil.rmtree(tmp_directory, True)
    os.makedirs(tmp_directory, exist_ok=True)
def import_string_file(file_name):
    """Stage *file_name* as the pipeline's input string list in the temp directory."""
    destination = __get_tmp_file_name(STRING_LIST_FILE_NAME)
    shutil.copyfile(file_name, destination)
def do_tokenise():
    """Tokenise the staged string list with obeliks into raw CoNLL-U output."""
    source = __get_tmp_file_name(STRING_LIST_FILE_NAME)
    target = __get_tmp_file_name(OBELIKS_RAW_FILE_NAME)
    obeliks.run(in_file=source, out_file=target, conllu=True)
def do_tweak_conllu():
    """Normalise the raw obeliks CoNLL-U output via tweak_conllu."""
    tweak_conllu(
        __get_tmp_file_name(OBELIKS_RAW_FILE_NAME),
        __get_tmp_file_name(OBELIKS_TWEAKED_FILE_NAME),
    )
def do_parse():
    """Run the classla pipeline (pos/lemma/depparse) over the tweaked CoNLL-U.

    Reads OBELIKS_TWEAKED_FILE_NAME from the temp directory and writes the
    parsed result to CLASSLA_OUTPUT_FILE_NAME. Requires initialise() to have
    set resource_directory (classla models live under <resource_dir>/classla).
    """
    input_file_name = __get_tmp_file_name(OBELIKS_TWEAKED_FILE_NAME)
    output_file_name = __get_tmp_file_name(CLASSLA_OUTPUT_FILE_NAME)
    document = Document(text=None)
    document.conll_file = CoNLLFile(filename=input_file_name)
    # Work on a copy instead of mutating the shared module-level config map:
    # the old in-place write made NLP_CONFIG_MAP's state depend on call order.
    config = dict(NLP_CONFIG_MAP)
    config['models_dir'] = resource_directory + '/classla'
    nlp = classla.Pipeline('sl', **config)
    result = nlp(document)
    result.conll_file.write_conll(output_file_name)
def do_translate_jos():
    """Translate JOS annotations in the parsed CoNLL-U using the bundled dictionary."""
    dictionary_file_name = resource_directory + '/dict.xml'
    translate_jos(
        __get_tmp_file_name(CLASSLA_OUTPUT_FILE_NAME),
        dictionary_file_name,
        __get_tmp_file_name(CLASSLA_TRANSLATED_FILE_NAME),
    )
def do_conllu_to_tei():
    """Convert the translated CoNLL-U file into the initial TEI document."""
    conllu_to_tei(
        __get_tmp_file_name(CLASSLA_TRANSLATED_FILE_NAME),
        __get_tmp_file_name(TEI_INIT_FILE_NAME),
    )
def export_parsed_file(file_name):
    """Copy the pipeline's TEI output out of the temp directory to *file_name*."""
    source = __get_tmp_file_name(TEI_INIT_FILE_NAME)
    shutil.copyfile(source, file_name)

View File

@@ -0,0 +1,54 @@
import argparse
import codecs
import re
def write(output_file, line):
    """Write *line* to *output_file*, terminated with a newline."""
    output_file.write('\n'.join((line, '')))
def write_paragraph(output_file, output_map):
    """Flush one buffered paragraph: header line, sentence id line, merged
    '# text' line, then the token lines renumbered from 1, and a blank line.

    A None *output_map* (nothing buffered yet) is a no-op.
    """
    if output_map is None:
        return
    write(output_file, output_map['paragraph'])
    write(output_file, output_map['sentence'])
    write(output_file, '# text = ' + ' '.join(output_map['texts']))
    for index, token_line in enumerate(output_map['tokens'], start=1):
        columns = token_line.split('\t')
        # Replace the original token index with the merged-sentence index.
        write(output_file, '\t'.join([str(index)] + columns[1:]))
    write(output_file, '')
def tweak(input_file_name, output_file_name):
    """Rewrite Obeliks CoNLL-U so each paragraph becomes one merged,
    renumbered sentence with XML-safe ids ('p'/'s' prefixes).

    :param input_file_name: path of the raw CoNLL-U file to read.
    :param output_file_name: path the tweaked CoNLL-U is written to.
    """
    # Compile once; raw strings avoid invalid-escape warnings for '\d'.
    comment_pattern = re.compile(r'^# (.+?) = (.+)$')
    # `with` guarantees both handles are closed even if a malformed line
    # raises mid-loop (the old code leaked them on error).
    with codecs.open(input_file_name, 'r') as input_file:
        with codecs.open(output_file_name, 'w') as output_file:
            output_map = None  # buffer for the paragraph currently being merged
            for line in input_file:
                if line[0].isdigit():
                    # Token line — buffer it. NOTE(review): assumes a
                    # '# newpar id' comment always precedes the first token,
                    # otherwise output_map is still None here.
                    output_map['tokens'].append(line.strip())
                else:
                    match = comment_pattern.search(line)
                    if match:
                        name, value = match.groups()
                        if name == 'newpar id':
                            # New paragraph: flush the previous one and start a
                            # fresh buffer with a 'p'-prefixed paragraph id.
                            write_paragraph(output_file, output_map)
                            paragraph_line = re.sub(r'^(# newpar id = )(\d+)$', r'\1p\2', line.strip())
                            output_map = {'paragraph': paragraph_line, 'sentence': None, 'texts': [], 'tokens': []}
                        elif name == 'sent_id':
                            # Keep only the paragraph's first sentence id,
                            # prefixed with 's'; later ids are dropped.
                            if value.endswith('.1'):
                                output_map['sentence'] = re.sub(r'^(# sent_id = )(\d+\.1)$', r'\1s\2', line.strip())
                        elif name == 'text':
                            output_map['texts'].append(value)
            # Flush the final buffered paragraph.
            write_paragraph(output_file, output_map)
if __name__ == '__main__':
    # Command-line entry point: tweak one file named by -infile/-outfile.
    parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
    parser.add_argument('-infile', type=str, help='Input file')
    parser.add_argument('-outfile', type=str, help='Output file')
    args = parser.parse_args()
    tweak(args.infile, args.outfile)

View File

@ -1,41 +0,0 @@
# Legacy constants for the pre-package pipeline: scripts, resources, and
# temporary-output paths rooted at a hard-coded TMP_DIRECTORY.

# temporary directory
TMP_DIRECTORY = '../tmp/structure_assignment'

# scripts
TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'  # duplicate assignment removed
TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'

# resources
TRANSLATION_FILE_NAME = '../resources/dict.xml'
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'

# temporary outputs
STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt'
OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu'
CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu'
CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu'
TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml'
TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml'
TEI_SINGLE_STRUCTURE_FILE_NAME = TMP_DIRECTORY + '/tei_single_with_ids.xml'
TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml'
TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xml'
TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml'
MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
STRUCTURE_OLD_FILE_NAME = TMP_DIRECTORY + '/structures_old.xml'
STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml'
DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml'
DICTIONARY_FILE_NAME = TMP_DIRECTORY + '/dictionary.xml'

View File

@@ -1,13 +1,10 @@
import argparse import argparse
import os
import shutil
import codecs
import classla import classla
from classla import Document from classla import Document
from classla.models.common.conll import CoNLLFile from classla.models.common.conll import CoNLLFile
from constants import * import structure_assignment.pipeline as pipeline
arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.') arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.')
arg_parser.add_argument('-inlist', type=str, help='Input list file') arg_parser.add_argument('-inlist', type=str, help='Input list file')
@@ -16,55 +13,15 @@ arguments = arg_parser.parse_args()
input_file_name = arguments.inlist input_file_name = arguments.inlist
output_file_name = arguments.outtei output_file_name = arguments.outtei
NLP_CONFIG_MAP = {
'treebank': 'sl_ssj_jos',
'processors': 'tokenize,pos,lemma,depparse',
'tokenize_pretokenized': True,
'models_dir': CLASSLA_MODELS_DIRECTORY
}
XML_ID_PREFIX = 's'
def run_pipeline(input_file_name, output_file_name): def run_pipeline(input_file_name, output_file_name):
shutil.rmtree(TMP_DIRECTORY, True) pipeline.initialise(temp_dir='/tmp/structure_assignment_pipeline1', resource_dir='../resources')
os.makedirs(TMP_DIRECTORY, exist_ok=True) pipeline.import_string_file(input_file_name)
shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME) pipeline.do_tokenise()
run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME) pipeline.do_tweak_conllu()
tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME) pipeline.do_parse()
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_OUTPUT_FILE_NAME) pipeline.do_translate_jos()
run_jos_translation(CLASSLA_OUTPUT_FILE_NAME, CLASSLA_TRANSLATED_FILE_NAME) pipeline.do_conllu_to_tei()
run_tei_conversion(CLASSLA_TRANSLATED_FILE_NAME, TEI_INIT_FILE_NAME) pipeline.export_parsed_file(output_file_name)
shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
def run_obeliks(list_file_name, conllu_file_name): if (__name__ == '__main__'):
print('Running obeliks ...') run_pipeline(input_file_name, output_file_name)
obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name])
os.system(obeliks_command)
def tweak_conllu(input_file_name, output_file_name):
print('Tweaking conllu results ...')
tweak_command = ' '.join(['python', CONLLU_TWEAK_SCRIPT_NAME, '-infile', input_file_name, '-outfile', output_file_name])
os.system(tweak_command)
def run_classla(obeliks_file_name, classla_file_name):
print('Running classla ...')
doc = Document(text=None)
conll_file = CoNLLFile(filename=obeliks_file_name)
doc.conll_file = conll_file
nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
result = nlp(doc)
result.conll_file.write_conll(classla_file_name)
def run_jos_translation(input_file_name, output_file_name):
print('Translating JOS ...')
translate_command = ' '.join(['python', TRANSLATION_SCRIPT_NAME, '-infile', input_file_name, '-dict', TRANSLATION_FILE_NAME, '-outfile', output_file_name])
print(translate_command)
os.system(translate_command)
def run_tei_conversion(classla_file_name, tei_file_name):
print('Converting to tei ...')
convert_command = ' '.join(['python', CONLLU_TEI_SCRIPT_NAME, '-o', tei_file_name, classla_file_name])
print(convert_command)
os.system(convert_command)
run_pipeline(input_file_name, output_file_name)

View File

@ -1,47 +0,0 @@
# Legacy standalone script: merge each Obeliks CoNLL-U paragraph into a single
# renumbered sentence with XML-safe ('p'/'s'-prefixed) ids. Runs entirely at
# import time — it is meant to be executed directly, not imported.
import argparse
import codecs
import re

# Command-line interface: -infile/-outfile name the files to read and write.
arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
arg_parser.add_argument('-infile', type=str, help='Input file')
arg_parser.add_argument('-outfile', type=str, help='Output file')
arguments = arg_parser.parse_args()

input_file_name = arguments.infile
output_file_name = arguments.outfile

# NOTE(review): both handles stay open for the whole run and are only closed
# at the bottom; an exception mid-loop would leak them.
output_file = codecs.open(output_file_name, 'w')
input_file = codecs.open(input_file_name, 'r')

def write(output_file, line):
    # Write one line, newline-terminated.
    output_file.write(line + '\n')

def write_paragraph(output_file, output_map):
    # Flush one buffered paragraph: header, sentence id, merged '# text' line,
    # token lines renumbered from 1, then a blank separator line.
    # A None output_map (nothing buffered yet) is a no-op.
    if (output_map is not None):
        write(output_file, output_map['paragraph'])
        write(output_file, output_map['sentence'])
        write(output_file, '# text = ' + ' '.join(output_map['texts']))
        for (index, token_line) in enumerate(output_map['tokens'], start=1):
            # Replace the original token index with the merged-sentence index.
            write(output_file, '\t'.join([str(index)] + token_line.split('\t')[1:]))
        write(output_file, '')

# Main loop: buffer each paragraph in output_map, flushing on every new
# '# newpar id' comment and once more after the last line.
output_map = None
for line in input_file:
    if (line[0].isdigit()):
        # Token line — buffer it. NOTE(review): assumes '# newpar id' always
        # precedes the first token, otherwise output_map is still None here.
        output_map['tokens'].append(line.strip())
    else:
        match = re.search('^# (.+?) = (.+)$', line)
        if (match):
            (name, value) = match.groups()
            if (name == 'newpar id'):
                # New paragraph: flush the previous one, prefix its id with 'p'.
                write_paragraph(output_file, output_map)
                paragraph_line = re.sub('^(# newpar id = )(\d+)$', r'\1p\2', line.strip())
                output_map = {'paragraph': paragraph_line, 'sentence':None, 'texts':[], 'tokens':[]}
            elif (name == 'sent_id'):
                # Keep only the paragraph's first sentence id, prefixed with 's'.
                if (value.endswith('.1')):
                    output_map['sentence'] = re.sub('^(# sent_id = )(\d+\.1)$', r'\1s\2', line.strip())
            elif (name == 'text'):
                # Collect sentence texts; they are joined into one '# text' line.
                output_map['texts'].append(value)
# Flush the final buffered paragraph.
write_paragraph(output_file, output_map)
input_file.close()
output_file.close()