diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py
deleted file mode 100644
index 31e11a3..0000000
--- a/scripts/pipeline1.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import argparse
-
-from structure_assignment.pipeline import Pipeline, create_nlp
-
-def run_pipeline(nlp, input_file_name, output_file_name):
-    pipeline = Pipeline(nlp)
-    pipeline.import_file(input_file_name, 'strings-list')
-    pipeline.do_tokenise()
-    pipeline.do_tweak_conllu()
-    pipeline.do_parse()
-    pipeline.do_translate_jos()
-    pipeline.do_conllu_to_tei()
-    pipeline.export_file(output_file_name, 'tei-initial')
-    pipeline.cleanup()
-
-if (__name__ == '__main__'):
-
-    arg_parser = argparse.ArgumentParser(description='Parse Slovene strings and convert to TEI.')
-    arg_parser.add_argument('-inlist', type=str, help='Input list file')
-    arg_parser.add_argument('-outtei', type=str, help='Output TEI file')
-    arguments = arg_parser.parse_args()
-    input_file_name = arguments.inlist
-    output_file_name = arguments.outtei
-
-    nlp = create_nlp('../resources')
-    run_pipeline(input_file_name, output_file_name)
diff --git a/scripts/pipeline2.py b/scripts/pipeline2.py
deleted file mode 100644
index cfd669e..0000000
--- a/scripts/pipeline2.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import argparse
-import os
-import shutil
-import lxml.etree as lxml
-
-from constants import *
-
-arg_parser = argparse.ArgumentParser(description='Assign parsed Slovene strings to structures and generate lexicon.')
-arg_parser.add_argument('-intei', type=str, required=True, help='Parsed and manually edited TEI file')
-arg_parser.add_argument('-instructures', type=str, required=True, help='Input structure file')
-arg_parser.add_argument('-outlexicon', type=str, required=True, help='Output lexicon file')
-arg_parser.add_argument('-outstructures', type=str, required=True, help='Output structure file')
-arguments = arg_parser.parse_args()
-
-input_tei_file_name = arguments.intei
-input_structure_file_name = arguments.instructures
-output_lexicon_file_name = arguments.outlexicon
-output_structure_file_name = arguments.outstructures
-
-def run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name):
-
-    # setup and split
-    shutil.rmtree(TMP_DIRECTORY, True)
-    os.makedirs(TMP_DIRECTORY, exist_ok=True)
-    shutil.copyfile(input_tei_file_name, TEI_INIT_FILE_NAME)
-    shutil.copyfile(input_structure_file_name, STRUCTURE_OLD_FILE_NAME)
-    split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME)
-
-    # single-token units
-    run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_OLD_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME)
-    run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
-
-    # multiple-token units
-    run_mwe_extraction(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME)
-    run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME)
-    run_structure_creation(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
-    validate_structures(STRUCTURE_NEW_FILE_NAME)
-    run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME)
-    run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME)
-    run_dictionary_conversion(TEI_MULTIPLE_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME)
-
-    # merge and finish
-    merge_dictionaries(DICTIONARY_SINGLE_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME, DICTIONARY_FILE_NAME)
-    validate_dictionary(DICTIONARY_FILE_NAME)
-    shutil.copyfile(DICTIONARY_FILE_NAME, output_lexicon_file_name)
-    shutil.copyfile(STRUCTURE_NEW_FILE_NAME, output_structure_file_name)
-
-def split_tei_input(input_file_name, single_file_name, multiple_file_name):
-    print('Splitting TEI input file ...')
-    split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '-infile', input_file_name, '-single', single_file_name, '-multiple', multiple_file_name])
-    os.system(split_command)
-
-def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
-    print('Extracting MWEs from tei ...')
-    extraction_command = ' '.join(['python', MWE_EXTRACTION_SCRIPT_NAME, structure_file_name, tei_file_name, '--all', mwe_csv_file_name, '--skip-id-check', '--fixed-restriction-order', '--new-tei'])
-    print(extraction_command)
-    os.system(extraction_command)
-
-def validate_structures(structure_file_name):
-    print('Validating updated structure specifications ...')
-    xml_schema = lxml.XMLSchema(lxml.parse(STRUCTURE_SCHEMA_FILE_NAME))
-    xml_tree = lxml.parse(structure_file_name)
-    xml_schema.assertValid(xml_tree)
-
-def run_structure_single_assignment(input_file_name, structure_file_name, output_file_name):
-    print('Assigning structure ids ...')
-    assignment_command = ' '.join(['python', STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME, '-infile', input_file_name, '-structures', structure_file_name, '-outfile', output_file_name])
-    os.system(assignment_command)
-
-def run_structure_multiple_assignment(tei_file_name, mwe_csv_file_name, output_file_name):
-    print('Assigning structure ids ...')
-    assignment_command = ' '.join(['python', STRUCTURE_ASSIGNMENT_SCRIPT_NAME, tei_file_name, mwe_csv_file_name, output_file_name])
-    os.system(assignment_command)
-
-def run_structure_creation(input_file_name, tei_file_name, output_file_name):
-    print('Creating missing structures ...')
-    creation_command = ' '.join(['python', STRUCTURE_CREATION_SCRIPT_NAME, '-infile', input_file_name, '-tei', tei_file_name, '-outfile', output_file_name])
-    os.system(creation_command)
-
-def run_dictionary_conversion(tei_file_name, xml_file_name):
-    print('Converting to dictionary xml format ...')
-    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name])
-    os.system(convert_command)
-
-def merge_dictionaries(single_file_name, multiple_file_name, output_file_name):
-    print('Merging dictionary files ...')
-    merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '-multiple', multiple_file_name, '-outfile', output_file_name])
-    os.system(merge_command)
-
-def validate_dictionary(dictionary_file_name):
-    print('Validating output dictionary file ...')
-    xml_schema = lxml.XMLSchema(lxml.parse(DICTIONARY_SCHEMA_FILE_NAME))
-    xml_tree = lxml.parse(dictionary_file_name)
-    xml_schema.assertValid(xml_tree)
-
-run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name)
diff --git a/scripts/process.py b/scripts/process.py
new file mode 100644
index 0000000..504a1e4
--- /dev/null
+++ b/scripts/process.py
@@ -0,0 +1,88 @@
+import argparse
+
+from structure_assignment.pipeline import Pipeline, create_nlp
+
+resource_directory = '../resources'
+
+def run_all(input_file_name, output_file_name, nlp, structure_file_name):
+    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    strings_to_parse(input_file_name, tmp_file_name, nlp)
+    parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+    validate_structures(structure_file_name)
+    validate_dictionary(output_file_name)
+
+def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
+    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    strings_to_parse(input_file_name, tmp_file_name, nlp)
+    parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+
+def strings_to_parse(input_file_name, output_file_name, nlp):
+    pipeline = Pipeline(nlp, resource_directory)
+    pipeline.import_file(input_file_name, 'strings-list')
+    pipeline.do_tokenise()
+    pipeline.do_tweak_conllu()
+    pipeline.export_file(output_file_name, 'obeliks-tweaked')
+    # pipeline.do_parse()
+    # pipeline.do_translate_jos()
+    # pipeline.do_conllu_to_tei()
+    # pipeline.export_file(output_file_name, 'tei-initial')
+    pipeline.cleanup()
+
+def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
+    pipeline = Pipeline(None, resource_directory)
+    pipeline.import_file(input_file_name, 'tei-initial')
+    pipeline.do_split_tei()
+    pipeline.do_assign_single()
+    pipeline.do_tei_to_dictionary_single()
+    pipeline.do_find_structure_units_first()
+    pipeline.do_assign_multiple_first()
+    pipeline.do_create_structures()
+    pipeline.do_find_structure_units_second()
+    pipeline.do_assign_multiple_second()
+    pipeline.do_tei_to_dictionary_multiple()
+    pipeline.do_merge_dictionaries()
+    pipeline.export_file(output_file_name, 'dictionary')
+    pipeline.export_file(structure_file_name, 'structures-new')
+    pipeline.cleanup()
+
+def validate_structures(input_file_name):
+    pipeline = Pipeline(None, resource_directory)
+    pipeline.import_file(input_file_name, 'structures-new')
+    pipeline.do_validate_structures()
+    pipeline.cleanup()
+
+def validate_dictionary(input_file_name):
+    pipeline = Pipeline(None, resource_directory)
+    pipeline.import_file(input_file_name, 'dictionary')
+    pipeline.do_validate_dictionary()
+    pipeline.cleanup()
+
+
+if (__name__ == '__main__'):
+
+    arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
+    arg_parser.add_argument('-part', type=str, help='Part name')
+    arg_parser.add_argument('-infile', type=str, help='Input file')
+    arg_parser.add_argument('-outfile', type=str, help='Output file')
+    arg_parser.add_argument('-structures', type=str, help='Updated structure file')
+    arguments = arg_parser.parse_args()
+
+    part_name = arguments.part
+    input_file_name = arguments.infile
+    output_file_name = arguments.outfile
+    structure_file_name = arguments.structures
+
+    if (part_name == 'strings_to_parse'):
+        nlp = create_nlp(resource_directory)
+        strings_to_parse(input_file_name, output_file_name, nlp)
+    elif (part_name == 'strings_to_dictionary'):
+        nlp = create_nlp(resource_directory)
+        strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name)
+    elif (part_name == 'parse_to_dictionary'):
+        parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
+    elif (part_name == 'validate_structures'):
+        validate_structures(input_file_name)
+    elif (part_name == 'validate_dictionary'):
+        validate_dictionary(input_file_name)
+    elif (part_name == 'all'):
+        run_all(input_file_name, output_file_name, create_nlp(resource_directory), structure_file_name)
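
A minimal usage sketch for the new scripts/process.py entry point, assuming it is run from the scripts/ directory; the flags and part names come from the argparse setup above, while the file names are hypothetical placeholders:

    # tokenise raw strings and export the tweaked CoNLL-U stage
    python process.py -part strings_to_parse -infile strings.txt -outfile parsed.xml

    # full pipeline: strings in, validated dictionary and updated structure file out
    python process.py -part all -infile strings.txt -outfile dictionary.xml -structures structures.xml

    # re-validate a previously generated dictionary on its own
    python process.py -part validate_dictionary -infile dictionary.xml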