"""Command-line driver for running part or all of the structure pipeline."""

import argparse
import os
import tempfile

from structure_assignment.pipeline import Pipeline, create_nlp

resource_directory = '../resources'


def run_all(input_file_name, output_file_name, nlp, structure_file_name):
    """Run strings-to-dictionary conversion, then validate both outputs.

    Args:
        input_file_name: Path of the strings-list input file.
        output_file_name: Path the dictionary is exported to.
        nlp: NLP object handed to ``Pipeline`` (see ``create_nlp``).
        structure_file_name: Path the updated structures are exported to.
    """
    strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name)
    validate_structures(structure_file_name)
    validate_dictionary(output_file_name)


def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
    """Chain ``strings_to_parse`` and ``parse_to_dictionary`` via a temp file.

    Args:
        input_file_name: Path of the strings-list input file.
        output_file_name: Path the dictionary is exported to.
        nlp: NLP object handed to ``Pipeline`` (see ``create_nlp``).
        structure_file_name: Path the updated structures are exported to.
    """
    # A private temporary directory avoids collisions between concurrent runs
    # (the intermediate file used to be the fixed path /tmp/tmp.xml) and is
    # removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as tmp_directory:
        tmp_file_name = os.path.join(tmp_directory, 'tmp.xml')
        strings_to_parse(input_file_name, tmp_file_name, nlp)
        parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)


def strings_to_parse(input_file_name, output_file_name, nlp):
    """Import a strings list, run the tokenise and tweak steps, and export.

    Args:
        input_file_name: Path imported as 'strings-list'.
        output_file_name: Path exported as 'obeliks-tweaked'.
        nlp: NLP object handed to ``Pipeline``.
    """
    # NOTE(review): this currently exports 'obeliks-tweaked', whereas
    # parse_to_dictionary imports 'tei-initial'; the parse/translate/TEI steps
    # below are disabled. Confirm whether chaining the two is still intended.
    pipeline = Pipeline(nlp, resource_directory)
    pipeline.import_file(input_file_name, 'strings-list')
    pipeline.do_tokenise()
    pipeline.do_tweak_conllu()
    pipeline.export_file(output_file_name, 'obeliks-tweaked')
    # pipeline.do_parse()
    # pipeline.do_translate_jos()
    # pipeline.do_conllu_to_tei()
    # pipeline.export_file(output_file_name, 'tei-initial')
    pipeline.cleanup()


def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
    """Run the structure-assignment steps and export dictionary and structures.

    Args:
        input_file_name: Path imported as 'tei-initial'.
        output_file_name: Path exported as 'dictionary'.
        structure_file_name: Path exported as 'structures-new'.
    """
    pipeline = Pipeline(None, resource_directory)
    pipeline.import_file(input_file_name, 'tei-initial')
    pipeline.do_split_tei()
    pipeline.do_assign_single()
    pipeline.do_tei_to_dictionary_single()
    pipeline.do_find_structure_units_first()
    pipeline.do_assign_multiple_first()
    pipeline.do_create_structures()
    pipeline.do_find_structure_units_second()
    pipeline.do_assign_multiple_second()
    pipeline.do_tei_to_dictionary_multiple()
    pipeline.do_merge_dictionaries()
    pipeline.export_file(output_file_name, 'dictionary')
    pipeline.export_file(structure_file_name, 'structures-new')
    pipeline.cleanup()


def validate_structures(input_file_name):
    """Import a 'structures-new' file and run the structure validation step."""
    pipeline = Pipeline(None, resource_directory)
    pipeline.import_file(input_file_name, 'structures-new')
    pipeline.do_validate_structures()
    pipeline.cleanup()


def validate_dictionary(input_file_name):
    """Import a 'dictionary' file and run the dictionary validation step."""
    pipeline = Pipeline(None, resource_directory)
    pipeline.import_file(input_file_name, 'dictionary')
    pipeline.do_validate_dictionary()
    pipeline.cleanup()


if __name__ == '__main__':

    arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
    arg_parser.add_argument('-part', type=str, help='Part name')
    arg_parser.add_argument('-infile', type=str, help='Input file')
    arg_parser.add_argument('-outfile', type=str, help='Output file')
    arg_parser.add_argument('-structures', type=str, help='Updated structure file')
    arguments = arg_parser.parse_args()

    part_name = arguments.part
    input_file_name = arguments.infile
    output_file_name = arguments.outfile
    structure_file_name = arguments.structures

    # The NLP object is only created for the parts that actually tokenise.
    if part_name == 'strings_to_parse':
        nlp = create_nlp(resource_directory)
        strings_to_parse(input_file_name, output_file_name, nlp)
    elif part_name == 'strings_to_dictionary':
        nlp = create_nlp(resource_directory)
        strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name)
    elif part_name == 'parse_to_dictionary':
        parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
    elif part_name == 'validate_structures':
        validate_structures(input_file_name)
    elif part_name == 'validate_dictionary':
        validate_dictionary(input_file_name)
    elif part_name == 'all':
        nlp = create_nlp(resource_directory)
        run_all(input_file_name, output_file_name, nlp, structure_file_name)
    else:
        # Previously an unknown or missing part silently did nothing.
        arg_parser.error('Unknown or missing -part: {}'.format(part_name))