2021-03-15 14:11:28 +00:00
|
|
|
import argparse
|
2021-03-15 15:24:01 +00:00
|
|
|
import tempfile
|
2021-03-18 08:10:27 +00:00
|
|
|
import shutil
|
2021-03-15 14:11:28 +00:00
|
|
|
|
|
|
|
from structure_assignment.pipeline import Pipeline, create_nlp
|
|
|
|
|
|
|
|
# Resource location handed to every Pipeline instance and to create_nlp in this
# script. It is a relative path (presumably resolved against the current
# working directory -- confirm against Pipeline).
resource_directory = '../resources'
|
|
|
|
|
|
|
|
def run_all(input_file_name, output_file_name, nlp, structure_file_name):
    """Run every stage: strings -> parse -> dictionary, then validate both outputs.

    Args:
        input_file_name: Input file, forwarded to ``strings_to_parse``.
        output_file_name: Dictionary output file; validated at the end by
            ``validate_dictionary``.
        nlp: NLP object forwarded unchanged to ``strings_to_parse``.
        structure_file_name: Structure output file; validated at the end by
            ``validate_structures``.
    """
    # The intermediate parse lives in a scratch directory. The context manager
    # removes that directory even when one of the stages raises, so a failed
    # run no longer leaves it behind.
    with tempfile.TemporaryDirectory() as tmp_directory:
        tmp_file_name = tmp_directory + '/parsed.xml'
        strings_to_parse(input_file_name, tmp_file_name, nlp)
        parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)

    # Validation only needs the final outputs, so it runs after the scratch
    # directory is gone.
    validate_structures(structure_file_name)
    validate_dictionary(output_file_name)
|
|
|
|
|
|
|
|
def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
    """Run the strings -> parse -> dictionary stages without validation.

    Args:
        input_file_name: Input file, forwarded to ``strings_to_parse``.
        output_file_name: Dictionary output file written by
            ``parse_to_dictionary``.
        nlp: NLP object forwarded unchanged to ``strings_to_parse``.
        structure_file_name: Structure output file written by
            ``parse_to_dictionary``.
    """
    # The intermediate parse lives in a scratch directory. The context manager
    # removes that directory even when one of the stages raises, so a failed
    # run no longer leaves it behind.
    with tempfile.TemporaryDirectory() as tmp_directory:
        tmp_file_name = tmp_directory + '/parsed.xml'
        strings_to_parse(input_file_name, tmp_file_name, nlp)
        parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
2021-03-15 14:11:28 +00:00
|
|
|
|
|
|
|
def strings_to_parse(input_file_name, output_file_name, nlp):
    """Import a strings list, tokenise and tweak it, and export the result.

    Args:
        input_file_name: File imported into the pipeline as 'strings-list'.
        output_file_name: File the pipeline exports as 'obeliks-tweaked'.
        nlp: NLP object passed to the ``Pipeline`` constructor.
    """
    pipeline = Pipeline(resource_directory, nlp)
    try:
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.export_file(output_file_name, 'obeliks-tweaked')
        # NOTE(review): the remaining stages are disabled, so this function
        # currently exports 'obeliks-tweaked' rather than the 'tei-initial'
        # format that parse_to_dictionary imports -- confirm this is intended.
        # pipeline.do_parse()
        # pipeline.do_translate_jos()
        # pipeline.do_conllu_to_tei()
        # pipeline.export_file(output_file_name, 'tei-initial')
    finally:
        # Always give the pipeline a chance to clean up, even if a stage failed.
        pipeline.cleanup()
|
|
|
|
|
|
|
|
def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
    """Turn an initial TEI parse into a dictionary and an updated structure file.

    Args:
        input_file_name: File imported into the pipeline as 'tei-initial'.
        output_file_name: File the pipeline exports as 'dictionary'.
        structure_file_name: File the pipeline exports as 'structures-new'.
    """
    pipeline = Pipeline(resource_directory)
    try:
        pipeline.import_file(input_file_name, 'tei-initial')

        # Single-component pass.
        pipeline.do_split_tei()
        pipeline.do_assign_single()
        pipeline.do_tei_to_dictionary_single()

        # Multiple-component pass, first round, followed by structure creation.
        pipeline.do_find_structure_units_first()
        pipeline.do_assign_multiple_first()
        pipeline.do_create_structures()

        # Multiple-component pass, second round.
        pipeline.do_find_structure_units_second()
        pipeline.do_assign_multiple_second()
        pipeline.do_tei_to_dictionary_multiple()

        # Combine the dictionaries and write both outputs.
        pipeline.do_merge_dictionaries()
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
    finally:
        # Always give the pipeline a chance to clean up, even if a stage failed.
        pipeline.cleanup()
|
|
|
|
|
|
|
|
def validate_structures(input_file_name):
    """Validate a structure file with the pipeline's structure validator.

    Args:
        input_file_name: File imported into the pipeline as 'structures-new'.
    """
    pipeline = Pipeline(resource_directory)
    try:
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
    finally:
        # Always give the pipeline a chance to clean up, even if validation failed.
        pipeline.cleanup()
|
|
|
|
|
|
|
|
def validate_dictionary(input_file_name):
    """Validate a dictionary file with the pipeline's dictionary validator.

    Args:
        input_file_name: File imported into the pipeline as 'dictionary'.
    """
    pipeline = Pipeline(resource_directory)
    try:
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
    finally:
        # Always give the pipeline a chance to clean up, even if validation failed.
        pipeline.cleanup()
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line entry point: run one named part of the pipeline, or all of it.
if __name__ == '__main__':

    arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
    arg_parser.add_argument('-part', type=str, help='Part name')
    arg_parser.add_argument('-infile', type=str, help='Input file')
    arg_parser.add_argument('-outfile', type=str, help='Output file')
    arg_parser.add_argument('-structures', type=str, help='Updated structure file')
    arguments = arg_parser.parse_args()

    part_name = arguments.part
    input_file_name = arguments.infile
    output_file_name = arguments.outfile
    structure_file_name = arguments.structures

    # The nlp object is only built in the branches whose part takes it as an
    # argument.
    if part_name == 'strings_to_parse':
        nlp = create_nlp(resource_directory)
        strings_to_parse(input_file_name, output_file_name, nlp)
    elif part_name == 'strings_to_dictionary':
        nlp = create_nlp(resource_directory)
        strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name)
    elif part_name == 'parse_to_dictionary':
        parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
    elif part_name == 'validate_structures':
        validate_structures(input_file_name)
    elif part_name == 'validate_dictionary':
        validate_dictionary(input_file_name)
    elif part_name == 'all':
        # Previously this branch used `nlp` without creating it, which raised
        # NameError as soon as '-part all' was selected.
        nlp = create_nlp(resource_directory)
        run_all(input_file_name, output_file_name, nlp, structure_file_name)
    else:
        # A missing or misspelled part used to be ignored silently; report it.
        arg_parser.error('unknown -part value: {!r}'.format(part_name))
|