diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index dfb2913..7e12460 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -8,7 +8,7 @@ import obeliks
 
 import classla
 from classla import Document
-#from classla.models.common.conll import CoNLLFile
+from classla.utils.conll import CoNLL
 
 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
@@ -21,9 +21,101 @@ from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
 from nova_slovnica.create_structures import create as create_structures
 from structure_assignment.merge_dictionaries import merge as merge_dictionaries
 
-def create_nlp(resource_directory):
-    NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
-    return classla.Pipeline('sl', **NLP_CONFIG_MAP)
+class Runner:
+    """Run complete sequences of Pipeline steps between an input file and exported output files."""
+
+    def __init__(self, resource_directory, nlp_needed):
+        """Remember the resource directory and, when nlp_needed is true, build the classla pipeline."""
+        self.resource_directory = resource_directory
+        # Define the attribute unconditionally so methods reading self.nlp never raise AttributeError.
+        self.nlp = None
+        if (nlp_needed):
+            NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
+            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+
+    def run_all(self, input_file_name, output_file_name, structure_file_name):
+        """Turn a strings list into a dictionary and a structures file, then validate both files."""
+        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_parse_sequence(pipeline)
+        self._parse_to_dictionary_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'dictionary')
+        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.cleanup()
+        # The arguments are file names, so validate through the file-based public methods,
+        # each of which builds its own pipeline from the exported file.
+        self.validate_structures(structure_file_name)
+        self.validate_dictionary(output_file_name)
+
+    def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
+        """Turn a strings list into a dictionary and a structures file, without validation."""
+        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_parse_sequence(pipeline)
+        self._parse_to_dictionary_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'dictionary')
+        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.cleanup()
+
+    def strings_to_parse(self, input_file_name, output_file_name):
+        """Import a strings list, run the tokenise-to-TEI steps and export the 'tei-initial' result."""
+        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_parse_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'tei-initial')
+        pipeline.cleanup()
+
+    def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
+        """Import a 'tei-initial' file and export the 'dictionary' and 'structures-new' results."""
+        pipeline = Pipeline(self.resource_directory)
+        pipeline.import_file(input_file_name, 'tei-initial')
+        self._parse_to_dictionary_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'dictionary')
+        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.cleanup()
+
+    def validate_structures(self, input_file_name):
+        """Import a 'structures-new' file into a fresh pipeline and run structure validation."""
+        pipeline = Pipeline(self.resource_directory)
+        pipeline.import_file(input_file_name, 'structures-new')
+        self._validate_structures_sequence(pipeline)
+        pipeline.cleanup()
+
+    def validate_dictionary(self, input_file_name):
+        """Import a 'dictionary' file into a fresh pipeline and run dictionary validation."""
+        pipeline = Pipeline(self.resource_directory)
+        pipeline.import_file(input_file_name, 'dictionary')
+        self._validate_dictionary_sequence(pipeline)
+        pipeline.cleanup()
+
+    def _strings_to_parse_sequence(self, pipeline):
+        """Run, in order, the steps from tokenisation through CoNLL-U to TEI conversion."""
+        pipeline.do_tokenise()
+        pipeline.do_tweak_conllu()
+        pipeline.do_parse()
+        pipeline.do_translate_jos()
+        pipeline.do_conllu_to_tei()
+
+    def _parse_to_dictionary_sequence(self, pipeline):
+        """Run, in order, the steps from TEI splitting through dictionary merging."""
+        pipeline.do_split_tei()
+        pipeline.do_assign_single()
+        pipeline.do_tei_to_dictionary_single()
+        pipeline.do_find_structure_units_first()
+        pipeline.do_assign_multiple_first()
+        pipeline.do_create_structures()
+        pipeline.do_find_structure_units_second()
+        pipeline.do_assign_multiple_second()
+        pipeline.do_tei_to_dictionary_multiple()
+        pipeline.do_merge_dictionaries()
+
+    def _validate_structures_sequence(self, pipeline):
+        """Run the structure validation step on the given pipeline."""
+        pipeline.do_validate_structures()
+
+    def _validate_dictionary_sequence(self, pipeline):
+        """Run the dictionary validation step on the given pipeline."""
+        pipeline.do_validate_dictionary()
+
 
 class Pipeline:
 
diff --git a/scripts/process.py b/scripts/process.py
index f1e9f9a..0e4296d 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -1,69 +1,9 @@
 import argparse
-import tempfile
-import shutil
 
-from structure_assignment.pipeline import Pipeline, create_nlp
+from structure_assignment.pipeline import Runner
 
 resource_directory = '../resources'
 
-def run_all(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_directory = tempfile.mkdtemp()
-    tmp_file_name = tmp_directory + '/parsed.xml'
-    strings_to_parse(input_file_name, tmp_file_name, nlp)
-    parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
-    shutil.rmtree(tmp_directory)
-    validate_structures(structure_file_name)
-    validate_dictionary(output_file_name)
-
-def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_directory = tempfile.mkdtemp()
-    tmp_file_name = tmp_directory + '/parsed.xml'
-    strings_to_parse(input_file_name, tmp_file_name, nlp)
-    parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
-    shutil.rmtree(tmp_directory)
-
-def strings_to_parse(input_file_name, output_file_name, nlp):
-    pipeline = Pipeline(resource_directory, nlp)
-    pipeline.import_file(input_file_name, 'strings-list')
-    pipeline.do_tokenise()
-    pipeline.do_tweak_conllu()
-    pipeline.export_file(output_file_name, 'obeliks-tweaked')
-    # pipeline.do_parse()
-    # pipeline.do_translate_jos()
-    # pipeline.do_conllu_to_tei()
-    # pipeline.export_file(output_file_name, 'tei-initial')
-    pipeline.cleanup()
-
-def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
-    pipeline = Pipeline(resource_directory)
-    pipeline.import_file(input_file_name, 'tei-initial')
-    pipeline.do_split_tei()
-    pipeline.do_assign_single()
-    pipeline.do_tei_to_dictionary_single()
-    pipeline.do_find_structure_units_first()
-    pipeline.do_assign_multiple_first()
-    pipeline.do_create_structures()
-    pipeline.do_find_structure_units_second()
-    pipeline.do_assign_multiple_second()
-    pipeline.do_tei_to_dictionary_multiple()
-    pipeline.do_merge_dictionaries()
-    pipeline.export_file(output_file_name, 'dictionary')
-    pipeline.export_file(structure_file_name, 'structures-new')
-    pipeline.cleanup()
-
-def validate_structures(input_file_name):
-    pipeline = Pipeline(resource_directory)
-    pipeline.import_file(input_file_name, 'structures-new')
-    pipeline.do_validate_structures()
-    pipeline.cleanup()
-
-def validate_dictionary(input_file_name):
-    pipeline = Pipeline(resource_directory)
-    pipeline.import_file(input_file_name, 'dictionary')
-    pipeline.do_validate_dictionary()
-    pipeline.cleanup()
-
-
 if (__name__ == '__main__'):
 
     arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
@@ -78,17 +18,17 @@ if (__name__ == '__main__'):
     output_file_name = arguments.outfile
     structure_file_name = arguments.structures
 
+    nlp_needed = part_name in {'strings_to_parse', 'strings_to_dictionary', 'all'}
+    runner = Runner(resource_directory, nlp_needed)
     if (part_name == 'strings_to_parse'):
-        nlp = create_nlp(resource_directory)
-        strings_to_parse(input_file_name, output_file_name, nlp)
+        runner.strings_to_parse(input_file_name, output_file_name)
     elif (part_name == 'strings_to_dictionary'):
-        nlp = create_nlp(resource_directory)
-        strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name)
+        runner.strings_to_dictionary(input_file_name, output_file_name, structure_file_name)
     elif (part_name == 'parse_to_dictionary'):
-        parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
+        runner.parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
     elif (part_name == 'validate_structures'):
-        validate_structures(input_file_name)
+        runner.validate_structures(input_file_name)
     elif (part_name == 'validate_dictionary'):
-        validate_dictionary(input_file_name)
+        runner.validate_dictionary(input_file_name)
     elif (part_name == 'all'):
-        run_all(input_file_name, output_file_name, nlp, structure_file_name)
+        runner.run_all(input_file_name, output_file_name, structure_file_name)