Add Runner.strings_to_tokens: import a 'strings-list' file, run the
tokenise-only step sequence (_strings_to_tokens_sequence -> do_tokenise),
export the result as 'obeliks-tokenised', then clean up the pipeline.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Line boundaries were rebuilt from the hunk line counts; the 4-space
indentation and the inline placement of the "# TODO: refactor/tidy" comment are
assumptions -- confirm against the target file before applying.

diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index da5baba..0d10aca 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -52,6 +52,13 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         self.cleanup(pipeline)
 
+    def strings_to_tokens(self, input_file_name, output_file_name):
+        pipeline = Pipeline(self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_tokens_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'obeliks-tokenised')
+        self.cleanup(pipeline)
+
     def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): # TODO: refactor/tidy
         classla_conllu_file_name = '/tmp/classla.conlu'
 
@@ -132,6 +139,9 @@ class Runner:
         pipeline.do_validate_dictionary()
         self.cleanup(pipeline)
 
+    def _strings_to_tokens_sequence(self, pipeline):
+        pipeline.do_tokenise()
+
     def _strings_to_parse_sequence(self, pipeline):
         pipeline.do_tokenise()
         pipeline.do_tweak_conllu()