From eb52405300ef95cfd05a5701b72fff0caa3e5df4 Mon Sep 17 00:00:00 2001
From: Luka Dragar
Date: Thu, 20 Feb 2025 22:56:07 +0100
Subject: [PATCH] Redmine #2921: Added new 'strings to tokens' sequence

---
Notes (free-text area; ignored when the patch is applied):

  * Adds Runner.strings_to_tokens(input_file_name, output_file_name):
    builds a Pipeline from self.nlp, imports the input file as
    'strings-list', runs _strings_to_tokens_sequence, exports the result
    as 'obeliks-tokenised', then calls self.cleanup(pipeline).
  * Adds Runner._strings_to_tokens_sequence(pipeline), which calls only
    pipeline.do_tokenise() (the first step of the existing
    _strings_to_parse_sequence).

 structure_assignment/pipeline.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index da5baba..0d10aca 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -52,6 +52,13 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         self.cleanup(pipeline)
 
+    def strings_to_tokens(self, input_file_name, output_file_name):
+        pipeline = Pipeline(self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_tokens_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'obeliks-tokenised')
+        self.cleanup(pipeline)
+
     def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         # TODO: refactor/tidy
         classla_conllu_file_name = '/tmp/classla.conlu'
@@ -132,6 +139,9 @@ class Runner:
         pipeline.do_validate_dictionary()
         self.cleanup(pipeline)
 
+    def _strings_to_tokens_sequence(self, pipeline):
+        pipeline.do_tokenise()
+
     def _strings_to_parse_sequence(self, pipeline):
         pipeline.do_tokenise()
         pipeline.do_tweak_conllu()