From 308164742d7a559b418505e9650bcace275d8c85 Mon Sep 17 00:00:00 2001
From: Luka Dragar
Date: Thu, 20 Feb 2025 22:56:07 +0100
Subject: [PATCH] Redmine #2921: Added new 'strings to tokens' sequence

---
 structure_assignment/pipeline.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index da5baba..c99c653 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -52,6 +52,13 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         self.cleanup(pipeline)
 
+    def strings_to_tokens(self, input_file_name, output_file_name):
+        pipeline = Pipeline(self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_tokens_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'classla-translated')
+        self.cleanup(pipeline)
+
     def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         # TODO: refactor/tidy
         classla_conllu_file_name = '/tmp/classla.conlu'
@@ -132,6 +139,12 @@ class Runner:
         pipeline.do_validate_dictionary()
         self.cleanup(pipeline)
 
+    def _strings_to_tokens_sequence(self, pipeline):
+        pipeline.do_tokenise()
+        pipeline.do_tweak_conllu()
+        pipeline.do_parse()
+        pipeline.do_translate_jos()
+
     def _strings_to_parse_sequence(self, pipeline):
         pipeline.do_tokenise()
         pipeline.do_tweak_conllu()