Redmine #2921: Added new 'strings to tokens' sequence

Luka Dragar 2025-02-20 22:56:07 +01:00
parent 442f3ca163
commit eb52405300


@@ -52,6 +52,13 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         self.cleanup(pipeline)
 
+    def strings_to_tokens(self, input_file_name, output_file_name):
+        pipeline = Pipeline(self.nlp)
+        pipeline.import_file(input_file_name, 'strings-list')
+        self._strings_to_tokens_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'obeliks-tokenised')
+        self.cleanup(pipeline)
+
     def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):  # TODO: refactor/tidy
         classla_conllu_file_name = '/tmp/classla.conlu'
@@ -132,6 +139,9 @@ class Runner:
         pipeline.do_validate_dictionary()
         self.cleanup(pipeline)
 
+    def _strings_to_tokens_sequence(self, pipeline):
+        pipeline.do_tokenise()
+
     def _strings_to_parse_sequence(self, pipeline):
         pipeline.do_tokenise()
         pipeline.do_tweak_conllu()
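
For context, a minimal usage sketch of the new sequence, assuming an already-initialised Runner instance (the variable name and file names below are placeholders, not part of this commit):

# Sketch: tokenise a plain 'strings-list' input into Obeliks-style tokens.
# 'runner' is assumed to be a constructed Runner with its NLP pipeline loaded;
# the file names are illustrative only.
runner.strings_to_tokens('strings.txt', 'tokens.xml')

# Per the diff, this imports the input as 'strings-list', runs only
# pipeline.do_tokenise() via the new _strings_to_tokens_sequence helper,
# exports the result in the 'obeliks-tokenised' format, and cleans up,
# skipping the tweak/parse steps used by _strings_to_parse_sequence.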