|
|
|
@ -15,13 +15,13 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
|
|
|
|
|
|
|
|
|
|
class Runner:
|
|
|
|
|
|
|
|
|
|
def __init__(self, classla_directory, nlp_needed, wani=None):
|
|
|
|
|
def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
|
|
|
|
|
self.classla_directory = classla_directory
|
|
|
|
|
if (nlp_needed):
|
|
|
|
|
NLP_CONFIG_MAP['dir'] = classla_directory
|
|
|
|
|
self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
|
|
|
|
|
if (wani is not None):
|
|
|
|
|
self._provide_wani(wani)
|
|
|
|
|
if (wani_file_name is not None):
|
|
|
|
|
self._provide_wani(wani_file_name)
|
|
|
|
|
|
|
|
|
|
def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
|
|
|
|
|
self.wani_directory = tempfile.mkdtemp()
|
|
|
|
@ -39,7 +39,7 @@ class Runner:
|
|
|
|
|
pipeline.export_file(output_structure_file_name, 'structures-new')
|
|
|
|
|
pipeline.do_validate_dictionary()
|
|
|
|
|
pipeline.export_file(output_file_name, 'dictionary')
|
|
|
|
|
pipeline.cleanup()
|
|
|
|
|
self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
|
|
|
|
pipeline = Pipeline(self.nlp)
|
|
|
|
@ -49,14 +49,14 @@ class Runner:
|
|
|
|
|
self._parse_to_dictionary_sequence(pipeline)
|
|
|
|
|
pipeline.export_file(output_file_name, 'dictionary')
|
|
|
|
|
pipeline.export_file(output_structure_file_name, 'structures-new')
|
|
|
|
|
pipeline.cleanup()
|
|
|
|
|
self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def strings_to_parse(self, input_file_name, output_file_name):
    """Import a 'strings-list' file, parse it and export it as 'tei-initial'.

    Args:
        input_file_name: File imported into the pipeline as 'strings-list'.
        output_file_name: File the pipeline exports as 'tei-initial'.
    """
    pipeline = Pipeline(self.nlp)
    try:
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
    finally:
        # self.cleanup() already calls pipeline.cleanup(), so one call is
        # enough; the finally clause makes it run even if a step raises.
        self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
|
|
|
|
pipeline = Pipeline()
|
|
|
|
@ -65,19 +65,19 @@ class Runner:
|
|
|
|
|
self._parse_to_dictionary_sequence(pipeline)
|
|
|
|
|
pipeline.export_file(output_file_name, 'dictionary')
|
|
|
|
|
pipeline.export_file(output_structure_file_name, 'structures-new')
|
|
|
|
|
pipeline.cleanup()
|
|
|
|
|
self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def validate_structures(self, input_file_name):
    """Import a 'structures-new' file and run structure validation on it.

    Args:
        input_file_name: File imported into the pipeline as 'structures-new'.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
    finally:
        # self.cleanup() already calls pipeline.cleanup(), so one call is
        # enough; the finally clause makes it run even if validation raises.
        self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def validate_dictionary(self, input_file_name):
    """Import a 'dictionary' file and run dictionary validation on it.

    Args:
        input_file_name: File imported into the pipeline as 'dictionary'.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
    finally:
        # self.cleanup() already calls pipeline.cleanup(), so one call is
        # enough; the finally clause makes it run even if validation raises.
        self.cleanup(pipeline)
|
|
|
|
|
|
|
|
|
|
def _strings_to_parse_sequence(self, pipeline):
|
|
|
|
|
pipeline.do_tokenise()
|
|
|
|
@ -92,6 +92,11 @@ class Runner:
|
|
|
|
|
pipeline.do_assign_other_structures()
|
|
|
|
|
pipeline.do_tei_to_dictionary()
|
|
|
|
|
|
|
|
|
|
def cleanup(self, pipeline):
|
|
|
|
|
shutil.rmtree(self.wani_directory, True)
|
|
|
|
|
pipeline.cleanup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Pipeline:
|
|
|
|
|
|
|
|
|
|
def __init__(self, nlp=None):
|
|
|
|
@ -220,4 +225,3 @@ class Pipeline:
|
|
|
|
|
|
|
|
|
|
def cleanup(self):
|
|
|
|
|
shutil.rmtree(self.tmp_directory, True)
|
|
|
|
|
shutil.rmtree(self.wani_directory, True)
|
|
|
|
|