Redmine #1835: moved command functions into class
This commit is contained in:
parent
59dd78f9b2
commit
753cfad794
|
@ -8,7 +8,7 @@ import obeliks
|
||||||
|
|
||||||
import classla
|
import classla
|
||||||
from classla import Document
|
from classla import Document
|
||||||
#from classla.models.common.conll import CoNLLFile
|
from classla.utils.conll import CoNLL
|
||||||
|
|
||||||
from structure_assignment.constants import *
|
from structure_assignment.constants import *
|
||||||
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
||||||
|
@ -21,9 +21,86 @@ from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
|
||||||
from nova_slovnica.create_structures import create as create_structures
|
from nova_slovnica.create_structures import create as create_structures
|
||||||
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
|
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
|
||||||
|
|
||||||
class Runner:
    """Command-level driver for the structure-assignment pipeline.

    Replaces the former module-level command functions (Redmine #1835):
    each public method corresponds to one runnable "part" of the
    pipeline, and builds a fresh ``Pipeline`` over ``resource_directory``
    for its run.
    """

    def __init__(self, resource_directory, nlp_needed):
        """Remember the resource directory and optionally load classla.

        :param resource_directory: root directory holding pipeline
            resources (the classla models live in ``<dir>/classla``).
        :param nlp_needed: load the (expensive) classla NLP pipeline
            only when the requested part actually parses raw strings.
        """
        self.resource_directory = resource_directory
        # Always define self.nlp so methods that forward it to Pipeline
        # never raise AttributeError when the model was not requested.
        self.nlp = None
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)

    def run_all(self, input_file_name, output_file_name, structure_file_name):
        """Strings -> parse -> dictionary + structures, then validate both outputs.

        Fixes over the committed version: ``self`` was missing from the
        signature, and the validation steps were invoked under
        nonexistent names (``_validate_structures``/``_validate_dictionary``);
        the public ``validate_structures``/``validate_dictionary`` methods
        are the ones defined on this class.
        """
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        # Validation re-imports the just-exported files in fresh pipelines.
        self.validate_structures(structure_file_name)
        self.validate_dictionary(output_file_name)
        pipeline.cleanup()

    def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
        """Like run_all but without the final validation passes.

        (``self`` was missing from the committed signature.)
        """
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        pipeline.cleanup()

    def strings_to_parse(self, input_file_name, output_file_name):
        """Tokenise/parse a strings list and export the initial TEI."""
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        pipeline.cleanup()

    def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
        """Run structure assignment on parsed TEI; export dictionary and structures."""
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'tei-initial')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        pipeline.cleanup()

    def validate_structures(self, input_file_name):
        """Validate an exported structures file."""
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'structures-new')
        self._validate_structures_sequence(pipeline)
        pipeline.cleanup()

    def validate_dictionary(self, input_file_name):
        """Validate an exported dictionary file."""
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'dictionary')
        self._validate_dictionary_sequence(pipeline)
        pipeline.cleanup()

    def _strings_to_parse_sequence(self, pipeline):
        # tokenise -> tweak conllu -> parse -> JOS translation -> TEI
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        # Two assignment rounds: single units first, then multi-word
        # structures found in a second pass; dictionaries merged at the end.
        pipeline.do_split_tei()
        pipeline.do_assign_single()
        pipeline.do_tei_to_dictionary_single()
        pipeline.do_find_structure_units_first()
        pipeline.do_assign_multiple_first()
        pipeline.do_create_structures()
        pipeline.do_find_structure_units_second()
        pipeline.do_assign_multiple_second()
        pipeline.do_tei_to_dictionary_multiple()
        pipeline.do_merge_dictionaries()

    def _validate_structures_sequence(self, pipeline):
        pipeline.do_validate_structures()

    def _validate_dictionary_sequence(self, pipeline):
        pipeline.do_validate_dictionary()
class Pipeline:
|
class Pipeline:
|
||||||
|
|
||||||
|
|
|
@ -1,69 +1,9 @@
|
||||||
import argparse
|
import argparse
|
||||||
import tempfile
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
from structure_assignment.pipeline import Pipeline, create_nlp
|
from structure_assignment.pipeline import Runner
|
||||||
|
|
||||||
resource_directory = '../resources'
|
resource_directory = '../resources'
|
||||||
|
|
||||||
def run_all(input_file_name, output_file_name, nlp, structure_file_name):
|
|
||||||
tmp_directory = tempfile.mkdtemp()
|
|
||||||
tmp_file_name = tmp_directory + '/parsed.xml'
|
|
||||||
strings_to_parse(input_file_name, tmp_file_name, nlp)
|
|
||||||
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
|
||||||
shutil.rmtree(tmp_directory)
|
|
||||||
validate_structures(structure_file_name)
|
|
||||||
validate_dictionary(output_file_name)
|
|
||||||
|
|
||||||
def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
|
|
||||||
tmp_directory = tempfile.mkdtemp()
|
|
||||||
tmp_file_name = tmp_directory + '/parsed.xml'
|
|
||||||
strings_to_parse(input_file_name, tmp_file_name, nlp)
|
|
||||||
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
|
||||||
shutil.rmtree(tmp_directory)
|
|
||||||
|
|
||||||
def strings_to_parse(input_file_name, output_file_name, nlp):
|
|
||||||
pipeline = Pipeline(resource_directory, nlp)
|
|
||||||
pipeline.import_file(input_file_name, 'strings-list')
|
|
||||||
pipeline.do_tokenise()
|
|
||||||
pipeline.do_tweak_conllu()
|
|
||||||
pipeline.export_file(output_file_name, 'obeliks-tweaked')
|
|
||||||
# pipeline.do_parse()
|
|
||||||
# pipeline.do_translate_jos()
|
|
||||||
# pipeline.do_conllu_to_tei()
|
|
||||||
# pipeline.export_file(output_file_name, 'tei-initial')
|
|
||||||
pipeline.cleanup()
|
|
||||||
|
|
||||||
def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
|
|
||||||
pipeline = Pipeline(resource_directory)
|
|
||||||
pipeline.import_file(input_file_name, 'tei-initial')
|
|
||||||
pipeline.do_split_tei()
|
|
||||||
pipeline.do_assign_single()
|
|
||||||
pipeline.do_tei_to_dictionary_single()
|
|
||||||
pipeline.do_find_structure_units_first()
|
|
||||||
pipeline.do_assign_multiple_first()
|
|
||||||
pipeline.do_create_structures()
|
|
||||||
pipeline.do_find_structure_units_second()
|
|
||||||
pipeline.do_assign_multiple_second()
|
|
||||||
pipeline.do_tei_to_dictionary_multiple()
|
|
||||||
pipeline.do_merge_dictionaries()
|
|
||||||
pipeline.export_file(output_file_name, 'dictionary')
|
|
||||||
pipeline.export_file(structure_file_name, 'structures-new')
|
|
||||||
pipeline.cleanup()
|
|
||||||
|
|
||||||
def validate_structures(input_file_name):
|
|
||||||
pipeline = Pipeline(resource_directory)
|
|
||||||
pipeline.import_file(input_file_name, 'structures-new')
|
|
||||||
pipeline.do_validate_structures()
|
|
||||||
pipeline.cleanup()
|
|
||||||
|
|
||||||
def validate_dictionary(input_file_name):
|
|
||||||
pipeline = Pipeline(resource_directory)
|
|
||||||
pipeline.import_file(input_file_name, 'dictionary')
|
|
||||||
pipeline.do_validate_dictionary()
|
|
||||||
pipeline.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
if (__name__ == '__main__'):
|
if (__name__ == '__main__'):
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
|
arg_parser = argparse.ArgumentParser(description='Run part or all of structure pipeline.')
|
||||||
|
@ -78,17 +18,17 @@ if (__name__ == '__main__'):
|
||||||
output_file_name = arguments.outfile
structure_file_name = arguments.structures

# The classla NLP model is expensive to load; only construct it for
# parts that actually parse raw strings.
nlp_needed = part_name in {'strings_to_parse', 'strings_to_dictionary', 'all'}
runner = Runner(resource_directory, nlp_needed)
if part_name == 'strings_to_parse':
    runner.strings_to_parse(input_file_name, output_file_name)
elif part_name == 'strings_to_dictionary':
    runner.strings_to_dictionary(input_file_name, output_file_name, structure_file_name)
elif part_name == 'parse_to_dictionary':
    runner.parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
elif part_name == 'validate_structures':
    runner.validate_structures(input_file_name)
elif part_name == 'validate_dictionary':
    runner.validate_dictionary(input_file_name)
elif part_name == 'all':
    # BUG FIX: the committed call still passed the removed local `nlp`
    # (NameError after the refactor); Runner.run_all takes no nlp argument.
    runner.run_all(input_file_name, output_file_name, structure_file_name)
|
Loading…
Reference in New Issue
Block a user