Redmine #1487: handled dependencies in setup.py and adjusted resource handling
This commit is contained in:
		
							parent
							
								
									d421cb3c03
								
							
						
					
					
						commit
						8409d2722f
					
				
							
								
								
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,3 +1,2 @@ | ||||
| /venv | ||||
| /lib | ||||
| /resources | ||||
| __pycache__ | ||||
| tmp | ||||
| @@ -2,7 +2,8 @@ import argparse | ||||
| 
 | ||||
| from structure_assignment.pipeline import Runner | ||||
| 
 | ||||
| resource_directory = '../resources' | ||||
| classla_directory = '../resources/classla' | ||||
| wani_file_name = '../resources/wani.py'  # TODO: remove once luscenje_struktur incorporates wani in package | ||||
| 
 | ||||
| if (__name__ == '__main__'): | ||||
| 
 | ||||
| @@ -21,7 +22,7 @@ if (__name__ == '__main__'): | ||||
|     output_structure_file_name = arguments.outstructs | ||||
| 
 | ||||
|     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'} | ||||
|     runner = Runner(resource_directory, nlp_needed) | ||||
|     runner = Runner(classla_directory, nlp_needed, wani_file_name) | ||||
|     if (mode == 'strings_to_parse'): | ||||
|         runner.strings_to_parse(input_file_name, output_file_name) | ||||
|     elif (mode == 'strings_to_dictionary'): | ||||
|  | ||||
| @@ -1,34 +0,0 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| cd "$(dirname "$0")" | ||||
| cd .. | ||||
| 
 | ||||
| mkdir lib resources | ||||
| 
 | ||||
| ## get dependencies | ||||
| cd lib | ||||
| git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git | ||||
| git clone git@gitea.cjvt.si:generic/xml_schemas.git | ||||
| cd .. | ||||
| 
 | ||||
| ## prepare python environment | ||||
| python3 -m venv venv | ||||
| source venv/bin/activate | ||||
| pip install wheel | ||||
| pip install lxml | ||||
| pip install psycopg2cffi | ||||
| pip install sqlalchemy | ||||
| pip install classla | ||||
| python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')" | ||||
| pip install lib/luscenje_struktur/ | ||||
| pip install git+https://git@gitea.cjvt.si/generic/conversion_utils.git#egg=conversion_utils | ||||
| pip install package/ | ||||
| deactivate | ||||
| 
 | ||||
| ## put needed resources in place | ||||
| cd resources | ||||
| ln -s ../lib/luscenje_struktur/wani.py . | ||||
| ln -s ../lib/xml_schemas/resources/schema/structures.xsd . | ||||
| ln -s ../lib/xml_schemas/resources/schema/inventory.xsd . | ||||
| ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd . | ||||
| cd .. | ||||
							
								
								
									
										11
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,10 +1,17 @@ | ||||
| from setuptools import setup | ||||
| 
 | ||||
| setup(name='structure_assignment', | ||||
|       version='0.1', | ||||
|       version='0.2', | ||||
|       description='Pipeline for parsing and assigning structures to arbitrary Slovenian strings', | ||||
|       url='', | ||||
|       url='https://gitea.cjvt.si/generic/structure_assignment_pipeline', | ||||
|       author='Cyprian Laskowski', | ||||
|       author_email='cyp@cjvt.si', | ||||
|       packages=['structure_assignment'], | ||||
|       install_requires=['lxml', | ||||
|                         'classla', | ||||
|                         'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git', | ||||
|                         'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198',  # TODO: switch to master once luscenje_struktur's i2198 is merged into master | ||||
|                         'psycopg2cffi',  # TODO: remove once luscenje_struktur takes care of it | ||||
|                         'sqlalchemy',  # TODO: remove once luscenje_struktur takes care of it | ||||
|       ], | ||||
|       zip_safe=True) | ||||
|  | ||||
| @@ -1,4 +1,3 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from types import SimpleNamespace | ||||
| @@ -16,14 +15,22 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary | ||||
| 
 | ||||
| class Runner: | ||||
| 
 | ||||
|     def __init__(self, resource_directory, nlp_needed): | ||||
|         self.resource_directory = resource_directory | ||||
|     def __init__(self, classla_directory, nlp_needed, wani=None): | ||||
|         self.classla_directory = classla_directory | ||||
|         if (nlp_needed): | ||||
|             NLP_CONFIG_MAP['dir'] = resource_directory + '/classla' | ||||
|             self.nlp =  classla.Pipeline('sl', **NLP_CONFIG_MAP) | ||||
|             NLP_CONFIG_MAP['dir'] = classla_directory | ||||
|             self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP) | ||||
|         if (wani is not None): | ||||
|             self._provide_wani(wani) | ||||
| 
 | ||||
|     def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package | ||||
|         self.wani_directory = tempfile.mkdtemp() | ||||
|         shutil.copy(wani_file_name, self.wani_directory) | ||||
|         import sys | ||||
|         sys.path.insert(0, self.wani_directory) | ||||
| 
 | ||||
|     def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory, self.nlp) | ||||
|         pipeline = Pipeline(self.nlp) | ||||
|         pipeline.import_file(input_file_name, 'strings-list') | ||||
|         pipeline.import_file(input_structure_file_name, 'structures-old') | ||||
|         self._strings_to_parse_sequence(pipeline) | ||||
| @@ -35,7 +42,7 @@ class Runner: | ||||
|         pipeline.cleanup() | ||||
| 
 | ||||
|     def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory, self.nlp) | ||||
|         pipeline = Pipeline(self.nlp) | ||||
|         pipeline.import_file(input_file_name, 'strings-list') | ||||
|         pipeline.import_file(input_structure_file_name, 'structures-old') | ||||
|         self._strings_to_parse_sequence(pipeline) | ||||
| @@ -45,14 +52,14 @@ class Runner: | ||||
|         pipeline.cleanup() | ||||
| 
 | ||||
|     def strings_to_parse(self, input_file_name, output_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory, self.nlp) | ||||
|         pipeline = Pipeline(self.nlp) | ||||
|         pipeline.import_file(input_file_name, 'strings-list') | ||||
|         self._strings_to_parse_sequence(pipeline) | ||||
|         pipeline.export_file(output_file_name, 'tei-initial') | ||||
|         pipeline.cleanup() | ||||
| 
 | ||||
|     def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory) | ||||
|         pipeline = Pipeline() | ||||
|         pipeline.import_file(input_file_name, 'tei-initial') | ||||
|         pipeline.import_file(input_structure_file_name, 'structures-old') | ||||
|         self._parse_to_dictionary_sequence(pipeline) | ||||
| @@ -61,13 +68,13 @@ class Runner: | ||||
|         pipeline.cleanup() | ||||
| 
 | ||||
|     def validate_structures(self, input_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory) | ||||
|         pipeline = Pipeline() | ||||
|         pipeline.import_file(input_file_name, 'structures-new') | ||||
|         pipeline.do_validate_structures() | ||||
|         pipeline.cleanup() | ||||
| 
 | ||||
|     def validate_dictionary(self, input_file_name): | ||||
|         pipeline = Pipeline(self.resource_directory) | ||||
|         pipeline = Pipeline() | ||||
|         pipeline.import_file(input_file_name, 'dictionary') | ||||
|         pipeline.do_validate_dictionary() | ||||
|         pipeline.cleanup() | ||||
| @@ -87,17 +94,10 @@ class Runner: | ||||
| 
 | ||||
| class Pipeline: | ||||
| 
 | ||||
|     def __init__(self, resource_directory, nlp=None): | ||||
|     def __init__(self, nlp=None): | ||||
|         self.nlp = nlp | ||||
|         self.tmp_directory = tempfile.mkdtemp() | ||||
|         resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)] | ||||
|         for resource_file_name in resource_file_names: | ||||
|             if (os.path.isfile(resource_file_name)): | ||||
|                 shutil.copy(resource_file_name, self.tmp_directory) | ||||
|         import sys | ||||
|         sys.path.insert(0, self.tmp_directory) | ||||
|         self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()} | ||||
|         self.classla_directory = resource_directory + '/classla' | ||||
| 
 | ||||
|     def import_file(self, file_name, file_key): | ||||
|         shutil.copyfile(file_name, self.file_map[file_key]) | ||||
| @@ -108,7 +108,7 @@ class Pipeline: | ||||
|         output_file_name = self.file_map['obeliks-tokenised'] | ||||
|         with open(input_file_name, 'r') as input_file: | ||||
|             input_conllu = input_file.read() | ||||
|         tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory) | ||||
|         tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir) | ||||
|         output_conllu = tokeniser(input_conllu).to_conll() | ||||
|         with open(output_file_name, 'w') as output_file: | ||||
|             output_file.write(output_conllu) | ||||
| @@ -220,3 +220,4 @@ class Pipeline: | ||||
| 
 | ||||
|     def cleanup(self): | ||||
|         shutil.rmtree(self.tmp_directory, True) | ||||
|         shutil.rmtree(self.wani_directory, True) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user