Redmine #1487: handled dependencies in setup.py and adjusted resource handling

Author: Cyprian Laskowski, 2022-07-29 19:42:02 +02:00
parent d421cb3c03
commit 8409d2722f
5 changed files with 35 additions and 61 deletions

.gitignore (vendored): 5 changed lines

@@ -1,3 +1,2 @@
-/venv
-/lib
-/resources
+__pycache__
+tmp

(command-line script, file name not shown)

@@ -2,7 +2,8 @@ import argparse
 from structure_assignment.pipeline import Runner
-resource_directory = '../resources'
+classla_directory = '../resources/classla'
+wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
 if (__name__ == '__main__'):
@@ -21,7 +22,7 @@ if (__name__ == '__main__'):
     output_structure_file_name = arguments.outstructs
     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
-    runner = Runner(resource_directory, nlp_needed)
+    runner = Runner(classla_directory, nlp_needed, wani_file_name)
     if (mode == 'strings_to_parse'):
         runner.strings_to_parse(input_file_name, output_file_name)
     elif (mode == 'strings_to_dictionary'):
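The script now hands Runner the classla model directory and the wani.py path explicitly instead of a generic resource directory. A minimal sketch of the equivalent direct call, assuming the relative paths used above; the input and output file names are purely illustrative:

    from structure_assignment.pipeline import Runner

    classla_directory = '../resources/classla'  # classla models, as in the script above
    wani_file_name = '../resources/wani.py'     # temporary, until luscenje_struktur ships wani

    # nlp_needed=True loads the classla pipeline up front; wani is optional.
    runner = Runner(classla_directory, nlp_needed=True, wani=wani_file_name)
    runner.strings_to_parse('strings.txt', 'parsed_tei.xml')  # illustrative file names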

(setup shell script, deleted; file name not shown)

@@ -1,34 +0,0 @@
-#!/bin/bash
-cd "$(dirname "$0")"
-cd ..
-mkdir lib resources
-## get dependencies
-cd lib
-git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
-git clone git@gitea.cjvt.si:generic/xml_schemas.git
-cd ..
-## prepare python environment
-python3 -m venv venv
-source venv/bin/activate
-pip install wheel
-pip install lxml
-pip install psycopg2cffi
-pip install sqlalchemy
-pip install classla
-python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
-pip install lib/luscenje_struktur/
-pip install git+https://git@gitea.cjvt.si/generic/conversion_utils.git#egg=conversion_utils
-pip install package/
-deactivate
-## put needed resources in place
-cd resources
-ln -s ../lib/luscenje_struktur/wani.py .
-ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
-ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
-ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
-cd ..
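The pip installs from this deleted script move into setup.py (below); the one step that still has to happen outside pip is downloading the classla models. A minimal sketch of that step, taken from the deleted script and assuming the '../resources/classla' location the command-line script expects:

    import classla

    # Fetch the Slovenian standard_jos models into the directory passed to Runner
    # as classla_directory (path assumed from the command-line script; adjust as needed).
    classla.download('sl', type='standard_jos', dir='../resources/classla')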

setup.py

@@ -1,10 +1,17 @@
 from setuptools import setup
 setup(name='structure_assignment',
-      version='0.1',
+      version='0.2',
       description='Pipeline for parsing and assigning structures to arbitrary Slovenian strings',
-      url='',
+      url='https://gitea.cjvt.si/generic/structure_assignment_pipeline',
       author='Cyprian Laskowski',
       author_email='cyp@cjvt.si',
       packages=['structure_assignment'],
+      install_requires=['lxml',
+                        'classla',
+                        'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
+                        'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198', # TODO: switch to master once luscenje_struktur's i2198 is merged into master
+                        'psycopg2cffi', # TODO: remove once luscenje_struktur takes care of it
+                        'sqlalchemy', # TODO: remove once luscenje_struktur takes care of it
+                        ],
       zip_safe=True)
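With the dependencies declared in install_requires, including direct git references, pip resolves them when the package itself is installed, and the declared set can be inspected afterwards. A small sketch using only the standard library, assuming Python 3.8+ and an installed distribution named 'structure_assignment':

    from importlib.metadata import requires, version

    # Print the version and the requirements that setup.py declared.
    print(version('structure_assignment'))
    for requirement in requires('structure_assignment') or []:
        print(requirement)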

structure_assignment/pipeline.py

@@ -1,4 +1,3 @@
-import os
 import shutil
 import tempfile
 from types import SimpleNamespace
@@ -16,14 +15,22 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
 class Runner:
-    def __init__(self, resource_directory, nlp_needed):
-        self.resource_directory = resource_directory
+    def __init__(self, classla_directory, nlp_needed, wani=None):
+        self.classla_directory = classla_directory
         if (nlp_needed):
-            NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
+            NLP_CONFIG_MAP['dir'] = classla_directory
             self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+        if (wani is not None):
+            self._provide_wani(wani)
+    def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
+        self.wani_directory = tempfile.mkdtemp()
+        shutil.copy(wani_file_name, self.wani_directory)
+        import sys
+        sys.path.insert(0, self.wani_directory)
     def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
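The new _provide_wani helper makes the standalone wani.py importable by copying it into a throwaway directory and pushing that directory onto the front of sys.path. A minimal sketch of the same trick in isolation; the provide_module helper and the staging variable are illustrative names, and the wani.py path is the one the command-line script uses:

    import shutil
    import sys
    import tempfile

    def provide_module(module_file_name):
        # Copy a single-file module into a fresh temp directory and make it importable.
        staging_directory = tempfile.mkdtemp()
        shutil.copy(module_file_name, staging_directory)
        sys.path.insert(0, staging_directory)
        return staging_directory  # keep the path so it can be removed later

    staging = provide_module('../resources/wani.py')
    import wani  # resolved via the staging directory
    shutil.rmtree(staging, True)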
@@ -35,7 +42,7 @@ class Runner:
         pipeline.cleanup()
     def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
@@ -45,14 +52,14 @@ class Runner:
         pipeline.cleanup()
     def strings_to_parse(self, input_file_name, output_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         self._strings_to_parse_sequence(pipeline)
         pipeline.export_file(output_file_name, 'tei-initial')
         pipeline.cleanup()
     def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'tei-initial')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._parse_to_dictionary_sequence(pipeline)
@@ -61,13 +68,13 @@ class Runner:
         pipeline.cleanup()
     def validate_structures(self, input_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'structures-new')
         pipeline.do_validate_structures()
         pipeline.cleanup()
     def validate_dictionary(self, input_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'dictionary')
         pipeline.do_validate_dictionary()
         pipeline.cleanup()
@@ -87,17 +94,10 @@ class Runner:
 class Pipeline:
-    def __init__(self, resource_directory, nlp=None):
+    def __init__(self, nlp=None):
         self.nlp = nlp
         self.tmp_directory = tempfile.mkdtemp()
-        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
-        for resource_file_name in resource_file_names:
-            if (os.path.isfile(resource_file_name)):
-                shutil.copy(resource_file_name, self.tmp_directory)
-        import sys
-        sys.path.insert(0, self.tmp_directory)
         self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
-        self.classla_directory = resource_directory + '/classla'
     def import_file(self, file_name, file_key):
         shutil.copyfile(file_name, self.file_map[file_key])
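Pipeline now only maps FILE_MAP keys (defined elsewhere in the package) to paths inside its own temp directory; nothing is copied from a resource directory and sys.path is left alone. A sketch of that mapping with hypothetical FILE_MAP entries:

    import tempfile

    # Hypothetical entries; the real FILE_MAP lives in the structure_assignment package.
    FILE_MAP = {'strings-list': 'strings.txt', 'tei-initial': 'tei_initial.xml'}

    tmp_directory = tempfile.mkdtemp()
    file_map = {key: tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
    # e.g. {'strings-list': '/tmp/tmpab12cd/strings.txt', 'tei-initial': '/tmp/tmpab12cd/tei_initial.xml'}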
@@ -108,7 +108,7 @@ class Pipeline:
         output_file_name = self.file_map['obeliks-tokenised']
         with open(input_file_name, 'r') as input_file:
             input_conllu = input_file.read()
-        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
+        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
         output_conllu = tokeniser(input_conllu).to_conll()
         with open(output_file_name, 'w') as output_file:
             output_file.write(output_conllu)
@@ -220,3 +220,4 @@ class Pipeline:
     def cleanup(self):
         shutil.rmtree(self.tmp_directory, True)
+        shutil.rmtree(self.wani_directory, True)
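Both rmtree calls pass True as shutil.rmtree's second positional argument, ignore_errors, so removing a directory that has already disappeared does not raise. A small illustration with a throwaway directory:

    import shutil
    import tempfile

    tmp_directory = tempfile.mkdtemp()
    shutil.rmtree(tmp_directory, True)  # removes the directory
    shutil.rmtree(tmp_directory, True)  # already gone: ignore_errors swallows the error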