Redmine #1487: handled dependencies in setup.py and adjusted resource handling
This commit is contained in:
parent
d421cb3c03
commit
8409d2722f
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,3 +1,2 @@
|
|||
/venv
|
||||
/lib
|
||||
/resources
|
||||
__pycache__
|
||||
tmp
|
|
@ -2,7 +2,8 @@ import argparse
|
|||
|
||||
from structure_assignment.pipeline import Runner
|
||||
|
||||
resource_directory = '../resources'
|
||||
classla_directory = '../resources/classla'
|
||||
wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
|
||||
|
||||
if (__name__ == '__main__'):
|
||||
|
||||
|
@ -21,7 +22,7 @@ if (__name__ == '__main__'):
|
|||
output_structure_file_name = arguments.outstructs
|
||||
|
||||
nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
|
||||
runner = Runner(resource_directory, nlp_needed)
|
||||
runner = Runner(classla_directory, nlp_needed, wani_file_name)
|
||||
if (mode == 'strings_to_parse'):
|
||||
runner.strings_to_parse(input_file_name, output_file_name)
|
||||
elif (mode == 'strings_to_dictionary'):
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
cd ..
|
||||
|
||||
mkdir lib resources
|
||||
|
||||
## get dependencies
|
||||
cd lib
|
||||
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
|
||||
git clone git@gitea.cjvt.si:generic/xml_schemas.git
|
||||
cd ..
|
||||
|
||||
## prepare python environment
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install wheel
|
||||
pip install lxml
|
||||
pip install psycopg2cffi
|
||||
pip install sqlalchemy
|
||||
pip install classla
|
||||
python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
|
||||
pip install lib/luscenje_struktur/
|
||||
pip install git+https://git@gitea.cjvt.si/generic/conversion_utils.git#egg=conversion_utils
|
||||
pip install package/
|
||||
deactivate
|
||||
|
||||
## put needed resources in place
|
||||
cd resources
|
||||
ln -s ../lib/luscenje_struktur/wani.py .
|
||||
ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
|
||||
ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
|
||||
ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
|
||||
cd ..
|
11
setup.py
11
setup.py
|
@ -1,10 +1,17 @@
|
|||
from setuptools import setup
|
||||
|
||||
setup(name='structure_assignment',
|
||||
version='0.1',
|
||||
version='0.2',
|
||||
description='Pipeline for parsing and assigning structures to arbitrary Slovenian strings',
|
||||
url='',
|
||||
url='https://gitea.cjvt.si/generic/structure_assignment_pipeline',
|
||||
author='Cyprian Laskowski',
|
||||
author_email='cyp@cjvt.si',
|
||||
packages=['structure_assignment'],
|
||||
install_requires=['lxml',
|
||||
'classla',
|
||||
'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
|
||||
'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198', # TODO: switch to master once luscenje_struktur's i2198 is merged into master
|
||||
'psycopg2cffi', # TODO: remove once luscenje_struktur takes care of it
|
||||
'sqlalchemy', # TODO: remove once luscenje_struktur takes care of it
|
||||
],
|
||||
zip_safe=True)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from types import SimpleNamespace
|
||||
|
@ -16,14 +15,22 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
|
|||
|
||||
class Runner:
|
||||
|
||||
def __init__(self, resource_directory, nlp_needed):
|
||||
self.resource_directory = resource_directory
|
||||
def __init__(self, classla_directory, nlp_needed, wani=None):
|
||||
self.classla_directory = classla_directory
|
||||
if (nlp_needed):
|
||||
NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
|
||||
self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
|
||||
NLP_CONFIG_MAP['dir'] = classla_directory
|
||||
self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
|
||||
if (wani is not None):
|
||||
self._provide_wani(wani)
|
||||
|
||||
def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
|
||||
self.wani_directory = tempfile.mkdtemp()
|
||||
shutil.copy(wani_file_name, self.wani_directory)
|
||||
import sys
|
||||
sys.path.insert(0, self.wani_directory)
|
||||
|
||||
def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
||||
pipeline = Pipeline(self.nlp)
|
||||
pipeline.import_file(input_file_name, 'strings-list')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._strings_to_parse_sequence(pipeline)
|
||||
|
@ -35,7 +42,7 @@ class Runner:
|
|||
pipeline.cleanup()
|
||||
|
||||
def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
||||
pipeline = Pipeline(self.nlp)
|
||||
pipeline.import_file(input_file_name, 'strings-list')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._strings_to_parse_sequence(pipeline)
|
||||
|
@ -45,14 +52,14 @@ class Runner:
|
|||
pipeline.cleanup()
|
||||
|
||||
def strings_to_parse(self, input_file_name, output_file_name):
|
||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
||||
pipeline = Pipeline(self.nlp)
|
||||
pipeline.import_file(input_file_name, 'strings-list')
|
||||
self._strings_to_parse_sequence(pipeline)
|
||||
pipeline.export_file(output_file_name, 'tei-initial')
|
||||
pipeline.cleanup()
|
||||
|
||||
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory)
|
||||
pipeline = Pipeline()
|
||||
pipeline.import_file(input_file_name, 'tei-initial')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._parse_to_dictionary_sequence(pipeline)
|
||||
|
@ -61,13 +68,13 @@ class Runner:
|
|||
pipeline.cleanup()
|
||||
|
||||
def validate_structures(self, input_file_name):
|
||||
pipeline = Pipeline(self.resource_directory)
|
||||
pipeline = Pipeline()
|
||||
pipeline.import_file(input_file_name, 'structures-new')
|
||||
pipeline.do_validate_structures()
|
||||
pipeline.cleanup()
|
||||
|
||||
def validate_dictionary(self, input_file_name):
|
||||
pipeline = Pipeline(self.resource_directory)
|
||||
pipeline = Pipeline()
|
||||
pipeline.import_file(input_file_name, 'dictionary')
|
||||
pipeline.do_validate_dictionary()
|
||||
pipeline.cleanup()
|
||||
|
@ -87,17 +94,10 @@ class Runner:
|
|||
|
||||
class Pipeline:
|
||||
|
||||
def __init__(self, resource_directory, nlp=None):
|
||||
def __init__(self, nlp=None):
|
||||
self.nlp = nlp
|
||||
self.tmp_directory = tempfile.mkdtemp()
|
||||
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
|
||||
for resource_file_name in resource_file_names:
|
||||
if (os.path.isfile(resource_file_name)):
|
||||
shutil.copy(resource_file_name, self.tmp_directory)
|
||||
import sys
|
||||
sys.path.insert(0, self.tmp_directory)
|
||||
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
|
||||
self.classla_directory = resource_directory + '/classla'
|
||||
|
||||
def import_file(self, file_name, file_key):
|
||||
shutil.copyfile(file_name, self.file_map[file_key])
|
||||
|
@ -108,7 +108,7 @@ class Pipeline:
|
|||
output_file_name = self.file_map['obeliks-tokenised']
|
||||
with open(input_file_name, 'r') as input_file:
|
||||
input_conllu = input_file.read()
|
||||
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
|
||||
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
|
||||
output_conllu = tokeniser(input_conllu).to_conll()
|
||||
with open(output_file_name, 'w') as output_file:
|
||||
output_file.write(output_conllu)
|
||||
|
@ -220,3 +220,4 @@ class Pipeline:
|
|||
|
||||
def cleanup(self):
    """Delete the temporary directories associated with this object.

    Removal is best-effort: ``shutil.rmtree`` is called with
    ``ignore_errors=True``, so a directory that is already gone does
    not raise.
    """
    shutil.rmtree(self.tmp_directory, True)
    # wani_directory is assigned in Runner._provide_wani, not on this
    # class, so it is normally absent here; reading it unconditionally
    # raised AttributeError on every cleanup() call.
    wani_directory = getattr(self, 'wani_directory', None)
    if (wani_directory is not None):
        shutil.rmtree(wani_directory, True)
|
||||
|
|
Loading…
Reference in New Issue
Block a user