Redmine #1487: handled dependencies in setup.py and adjusted resource handling
This commit is contained in:
parent
d421cb3c03
commit
8409d2722f
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,3 +1,2 @@
|
||||||
/venv
|
__pycache__
|
||||||
/lib
|
tmp
|
||||||
/resources
|
|
|
@ -2,7 +2,8 @@ import argparse
|
||||||
|
|
||||||
from structure_assignment.pipeline import Runner
|
from structure_assignment.pipeline import Runner
|
||||||
|
|
||||||
resource_directory = '../resources'
|
classla_directory = '../resources/classla'
|
||||||
|
wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
|
||||||
|
|
||||||
if (__name__ == '__main__'):
|
if (__name__ == '__main__'):
|
||||||
|
|
||||||
|
@ -21,7 +22,7 @@ if (__name__ == '__main__'):
|
||||||
output_structure_file_name = arguments.outstructs
|
output_structure_file_name = arguments.outstructs
|
||||||
|
|
||||||
nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
|
nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
|
||||||
runner = Runner(resource_directory, nlp_needed)
|
runner = Runner(classla_directory, nlp_needed, wani_file_name)
|
||||||
if (mode == 'strings_to_parse'):
|
if (mode == 'strings_to_parse'):
|
||||||
runner.strings_to_parse(input_file_name, output_file_name)
|
runner.strings_to_parse(input_file_name, output_file_name)
|
||||||
elif (mode == 'strings_to_dictionary'):
|
elif (mode == 'strings_to_dictionary'):
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
#!/bin/bash

# Bootstrap a working copy: clone the dependency repositories into lib/,
# create a virtualenv with the required Python packages, and symlink the
# needed resource files into resources/.

# Work from the parent of the directory containing this script. Abort if a
# cd fails, so nothing below is created or cloned in the wrong place.
cd "$(dirname "$0")" || exit 1
cd .. || exit 1

# -p keeps a re-run from erroring when the directories already exist.
mkdir -p lib resources

## get dependencies
cd lib || exit 1
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
git clone git@gitea.cjvt.si:generic/xml_schemas.git
cd .. || exit 1

## prepare python environment
python3 -m venv venv
source venv/bin/activate
pip install wheel
pip install lxml
pip install psycopg2cffi
pip install sqlalchemy
pip install classla
python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
pip install lib/luscenje_struktur/
pip install git+https://git@gitea.cjvt.si/generic/conversion_utils.git#egg=conversion_utils
pip install package/
deactivate

## put needed resources in place
cd resources || exit 1
ln -s ../lib/luscenje_struktur/wani.py .
ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
cd .. || exit 1
|
|
11
setup.py
11
setup.py
|
@ -1,10 +1,17 @@
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
# Packages pip must install alongside structure_assignment.
INSTALL_REQUIRES = [
    'lxml',
    'classla',
    'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
    # TODO: switch to master once luscenje_struktur's i2198 is merged into master
    'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198',
    'psycopg2cffi',  # TODO: remove once luscenje_struktur takes care of it
    'sqlalchemy',  # TODO: remove once luscenje_struktur takes care of it
]

setup(
    name='structure_assignment',
    version='0.2',
    description='Pipeline for parsing and assigning structures to arbitrary Slovenian strings',
    url='https://gitea.cjvt.si/generic/structure_assignment_pipeline',
    author='Cyprian Laskowski',
    author_email='cyp@cjvt.si',
    packages=['structure_assignment'],
    install_requires=INSTALL_REQUIRES,
    zip_safe=True,
)
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import os
|
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
|
@ -16,14 +15,22 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
|
||||||
|
|
||||||
class Runner:
|
class Runner:
|
||||||
|
|
||||||
def __init__(self, classla_directory, nlp_needed, wani=None):
    """Set up the runner, optionally loading the classla NLP pipeline.

    Args:
        classla_directory: Directory handed to classla as its ``dir`` setting.
        nlp_needed: When true, a ``classla.Pipeline`` for 'sl' is built from
            ``NLP_CONFIG_MAP`` and stored as ``self.nlp``.
        wani: Optional path to a wani file; when given, it is passed to
            ``_provide_wani``.
    """
    self.classla_directory = classla_directory
    # Define both attributes unconditionally so later code can compare them
    # against None instead of failing with AttributeError.
    self.nlp = None
    self.wani_directory = None
    if nlp_needed:
        NLP_CONFIG_MAP['dir'] = classla_directory
        self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    if wani is not None:
        self._provide_wani(wani)
|
||||||
|
|
||||||
|
def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package
    """Make the given wani file importable through ``sys.path``.

    The file is copied into a freshly created temporary directory. That
    directory is then inserted at the front of ``sys.path`` and recorded as
    ``self.wani_directory``.

    Args:
        wani_file_name: Path of the wani file to copy.

    Raises:
        OSError: If the file cannot be copied. The temporary directory is
            removed before the error propagates, and neither ``sys.path``
            nor ``self.wani_directory`` is modified.
    """
    import sys
    wani_directory = tempfile.mkdtemp()
    try:
        shutil.copy(wani_file_name, wani_directory)
    except Exception:
        # Do not leave an orphaned temporary directory behind on failure.
        shutil.rmtree(wani_directory, True)
        raise
    self.wani_directory = wani_directory
    sys.path.insert(0, wani_directory)
|
||||||
|
|
||||||
def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
pipeline = Pipeline(self.nlp)
|
||||||
pipeline.import_file(input_file_name, 'strings-list')
|
pipeline.import_file(input_file_name, 'strings-list')
|
||||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||||
self._strings_to_parse_sequence(pipeline)
|
self._strings_to_parse_sequence(pipeline)
|
||||||
|
@ -35,7 +42,7 @@ class Runner:
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
pipeline = Pipeline(self.nlp)
|
||||||
pipeline.import_file(input_file_name, 'strings-list')
|
pipeline.import_file(input_file_name, 'strings-list')
|
||||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||||
self._strings_to_parse_sequence(pipeline)
|
self._strings_to_parse_sequence(pipeline)
|
||||||
|
@ -45,14 +52,14 @@ class Runner:
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def strings_to_parse(self, input_file_name, output_file_name):
    """Run the strings-to-parse sequence on one input file.

    The input is imported under the 'strings-list' key, processed by
    ``_strings_to_parse_sequence``, and the result registered under
    'tei-initial' is exported to ``output_file_name``.

    Args:
        input_file_name: Path of the file to import as 'strings-list'.
        output_file_name: Path the 'tei-initial' result is exported to.
    """
    pipeline = Pipeline(self.nlp)
    try:
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
    finally:
        # Release the pipeline's temporary directory even when a step fails.
        pipeline.cleanup()
|
||||||
|
|
||||||
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||||
pipeline = Pipeline(self.resource_directory)
|
pipeline = Pipeline()
|
||||||
pipeline.import_file(input_file_name, 'tei-initial')
|
pipeline.import_file(input_file_name, 'tei-initial')
|
||||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||||
self._parse_to_dictionary_sequence(pipeline)
|
self._parse_to_dictionary_sequence(pipeline)
|
||||||
|
@ -61,13 +68,13 @@ class Runner:
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def validate_structures(self, input_file_name):
    """Validate a structures file.

    The file is imported into a fresh pipeline under the 'structures-new'
    key and checked with ``Pipeline.do_validate_structures``.

    Args:
        input_file_name: Path of the structures file to validate.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
    finally:
        # Release the pipeline's temporary directory even when validation fails.
        pipeline.cleanup()
|
||||||
|
|
||||||
def validate_dictionary(self, input_file_name):
    """Validate a dictionary file.

    The file is imported into a fresh pipeline under the 'dictionary' key
    and checked with ``Pipeline.do_validate_dictionary``.

    Args:
        input_file_name: Path of the dictionary file to validate.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
    finally:
        # Release the pipeline's temporary directory even when validation fails.
        pipeline.cleanup()
|
||||||
|
@ -87,17 +94,10 @@ class Runner:
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
|
|
||||||
def __init__(self, nlp=None):
    """Create a pipeline with its own temporary working directory.

    Args:
        nlp: Optional NLP object, stored unchanged as ``self.nlp``.
    """
    self.nlp = nlp
    self.tmp_directory = tempfile.mkdtemp()
    # Map every FILE_MAP key to a path inside this pipeline's temporary directory.
    self.file_map = {}
    for file_key, base_name in FILE_MAP.items():
        self.file_map[file_key] = self.tmp_directory + '/' + base_name
||||||
|
|
||||||
def import_file(self, file_name, file_key):
    """Copy ``file_name`` to the working path registered under ``file_key``."""
    destination = self.file_map[file_key]
    shutil.copyfile(file_name, destination)
|
||||||
|
@ -108,7 +108,7 @@ class Pipeline:
|
||||||
output_file_name = self.file_map['obeliks-tokenised']
|
output_file_name = self.file_map['obeliks-tokenised']
|
||||||
with open(input_file_name, 'r') as input_file:
|
with open(input_file_name, 'r') as input_file:
|
||||||
input_conllu = input_file.read()
|
input_conllu = input_file.read()
|
||||||
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
|
tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
|
||||||
output_conllu = tokeniser(input_conllu).to_conll()
|
output_conllu = tokeniser(input_conllu).to_conll()
|
||||||
with open(output_file_name, 'w') as output_file:
|
with open(output_file_name, 'w') as output_file:
|
||||||
output_file.write(output_conllu)
|
output_file.write(output_conllu)
|
||||||
|
@ -220,3 +220,4 @@ class Pipeline:
|
||||||
|
|
||||||
def cleanup(self):
    """Remove this object's temporary directories, ignoring removal errors.

    ``self.tmp_directory`` is always removed. ``wani_directory`` is assigned
    in ``Runner._provide_wani`` rather than on the pipeline, so it is looked
    up defensively and removed only when present.
    """
    shutil.rmtree(self.tmp_directory, True)
    # A plain attribute access here would raise AttributeError on objects
    # that never had a wani directory assigned.
    wani_directory = getattr(self, 'wani_directory', None)
    if wani_directory is not None:
        shutil.rmtree(wani_directory, True)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user