diff --git a/.gitignore b/.gitignore
index ff1a1aa..1767514 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,2 @@
-/venv
-/lib
-/resources
\ No newline at end of file
+__pycache__
+tmp
\ No newline at end of file
diff --git a/scripts/process.py b/scripts/process.py
index d3c91e6..64d6f09 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -2,7 +2,8 @@ import argparse
 
 from structure_assignment.pipeline import Runner
 
-resource_directory = '../resources'
+classla_directory = '../resources/classla'
+wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
 
 if (__name__ == '__main__'):
 
@@ -21,7 +22,7 @@ if (__name__ == '__main__'):
     output_structure_file_name = arguments.outstructs
 
     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
-    runner = Runner(resource_directory, nlp_needed)
+    runner = Runner(classla_directory, nlp_needed, wani_file_name)
     if (mode == 'strings_to_parse'):
         runner.strings_to_parse(input_file_name, output_file_name)
     elif (mode == 'strings_to_dictionary'):
diff --git a/scripts/setup.sh b/scripts/setup.sh
deleted file mode 100755
index 5563043..0000000
--- a/scripts/setup.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-cd "$(dirname "$0")"
-cd ..
-
-mkdir lib resources
-
-## get dependencies
-cd lib
-git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
-git clone git@gitea.cjvt.si:generic/xml_schemas.git
-cd ..
-
-## prepare python environment
-python3 -m venv venv
-source venv/bin/activate
-pip install wheel
-pip install lxml
-pip install psycopg2cffi
-pip install sqlalchemy
-pip install classla
-python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
-pip install lib/luscenje_struktur/
-pip install git+https://git@gitea.cjvt.si/generic/conversion_utils.git#egg=conversion_utils
-pip install package/
-deactivate
-
-## put needed resources in place
-cd resources
-ln -s ../lib/luscenje_struktur/wani.py .
-ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
-ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
-ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
-cd ..
diff --git a/setup.py b/setup.py
index 7c98063..873d812 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,17 @@
 from setuptools import setup
 
 setup(name='structure_assignment',
-      version='0.1',
+      version='0.2',
       description='Pipeline for parsing and assigning structures to arbitrary Slovenian strings',
-      url='',
+      url='https://gitea.cjvt.si/generic/structure_assignment_pipeline',
       author='Cyprian Laskowski',
       author_email='cyp@cjvt.si',
       packages=['structure_assignment'],
+      install_requires=['lxml',
+                        'classla',
+                        'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
+                        'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198', # TODO: switch to master once luscenje_struktur's i2198 is merged into master
+                        'psycopg2cffi', # TODO: remove once luscenje_struktur takes care of it
+                        'sqlalchemy', # TODO: remove once luscenje_struktur takes care of it
+      ],
       zip_safe=True)
diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index ac1a27c..75134a7 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -1,4 +1,3 @@
-import os
 import shutil
 import tempfile
 from types import SimpleNamespace
@@ -16,14 +15,27 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
 
 class Runner:
 
-    def __init__(self, resource_directory, nlp_needed):
-        self.resource_directory = resource_directory
+    def __init__(self, classla_directory, nlp_needed, wani=None):
+        self.classla_directory = classla_directory
         if (nlp_needed):
-            NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
-            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+            NLP_CONFIG_MAP['dir'] = classla_directory
+            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+        if (wani is not None):
+            self._provide_wani(wani)
+
+    def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
+        # Copy the wani file into a private temporary directory and put that directory first on sys.path.
+        self.wani_directory = tempfile.mkdtemp()
+        shutil.copy(wani_file_name, self.wani_directory)
+        import atexit
+        import sys
+        sys.path.insert(0, self.wani_directory)
+        # The directory is owned by the Runner (Pipeline has no wani_directory attribute) and stays
+        # on sys.path for the rest of the process, so it is removed at interpreter exit instead.
+        atexit.register(shutil.rmtree, self.wani_directory, True)
 
     def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
@@ -35,7 +47,7 @@ class Runner:
         pipeline.cleanup()
 
     def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
@@ -45,14 +57,14 @@ class Runner:
         pipeline.cleanup()
 
     def strings_to_parse(self, input_file_name, output_file_name):
-        pipeline = Pipeline(self.resource_directory, self.nlp)
+        pipeline = Pipeline(self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
         self._strings_to_parse_sequence(pipeline)
         pipeline.export_file(output_file_name, 'tei-initial')
         pipeline.cleanup()
 
     def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'tei-initial')
         pipeline.import_file(input_structure_file_name, 'structures-old')
         self._parse_to_dictionary_sequence(pipeline)
@@ -61,13 +73,13 @@ class Runner:
         pipeline.cleanup()
 
     def validate_structures(self, input_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'structures-new')
         pipeline.do_validate_structures()
         pipeline.cleanup()
 
     def validate_dictionary(self, input_file_name):
-        pipeline = Pipeline(self.resource_directory)
+        pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'dictionary')
         pipeline.do_validate_dictionary()
         pipeline.cleanup()
@@ -87,17 +99,10 @@ class Runner:
 
 class Pipeline:
 
-    def __init__(self, resource_directory, nlp=None):
+    def __init__(self, nlp=None):
         self.nlp = nlp
         self.tmp_directory = tempfile.mkdtemp()
-        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
-        for resource_file_name in resource_file_names:
-            if (os.path.isfile(resource_file_name)):
-                shutil.copy(resource_file_name, self.tmp_directory)
-        import sys
-        sys.path.insert(0, self.tmp_directory)
         self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
-        self.classla_directory = resource_directory + '/classla'
 
     def import_file(self, file_name, file_key):
         shutil.copyfile(file_name, self.file_map[file_key])
@@ -108,7 +113,7 @@ class Pipeline:
         output_file_name = self.file_map['obeliks-tokenised']
         with open(input_file_name, 'r') as input_file:
             input_conllu = input_file.read()
-        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
+        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
         output_conllu = tokeniser(input_conllu).to_conll()
         with open(output_file_name, 'w') as output_file:
             output_file.write(output_conllu)