From 7651774914d1d4fdbdada5962009500fd6f35aab Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Tue, 12 Jan 2021 10:00:40 +0100 Subject: [PATCH] IssueID #1487: switched to python obeliks implementation --- .gitignore | 2 +- requirements.txt | 1 + scripts/constants.py | 1 - scripts/pipeline1.py | 6 +++--- scripts/setup.sh | 9 --------- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 492e6ae..6f03474 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ /venv -/Obeliks4J +/obeliks /data_admin /luscenje_struktur /nova_slovnica diff --git a/requirements.txt b/requirements.txt index 1c46303..c008586 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ future==0.18.2 idna==2.10 lxml==4.5.2 numpy==1.19.1 +obeliks==1.0.2 pkg-resources==0.0.0 protobuf==3.13.0 psycopg2cffi==2.8.1 diff --git a/scripts/constants.py b/scripts/constants.py index 2652869..96070ef 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -13,7 +13,6 @@ TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py' DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py' # resources -OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar' TRANSLATION_FILE_NAME = '../resources/dict.xml' CLASSLA_MODELS_DIRECTORY = '../resources/classla' STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml' diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py index 0cc791b..8cd2bb3 100644 --- a/scripts/pipeline1.py +++ b/scripts/pipeline1.py @@ -29,15 +29,15 @@ def run_pipeline(input_file_name, output_file_name): shutil.rmtree(TMP_DIRECTORY, True) os.makedirs(TMP_DIRECTORY, exist_ok=True) shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME) - run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME) + run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME) tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME) run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME) run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME) shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name) -def run_obeliks4J(list_file_name, conllu_file_name): +def run_obeliks(list_file_name, conllu_file_name): print('Running obeliks ...') - obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + list_file_name + ' -o ' + conllu_file_name + obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name]) os.system(obeliks_command) def tweak_conllu(input_file_name, output_file_name): diff --git a/scripts/setup.sh b/scripts/setup.sh index 68d6900..ee59aec 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -11,19 +11,11 @@ python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/cl deactivate ## get needed repositories -git clone https://github.com/clarinsi/Obeliks4J git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git git clone git@gitea.cjvt.si:generic/data_admin.git git clone git@gitea.cjvt.si:generic/xml_schemas.git -## set up obeliks -cd Obeliks4J -javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes -cp src/main/resources/* target/classes/org/obeliks/ -jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org -cd .. - ## put needed scripts in place cd scripts ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py . @@ -35,7 +27,6 @@ cd .. ## put needed resources in place cd resources -mv ../Obeliks4J/obeliks.jar . ln -s ../nova_slovnica/resources/dict.xml . ln -s ../data_admin/resources/structures.xml . ln -s ../data_admin/resources/structures.xsd .