From 8d5c8f55b5f6bcf57ad2794aa233e2d1d3b57279 Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski
Date: Thu, 17 Sep 2020 09:29:53 +0200
Subject: [PATCH] IssueID #1487: added basic script versions and directory structure

---
 .gitignore           |   5 ++
 requirements.txt     |  18 +++++
 resources/.gitignore |   4 ++
 scripts/.gitignore   |   3 +
 scripts/pipeline.py  | 155 +++++++++++++++++++++++++++++++++++++++++++
 scripts/setup.sh     |  38 +++++++++++
 tmp/.gitignore       |   2 +
 7 files changed, 225 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 resources/.gitignore
 create mode 100644 scripts/.gitignore
 create mode 100644 scripts/pipeline.py
 create mode 100755 scripts/setup.sh
 create mode 100644 tmp/.gitignore

diff --git a/.gitignore b/.gitignore
index e69de29..754b9ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/venv
+/Obeliks4J
+/data_admin
+/luscenje_struktur
+/nova_slovnica
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,18 @@
+certifi==2020.6.20
+cffi==1.14.2
+chardet==3.0.4
+classla==0.0.7
+future==0.18.2
+idna==2.10
+lxml==4.5.2
+numpy==1.19.1
+protobuf==3.13.0
+psycopg2cffi==2.8.1
+pycparser==2.20
+requests==2.24.0
+setuptools==39.0.1
+six==1.15.0
+SQLAlchemy==1.3.19
+torch==1.5.0
+tqdm==4.48.2
+urllib3==1.25.10
diff --git a/resources/.gitignore b/resources/.gitignore
new file mode 100644
index 0000000..6118611
--- /dev/null
+++ b/resources/.gitignore
@@ -0,0 +1,4 @@
+/classla
+/dict.xml
+/obeliks.jar
+/structures.xml
diff --git a/scripts/.gitignore b/scripts/.gitignore
new file mode 100644
index 0000000..1a40347
--- /dev/null
+++ b/scripts/.gitignore
@@ -0,0 +1,3 @@
+/assign_structures.py
+/conllu_to_xml.py
+/wani.py
diff --git a/scripts/pipeline.py b/scripts/pipeline.py
new file mode 100644
--- /dev/null
+++ b/scripts/pipeline.py
@@ -0,0 +1,155 @@
+"""Assign syntactic structure ids to a list of strings.
+
+Usage: python pipeline.py INPUT_FILE OUTPUT_FILE
+
+The input file is tokenized with Obeliks4J, annotated with classla,
+converted to TEI XML and searched for multi-word expressions; the result
+is then used to assign structure ids to the input strings.  Intermediate
+files are kept in TMP_DIRECTORY.
+
+All resource and helper-script paths are relative, so the pipeline has to
+be started from the scripts/ directory.
+"""
+
+import sys
+import os
+import shutil
+import codecs
+import re
+import subprocess
+
+import classla
+from classla import Document
+from classla.models.common.conll import CoNLLFile
+
+TMP_DIRECTORY = '../tmp/structure_assignment'
+STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt'
+OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
+OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu'
+OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu'
+CLASSLA_MODELS_DIRECTORY = '../resources/classla'
+CLASSLA_FILE_NAME = TMP_DIRECTORY + '/classla.conllu'
+CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
+TEI_FILE_NAME = TMP_DIRECTORY + '/tei.xml'
+TRANSLATION_FILE_NAME = '../resources/dict.xml'
+MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
+STRUCTURE_FILE_NAME = '../resources/structures.xml'
+MWE_CSV_FILE_NAME = TMP_DIRECTORY + '/mwes.csv'
+STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
+STRING_STRUCTURE_FILE_NAME = TMP_DIRECTORY + '/strings_with_structures.csv'
+
+NLP_CONFIG_MAP = {
+    'treebank': 'sl_ssj_jos',
+    'processors': 'tokenize,pos,lemma,depparse',
+    'tokenize_pretokenized': True,
+    'models_dir': CLASSLA_MODELS_DIRECTORY
+}
+
+XML_ID_PREFIX = 's'
+SENT_ID_REGEXP = re.compile(r'^(# sent_id = )(\d+\.\d+)$')
+
+# Run the helper scripts with the interpreter that runs this one, so that
+# they use the same virtual environment.
+PYTHON_EXECUTABLE = sys.executable or 'python'
+
+
+def run_command(arguments):
+    """Run an external command; raise CalledProcessError if it fails.
+
+    The arguments are passed as a list, without a shell, so that file
+    names containing spaces or shell metacharacters stay intact.
+    """
+    subprocess.run(arguments, check=True)
+
+
+def run_pipeline(input_file_name, output_file_name):
+    """Run all stages on input_file_name and write output_file_name."""
+    # Start from an empty working directory so that no intermediate file
+    # of an earlier run can be picked up by mistake.
+    shutil.rmtree(TMP_DIRECTORY, ignore_errors=True)
+    os.makedirs(TMP_DIRECTORY, exist_ok=True)
+    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
+    run_obeliks4J()
+    fix_xml_ids()
+    run_classla()
+    convert_to_tei()
+    run_mwe_extraction()
+    run_structure_assignment()
+    shutil.copyfile(STRING_STRUCTURE_FILE_NAME, output_file_name)
+
+
+def run_obeliks4J():
+    """Tokenize the string list with Obeliks4J into a CoNLL-U file."""
+    print('Running obeliks ...')
+    run_command(['java', '-jar', OBELIKS_JAR_FILE_NAME, '-d',
+                 '-if', STRING_LIST_FILE_NAME,
+                 '-o', OBELIKS_RAW_FILE_NAME])
+
+
+def fix_sent_id(line):
+    """Return line with XML_ID_PREFIX inserted before a numeric sent_id.
+
+    Lines that are not of the form '# sent_id = DIGITS.DIGITS' are
+    returned unchanged.
+    """
+    match = SENT_ID_REGEXP.search(line)
+    if match:
+        return match.group(1) + XML_ID_PREFIX + match.group(2) + '\n'
+    return line
+
+
+def fix_xml_ids():
+    """Copy the raw Obeliks output, prefixing every sentence id."""
+    print('Fixing xml ids ...')
+    # CoNLL-U files are UTF-8; do not depend on the locale's encoding.
+    with open(OBELIKS_RAW_FILE_NAME, 'r', encoding='utf-8') as input_file:
+        with open(OBELIKS_TWEAKED_FILE_NAME, 'w',
+                  encoding='utf-8') as output_file:
+            for line in input_file:
+                output_file.write(fix_sent_id(line))
+
+
+def run_classla():
+    """Annotate the pretokenized CoNLL-U file with classla."""
+    print('Running classla ...')
+    doc = Document(text=None)
+    conll_file = CoNLLFile(filename=OBELIKS_TWEAKED_FILE_NAME)
+    doc.conll_file = conll_file
+    nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
+    result = nlp(doc)
+    result.conll_file.write_conll(CLASSLA_FILE_NAME)
+
+
+def convert_to_tei():
+    """Convert the classla output to TEI XML with conllu_to_xml.py."""
+    print('Converting to tei ...')
+    run_command([PYTHON_EXECUTABLE, CONLLU_TEI_SCRIPT_NAME,
+                 CLASSLA_FILE_NAME, TEI_FILE_NAME,
+                 '--translate', TRANSLATION_FILE_NAME])
+
+
+def run_mwe_extraction():
+    """Extract multi-word expressions from the TEI file with wani.py."""
+    print('Extracting MWEs from tei ...')
+    run_command([PYTHON_EXECUTABLE, MWE_EXTRACTION_SCRIPT_NAME,
+                 STRUCTURE_FILE_NAME, TEI_FILE_NAME,
+                 '--all', MWE_CSV_FILE_NAME, '--skip-id-check'])
+
+
+def run_structure_assignment():
+    """Assign structure ids to the strings with assign_structures.py."""
+    print('Assign structure ids ...')
+    run_command([PYTHON_EXECUTABLE, STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
+                 STRING_LIST_FILE_NAME, TEI_FILE_NAME,
+                 MWE_CSV_FILE_NAME, STRING_STRUCTURE_FILE_NAME])
+
+
+def main():
+    """Run the pipeline on the files named on the command line."""
+    if len(sys.argv) < 3:
+        sys.exit('Usage: python pipeline.py INPUT_FILE OUTPUT_FILE')
+    run_pipeline(sys.argv[1], sys.argv[2])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/setup.sh b/scripts/setup.sh
new file mode 100755
index 0000000..501c3db
--- /dev/null
+++ b/scripts/setup.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+cd ..
+
+## set up virtual environment
+python3 -m venv venv
+source venv/bin/activate
+pip install wheel
+pip install -r requirements.txt
+python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/classla'
+deactivate
+
+## get needed repositories
+git clone https://github.com/clarinsi/Obeliks4J
+git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
+git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
+git clone git@gitea.cjvt.si:generic/data_admin.git # this doesn't include structures.xml yet
+
+## set up obeliks
+cd Obeliks4J
+javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
+cp src/main/resources/* target/classes/org/obeliks/
+jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
+cd ..
+
+## put needed scripts in place
+cd scripts
+ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
+ln -s ../nova_slovnica/python/scripts/assign_structures.py .
+ln -s ../luscenje_struktur/src/wani.py .
+cd ..
+
+## put needed resources in place
+cd resources
+mv ../Obeliks4J/obeliks.jar .
+ln -s ../nova_slovnica/resources/dict.xml .
+ln -s ../data_admin/resources/structures.xml .
+cd ..
diff --git a/tmp/.gitignore b/tmp/.gitignore
new file mode 100644
index 0000000..c96a04f
--- /dev/null
+++ b/tmp/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file