IssueID #1487: added basic script versions and directory structure
This commit is contained in:
parent
681ace4873
commit
8d5c8f55b5
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -0,0 +1,5 @@
|
|||
/venv
|
||||
/Obeliks4J
|
||||
/data_admin
|
||||
/luscenje_struktur
|
||||
/nova_slovnica
|
19
requirements.txt
Normal file
19
requirements.txt
Normal file
|
@ -0,0 +1,19 @@
|
|||
certifi==2020.6.20
|
||||
cffi==1.14.2
|
||||
chardet==3.0.4
|
||||
classla==0.0.7
|
||||
future==0.18.2
|
||||
idna==2.10
|
||||
lxml==4.5.2
|
||||
numpy==1.19.1
|
||||
# pkg-resources==0.0.0  # pip-freeze artifact on Debian/Ubuntu; not installable from PyPI
|
||||
protobuf==3.13.0
|
||||
psycopg2cffi==2.8.1
|
||||
pycparser==2.20
|
||||
requests==2.24.0
|
||||
setuptools==39.0.1
|
||||
six==1.15.0
|
||||
SQLAlchemy==1.3.19
|
||||
torch==1.5.0
|
||||
tqdm==4.48.2
|
||||
urllib3==1.25.10
|
4
resources/.gitignore
vendored
Normal file
4
resources/.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
/classla
|
||||
/dict.xml
|
||||
/obeliks.jar
|
||||
/structures.xml
|
3
scripts/.gitignore
vendored
Normal file
3
scripts/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
/assign_structures.py
|
||||
/conllu_to_xml.py
|
||||
/wani.py
|
93
scripts/pipeline.py
Normal file
93
scripts/pipeline.py
Normal file
|
@ -0,0 +1,93 @@
|
|||
import sys
|
||||
import os
|
||||
import shutil
|
||||
import codecs
|
||||
import re
|
||||
|
||||
import classla
|
||||
from classla import Document
|
||||
from classla.models.common.conll import CoNLLFile
|
||||
|
||||
# Command-line arguments: the input file (copied into the temporary directory
# as the string list) and the output file (receives the final result).
# Fail with a usage message instead of a bare IndexError when they are missing.
if len(sys.argv) < 3:
    sys.exit('Usage: python pipeline.py <input_file> <output_file>')
input_file_name = sys.argv[1]
output_file_name = sys.argv[2]
|
||||
|
||||
# Working directory for all intermediate files; run_pipeline() empties and
# recreates it on every run. All paths are relative to the current directory.
TMP_DIRECTORY = '../tmp/structure_assignment'

# Copy of the input file.
STRING_LIST_FILE_NAME = f'{TMP_DIRECTORY}/strings.txt'

# Obeliks stage: the jar and its raw / id-fixed CoNLL-U outputs.
OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
OBELIKS_RAW_FILE_NAME = f'{TMP_DIRECTORY}/obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = f'{TMP_DIRECTORY}/obeliks_tweaked.conllu'

# classla stage: model directory and CoNLL-U output.
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
CLASSLA_FILE_NAME = f'{TMP_DIRECTORY}/classla.conllu'

# TEI conversion stage: script, output file and the file given to --translate.
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
TEI_FILE_NAME = f'{TMP_DIRECTORY}/tei.xml'
TRANSLATION_FILE_NAME = '../resources/dict.xml'

# MWE extraction stage: script, structure definitions and CSV output.
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_FILE_NAME = '../resources/structures.xml'
MWE_CSV_FILE_NAME = f'{TMP_DIRECTORY}/mwes.csv'

# Structure assignment stage: script and the final pipeline result.
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRING_STRUCTURE_FILE_NAME = f'{TMP_DIRECTORY}/strings_with_structures.csv'

# Keyword arguments passed to classla.Pipeline in run_classla().
NLP_CONFIG_MAP = dict(
    treebank='sl_ssj_jos',
    processors='tokenize,pos,lemma,depparse',
    tokenize_pretokenized=True,
    models_dir=CLASSLA_MODELS_DIRECTORY,
)

# Prefix inserted before numeric sentence ids by fix_xml_ids().
XML_ID_PREFIX = 's'
|
||||
|
||||
def run_pipeline(input_file_name, output_file_name):
    """Run all pipeline stages on the input file and store the final result.

    The temporary directory is removed and recreated, the input file is
    copied into it as STRING_LIST_FILE_NAME, the stages run in order, and
    STRING_STRUCTURE_FILE_NAME is finally copied to the output location.

    Args:
        input_file_name: path of the file to process.
        output_file_name: path that receives the pipeline result.
    """
    # Start from an empty working directory; removal errors are ignored.
    shutil.rmtree(TMP_DIRECTORY, ignore_errors=True)
    os.makedirs(TMP_DIRECTORY, exist_ok=True)
    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)

    # Each stage reads the files written by the previous one.
    stages = (
        run_obeliks4J,
        fix_xml_ids,
        run_classla,
        convert_to_tei,
        run_mwe_extraction,
        run_structure_assignment,
    )
    for stage in stages:
        stage()

    shutil.copyfile(STRING_STRUCTURE_FILE_NAME, output_file_name)
|
||||
|
||||
def run_obeliks4J():
    """Run the Obeliks jar on the string list.

    Invokes ``java -jar`` on OBELIKS_JAR_FILE_NAME with the options
    ``-d -if STRING_LIST_FILE_NAME -o OBELIKS_RAW_FILE_NAME``.

    Raises:
        subprocess.CalledProcessError: if the java process exits with a
            non-zero status.
    """
    import subprocess  # local import keeps this function self-contained

    print('Running obeliks ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True stops the pipeline on failure
    # instead of continuing with a missing output file.
    obeliks_command = [
        'java', '-jar', OBELIKS_JAR_FILE_NAME,
        '-d',
        '-if', STRING_LIST_FILE_NAME,
        '-o', OBELIKS_RAW_FILE_NAME,
    ]
    subprocess.run(obeliks_command, check=True)
|
||||
|
||||
def fix_xml_ids():
    """Prefix numeric sentence ids in the Obeliks output with XML_ID_PREFIX.

    Copies OBELIKS_RAW_FILE_NAME to OBELIKS_TWEAKED_FILE_NAME line by line.
    Lines of the form ``# sent_id = <digits>.<digits>`` get XML_ID_PREFIX
    inserted in front of the id; every other line is written unchanged.
    """
    print('Fixing xml ids ...')
    # NOTE(review): presumably needed because XML ids may not start with a
    # digit -- confirm against the TEI conversion script.
    sent_id_pattern = re.compile(r'^(# sent_id = )(\d+\.\d+)$')
    # CoNLL-U files are UTF-8; an explicit encoding avoids depending on the
    # locale, and the with-block closes both files even if an error occurs.
    with open(OBELIKS_RAW_FILE_NAME, 'r', encoding='utf-8') as input_file, \
            open(OBELIKS_TWEAKED_FILE_NAME, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            match = sent_id_pattern.search(line)
            if match:
                line = match.group(1) + XML_ID_PREFIX + match.group(2) + '\n'
            output_file.write(line)
|
||||
|
||||
def run_classla():
    """Process the id-fixed Obeliks output with classla.

    Loads OBELIKS_TWEAKED_FILE_NAME as a CoNLL file, attaches it to an empty
    classla document, runs the 'sl' pipeline configured by NLP_CONFIG_MAP on
    that document and writes the resulting CoNLL data to CLASSLA_FILE_NAME.
    """
    print('Running classla ...')
    # The document carries no raw text; its content comes from the CoNLL file.
    document = Document(text=None)
    document.conll_file = CoNLLFile(filename=OBELIKS_TWEAKED_FILE_NAME)
    pipeline = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    annotated = pipeline(document)
    annotated.conll_file.write_conll(CLASSLA_FILE_NAME)
|
||||
|
||||
def convert_to_tei():
    """Run the CoNLL-U to TEI conversion script.

    Invokes CONLLU_TEI_SCRIPT_NAME with ``python``, passing CLASSLA_FILE_NAME
    and TEI_FILE_NAME as positional arguments and TRANSLATION_FILE_NAME via
    ``--translate``.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this function self-contained

    print('Converting to tei ...')
    # Argument list without a shell; check=True stops the pipeline on failure.
    convert_command = [
        'python', CONLLU_TEI_SCRIPT_NAME,
        CLASSLA_FILE_NAME,
        TEI_FILE_NAME,
        '--translate', TRANSLATION_FILE_NAME,
    ]
    subprocess.run(convert_command, check=True)
|
||||
|
||||
def run_mwe_extraction():
    """Run the MWE extraction script on the TEI file.

    Invokes MWE_EXTRACTION_SCRIPT_NAME with ``python``, passing
    STRUCTURE_FILE_NAME and TEI_FILE_NAME as positional arguments,
    MWE_CSV_FILE_NAME via ``--all``, and the ``--skip-id-check`` flag.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this function self-contained

    print('Extracting MWEs from tei ...')
    # Argument list without a shell; check=True stops the pipeline on failure.
    extraction_command = [
        'python', MWE_EXTRACTION_SCRIPT_NAME,
        STRUCTURE_FILE_NAME,
        TEI_FILE_NAME,
        '--all', MWE_CSV_FILE_NAME,
        '--skip-id-check',
    ]
    subprocess.run(extraction_command, check=True)
|
||||
|
||||
def run_structure_assignment():
    """Run the structure assignment script.

    Invokes STRUCTURE_ASSIGNMENT_SCRIPT_NAME with ``python``, passing
    STRING_LIST_FILE_NAME, TEI_FILE_NAME, MWE_CSV_FILE_NAME and
    STRING_STRUCTURE_FILE_NAME as positional arguments, in that order.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this function self-contained

    print('Assign structure ids ...')
    # Argument list without a shell; check=True stops the pipeline on failure.
    assignment_command = [
        'python', STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
        STRING_LIST_FILE_NAME,
        TEI_FILE_NAME,
        MWE_CSV_FILE_NAME,
        STRING_STRUCTURE_FILE_NAME,
    ]
    subprocess.run(assignment_command, check=True)
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    run_pipeline(input_file_name, output_file_name)
|
38
scripts/setup.sh
Executable file
38
scripts/setup.sh
Executable file
|
@ -0,0 +1,38 @@
|
|||
#!/bin/bash
#
# One-time project setup: creates the virtual environment, downloads the
# classla models, clones the needed repositories, builds obeliks.jar and
# links the needed scripts and resources into place.

# Stop at the first failing step so later steps never run on a broken state.
set -e

# Work from the project root (the parent of this script's directory),
# independent of the directory the script was started from.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

## set up virtual environment
python3 -m venv venv
source venv/bin/activate
pip install wheel
pip install -r requirements.txt
# The here-string feeds two lines ("Y" and the target directory) to the
# interactive prompts of classla.download.
python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/classla'
deactivate

## get needed repositories (existing clones are kept so the script can be rerun)
[ -d Obeliks4J ] || git clone https://github.com/clarinsi/Obeliks4J
[ -d nova_slovnica ] || git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
[ -d luscenje_struktur ] || git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
[ -d data_admin ] || git clone git@gitea.cjvt.si:generic/data_admin.git # this doesn't include structures.xml yet

## set up obeliks
cd Obeliks4J
# Older javac versions do not create the -d directory themselves.
mkdir -p target/classes
javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
cp src/main/resources/* target/classes/org/obeliks/
jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
cd ..

## put needed scripts in place (-f replaces links left by an earlier run)
cd scripts
ln -sf ../nova_slovnica/python/scripts/conllu_to_xml.py .
ln -sf ../nova_slovnica/python/scripts/assign_structures.py .
ln -sf ../luscenje_struktur/src/wani.py .
cd ..

## put needed resources in place
cd resources
mv ../Obeliks4J/obeliks.jar .
ln -sf ../nova_slovnica/resources/dict.xml .
ln -sf ../data_admin/resources/structures.xml .
cd ..
|
2
tmp/.gitignore
vendored
Normal file
2
tmp/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
*
|
||||
!.gitignore
|
Loading…
Reference in New Issue
Block a user