IssueID #1487: added basic script versions and directory structure
This commit is contained in:
parent
681ace4873
commit
8d5c8f55b5
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -0,0 +1,5 @@
|
||||||
|
/venv
|
||||||
|
/Obeliks4J
|
||||||
|
/data_admin
|
||||||
|
/luscenje_struktur
|
||||||
|
/nova_slovnica
|
19
requirements.txt
Normal file
19
requirements.txt
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
certifi==2020.6.20
|
||||||
|
cffi==1.14.2
|
||||||
|
chardet==3.0.4
|
||||||
|
classla==0.0.7
|
||||||
|
future==0.18.2
|
||||||
|
idna==2.10
|
||||||
|
lxml==4.5.2
|
||||||
|
numpy==1.19.1
|
||||||
|
# pkg-resources==0.0.0  # pip-freeze artifact of Debian/Ubuntu venvs; not installable from PyPI
|
||||||
|
protobuf==3.13.0
|
||||||
|
psycopg2cffi==2.8.1
|
||||||
|
pycparser==2.20
|
||||||
|
requests==2.24.0
|
||||||
|
setuptools==39.0.1
|
||||||
|
six==1.15.0
|
||||||
|
SQLAlchemy==1.3.19
|
||||||
|
torch==1.5.0
|
||||||
|
tqdm==4.48.2
|
||||||
|
urllib3==1.25.10
|
4
resources/.gitignore
vendored
Normal file
4
resources/.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
/classla
|
||||||
|
/dict.xml
|
||||||
|
/obeliks.jar
|
||||||
|
/structures.xml
|
3
scripts/.gitignore
vendored
Normal file
3
scripts/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
/assign_structures.py
|
||||||
|
/conllu_to_xml.py
|
||||||
|
/wani.py
|
93
scripts/pipeline.py
Normal file
93
scripts/pipeline.py
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import codecs
|
||||||
|
import re
|
||||||
|
|
||||||
|
import classla
|
||||||
|
from classla import Document
|
||||||
|
from classla.models.common.conll import CoNLLFile
|
||||||
|
|
||||||
|
# Command-line arguments: the input file is copied into the scratch directory
# as the string list, and the final strings-with-structures CSV is copied to
# the output file.
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit('Usage: python pipeline.py <input_file> <output_file>')
input_file_name = sys.argv[1]
output_file_name = sys.argv[2]
|
||||||
|
|
||||||
|
# All paths below are relative to the current working directory.

# Scratch directory holding the intermediate files the stages hand to one
# another.
TMP_DIRECTORY = '../tmp/structure_assignment'
STRING_LIST_FILE_NAME = f'{TMP_DIRECTORY}/strings.txt'
OBELIKS_RAW_FILE_NAME = f'{TMP_DIRECTORY}/obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = f'{TMP_DIRECTORY}/obeliks_tweaked.conllu'
CLASSLA_FILE_NAME = f'{TMP_DIRECTORY}/classla.conllu'
TEI_FILE_NAME = f'{TMP_DIRECTORY}/tei.xml'
MWE_CSV_FILE_NAME = f'{TMP_DIRECTORY}/mwes.csv'
STRING_STRUCTURE_FILE_NAME = f'{TMP_DIRECTORY}/strings_with_structures.csv'

# External resources the stages read.
OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
TRANSLATION_FILE_NAME = '../resources/dict.xml'
STRUCTURE_FILE_NAME = '../resources/structures.xml'

# Helper scripts invoked as separate Python processes.
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'

# Keyword arguments passed to classla.Pipeline.
NLP_CONFIG_MAP = dict(
    treebank='sl_ssj_jos',
    processors='tokenize,pos,lemma,depparse',
    tokenize_pretokenized=True,
    models_dir=CLASSLA_MODELS_DIRECTORY,
)

# Prefix inserted in front of the numeric sentence ids in the Obeliks output.
XML_ID_PREFIX = 's'
|
||||||
|
|
||||||
|
def run_pipeline(input_file_name, output_file_name):
    """Run every pipeline stage in order, from input file to output file.

    The scratch directory is emptied and recreated, the input file is copied
    into it as the string list, the stages run one after another, and the
    resulting strings-with-structures file is copied to ``output_file_name``.
    """
    # Start from a clean scratch directory; a missing one is not an error.
    shutil.rmtree(TMP_DIRECTORY, ignore_errors=True)
    os.makedirs(TMP_DIRECTORY, exist_ok=True)
    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)

    # Each stage reads the files written by the previous one, so order matters.
    stages = (
        run_obeliks4J,
        fix_xml_ids,
        run_classla,
        convert_to_tei,
        run_mwe_extraction,
        run_structure_assignment,
    )
    for stage in stages:
        stage()

    shutil.copyfile(STRING_STRUCTURE_FILE_NAME, output_file_name)
|
||||||
|
|
||||||
|
def run_obeliks4J():
    """Run the Obeliks jar on the string list, producing the raw CoNLL-U file.

    Reads ``STRING_LIST_FILE_NAME`` and writes ``OBELIKS_RAW_FILE_NAME``.

    Raises:
        FileNotFoundError: if the ``java`` executable cannot be found.
        subprocess.CalledProcessError: if Obeliks exits with a non-zero status.
    """
    import subprocess

    print('Running obeliks ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True stops the pipeline on failure instead
    # of letting later stages run on missing output.
    obeliks_command = [
        'java', '-jar', OBELIKS_JAR_FILE_NAME,
        '-d',
        '-if', STRING_LIST_FILE_NAME,
        '-o', OBELIKS_RAW_FILE_NAME,
    ]
    subprocess.run(obeliks_command, check=True)
|
||||||
|
|
||||||
|
def fix_xml_ids(raw_file_name=None, tweaked_file_name=None, id_prefix=None):
    """Copy a CoNLL-U file, prefixing every numeric ``# sent_id`` value.

    A line of the form ``# sent_id = 1.2`` becomes ``# sent_id = s1.2`` (with
    the default prefix); every other line is copied unchanged.

    Args:
        raw_file_name: file to read; defaults to ``OBELIKS_RAW_FILE_NAME``.
        tweaked_file_name: file to write; defaults to
            ``OBELIKS_TWEAKED_FILE_NAME``.
        id_prefix: text inserted before the id; defaults to ``XML_ID_PREFIX``.

    Raises:
        OSError: if either file cannot be opened.
    """
    if raw_file_name is None:
        raw_file_name = OBELIKS_RAW_FILE_NAME
    if tweaked_file_name is None:
        tweaked_file_name = OBELIKS_TWEAKED_FILE_NAME
    if id_prefix is None:
        id_prefix = XML_ID_PREFIX

    print('Fixing xml ids ...')
    sent_id_pattern = re.compile(r'^(# sent_id = )(\d+\.\d+)$')
    # CoNLL-U is UTF-8; say so explicitly so the result does not depend on
    # the locale. The with-block closes both files even if an error occurs.
    with open(raw_file_name, 'r', encoding='utf-8') as input_file, \
            open(tweaked_file_name, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            match = sent_id_pattern.search(line)
            if match:
                # '$' matches before the trailing newline, so add it back.
                line = match.group(1) + id_prefix + match.group(2) + '\n'
            output_file.write(line)
|
||||||
|
|
||||||
|
def run_classla():
    """Process the id-fixed Obeliks output with classla and save the result.

    Reads ``OBELIKS_TWEAKED_FILE_NAME``, runs a classla pipeline configured
    by ``NLP_CONFIG_MAP`` over it, and writes ``CLASSLA_FILE_NAME``.
    """
    print('Running classla ...')
    # The document gets its content from the CoNLL-U file, not from raw text.
    document = Document(text=None)
    document.conll_file = CoNLLFile(filename=OBELIKS_TWEAKED_FILE_NAME)
    pipeline = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    annotated = pipeline(document)
    annotated.conll_file.write_conll(CLASSLA_FILE_NAME)
|
||||||
|
|
||||||
|
def convert_to_tei():
    """Run the CoNLL-U-to-XML script on the classla output.

    Invokes ``CONLLU_TEI_SCRIPT_NAME`` with ``CLASSLA_FILE_NAME`` and
    ``TEI_FILE_NAME`` as positional arguments and ``TRANSLATION_FILE_NAME``
    as the value of ``--translate``.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess

    print('Converting to tei ...')
    # sys.executable runs the script with the interpreter (and installed
    # packages) of this process rather than whatever "python" is on PATH.
    # check=True stops the pipeline if the conversion fails.
    convert_command = [
        sys.executable, CONLLU_TEI_SCRIPT_NAME,
        CLASSLA_FILE_NAME, TEI_FILE_NAME,
        '--translate', TRANSLATION_FILE_NAME,
    ]
    subprocess.run(convert_command, check=True)
|
||||||
|
|
||||||
|
def run_mwe_extraction():
    """Run the MWE extraction script on the TEI file.

    Invokes ``MWE_EXTRACTION_SCRIPT_NAME`` with ``STRUCTURE_FILE_NAME`` and
    ``TEI_FILE_NAME`` as positional arguments, ``MWE_CSV_FILE_NAME`` as the
    value of ``--all``, and the ``--skip-id-check`` flag.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess

    print('Extracting MWEs from tei ...')
    # Argument list + check=True: no shell quoting issues, and a failed
    # extraction aborts the pipeline instead of being silently ignored.
    extraction_command = [
        sys.executable, MWE_EXTRACTION_SCRIPT_NAME,
        STRUCTURE_FILE_NAME, TEI_FILE_NAME,
        '--all', MWE_CSV_FILE_NAME,
        '--skip-id-check',
    ]
    subprocess.run(extraction_command, check=True)
|
||||||
|
|
||||||
|
def run_structure_assignment():
    """Run the structure assignment script to produce the final CSV.

    Invokes ``STRUCTURE_ASSIGNMENT_SCRIPT_NAME`` with, in order,
    ``STRING_LIST_FILE_NAME``, ``TEI_FILE_NAME``, ``MWE_CSV_FILE_NAME`` and
    ``STRING_STRUCTURE_FILE_NAME``.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    import subprocess

    print('Assign structure ids ...')
    # Argument list + check=True: no shell quoting issues, and a failed
    # assignment aborts the pipeline instead of being silently ignored.
    assignment_command = [
        sys.executable, STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
        STRING_LIST_FILE_NAME, TEI_FILE_NAME,
        MWE_CSV_FILE_NAME, STRING_STRUCTURE_FILE_NAME,
    ]
    subprocess.run(assignment_command, check=True)
|
||||||
|
|
||||||
|
# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    run_pipeline(input_file_name, output_file_name)
|
38
scripts/setup.sh
Executable file
38
scripts/setup.sh
Executable file
|
@ -0,0 +1,38 @@
|
||||||
|
#!/bin/bash
#
# One-time project setup: creates the virtual environment, downloads the
# classla models, clones the needed repositories, builds obeliks.jar and
# links the helper scripts and resources into place.
# Needs python3, git, a JDK (javac, jar) and SSH access to gitea.cjvt.si.

# Abort on the first failing step instead of continuing with a half-built tree.
set -e

# Work from the repository root regardless of the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

## set up virtual environment
python3 -m venv venv
source venv/bin/activate
pip install wheel
pip install -r requirements.txt
# The here-string feeds two lines to classla.download() on stdin; presumably
# a confirmation and the target models directory -- verify against classla.
python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/classla'
deactivate

## get needed repositories (skipped when already cloned, so re-runs work)
[ -d Obeliks4J ] || git clone https://github.com/clarinsi/Obeliks4J
[ -d nova_slovnica ] || git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
[ -d luscenje_struktur ] || git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
[ -d data_admin ] || git clone git@gitea.cjvt.si:generic/data_admin.git # this doesn't include structures.xml yet

## set up obeliks
cd Obeliks4J
# Older javac versions do not create the -d target directory themselves.
mkdir -p target/classes
javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
cp src/main/resources/* target/classes/org/obeliks/
jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
cd ..

## put needed scripts in place (-f replaces links left by an earlier run)
cd scripts
ln -sf ../nova_slovnica/python/scripts/conllu_to_xml.py .
ln -sf ../nova_slovnica/python/scripts/assign_structures.py .
ln -sf ../luscenje_struktur/src/wani.py .
cd ..

## put needed resources in place
cd resources
mv ../Obeliks4J/obeliks.jar .
ln -sf ../nova_slovnica/resources/dict.xml .
ln -sf ../data_admin/resources/structures.xml .
cd ..
|
2
tmp/.gitignore
vendored
Normal file
2
tmp/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
*
|
||||||
|
!.gitignore
|
Loading…
Reference in New Issue
Block a user