IssueID #1487: switched to Python obeliks implementation

This commit is contained in:
Cyprian Laskowski 2021-01-12 10:00:40 +01:00
parent f7900c84e5
commit 7651774914
5 changed files with 5 additions and 14 deletions

2
.gitignore vendored
View File

@@ -1,5 +1,5 @@
 /venv
-/Obeliks4J
+/obeliks
 /data_admin
 /luscenje_struktur
 /nova_slovnica

View File

@@ -6,6 +6,7 @@ future==0.18.2
 idna==2.10
 lxml==4.5.2
 numpy==1.19.1
+obeliks==1.0.2
 pkg-resources==0.0.0
 protobuf==3.13.0
 psycopg2cffi==2.8.1

View File

@@ -13,7 +13,6 @@ TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
 DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
 # resources
-OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
 TRANSLATION_FILE_NAME = '../resources/dict.xml'
 CLASSLA_MODELS_DIRECTORY = '../resources/classla'
 STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'

View File

@@ -29,15 +29,15 @@ def run_pipeline(input_file_name, output_file_name):
 shutil.rmtree(TMP_DIRECTORY, True)
 os.makedirs(TMP_DIRECTORY, exist_ok=True)
 shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
-run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
+run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
 tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
 run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
 run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
 shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
-def run_obeliks4J(list_file_name, conllu_file_name):
+def run_obeliks(list_file_name, conllu_file_name):
 print('Running obeliks ...')
-obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + list_file_name + ' -o ' + conllu_file_name
+obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name])
 os.system(obeliks_command)
 def tweak_conllu(input_file_name, output_file_name):

View File

@@ -11,19 +11,11 @@ python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/cl
 deactivate
 ## get needed repositories
-git clone https://github.com/clarinsi/Obeliks4J
 git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
 git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
 git clone git@gitea.cjvt.si:generic/data_admin.git
 git clone git@gitea.cjvt.si:generic/xml_schemas.git
-## set up obeliks
-cd Obeliks4J
-javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
-cp src/main/resources/* target/classes/org/obeliks/
-jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
-cd ..
 ## put needed scripts in place
 cd scripts
 ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
@@ -35,7 +27,6 @@ cd ..
 ## put needed resources in place
 cd resources
-mv ../Obeliks4J/obeliks.jar .
 ln -s ../nova_slovnica/resources/dict.xml .
 ln -s ../data_admin/resources/structures.xml .
 ln -s ../data_admin/resources/structures.xsd .