IssueID #1487: switched to Python obeliks implementation
This commit is contained in:
parent
f7900c84e5
commit
7651774914
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,5 +1,5 @@
|
||||||
/venv
|
/venv
|
||||||
/Obeliks4J
|
/obeliks
|
||||||
/data_admin
|
/data_admin
|
||||||
/luscenje_struktur
|
/luscenje_struktur
|
||||||
/nova_slovnica
|
/nova_slovnica
|
||||||
|
|
|
@ -6,6 +6,7 @@ future==0.18.2
|
||||||
idna==2.10
|
idna==2.10
|
||||||
lxml==4.5.2
|
lxml==4.5.2
|
||||||
numpy==1.19.1
|
numpy==1.19.1
|
||||||
|
obeliks==1.0.2
|
||||||
pkg-resources==0.0.0
|
pkg-resources==0.0.0
|
||||||
protobuf==3.13.0
|
protobuf==3.13.0
|
||||||
psycopg2cffi==2.8.1
|
psycopg2cffi==2.8.1
|
||||||
|
|
|
@ -13,7 +13,6 @@ TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
|
||||||
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
||||||
|
|
||||||
# resources
|
# resources
|
||||||
OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
|
|
||||||
TRANSLATION_FILE_NAME = '../resources/dict.xml'
|
TRANSLATION_FILE_NAME = '../resources/dict.xml'
|
||||||
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
|
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
|
||||||
STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'
|
STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'
|
||||||
|
|
|
@ -29,15 +29,15 @@ def run_pipeline(input_file_name, output_file_name):
|
||||||
shutil.rmtree(TMP_DIRECTORY, True)
|
shutil.rmtree(TMP_DIRECTORY, True)
|
||||||
os.makedirs(TMP_DIRECTORY, exist_ok=True)
|
os.makedirs(TMP_DIRECTORY, exist_ok=True)
|
||||||
shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
|
shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
|
||||||
run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
|
run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
|
||||||
tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
|
tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
|
||||||
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
|
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
|
||||||
run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
|
run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
|
||||||
shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
|
shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
|
||||||
|
|
||||||
def run_obeliks4J(list_file_name, conllu_file_name):
|
def run_obeliks(list_file_name, conllu_file_name):
|
||||||
print('Running obeliks ...')
|
print('Running obeliks ...')
|
||||||
obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + list_file_name + ' -o ' + conllu_file_name
|
obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name])
|
||||||
os.system(obeliks_command)
|
os.system(obeliks_command)
|
||||||
|
|
||||||
def tweak_conllu(input_file_name, output_file_name):
|
def tweak_conllu(input_file_name, output_file_name):
|
||||||
|
|
|
@ -11,19 +11,11 @@ python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/cl
|
||||||
deactivate
|
deactivate
|
||||||
|
|
||||||
## get needed repositories
|
## get needed repositories
|
||||||
git clone https://github.com/clarinsi/Obeliks4J
|
|
||||||
git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
|
git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
|
||||||
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
|
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
|
||||||
git clone git@gitea.cjvt.si:generic/data_admin.git
|
git clone git@gitea.cjvt.si:generic/data_admin.git
|
||||||
git clone git@gitea.cjvt.si:generic/xml_schemas.git
|
git clone git@gitea.cjvt.si:generic/xml_schemas.git
|
||||||
|
|
||||||
## set up obeliks
|
|
||||||
cd Obeliks4J
|
|
||||||
javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
|
|
||||||
cp src/main/resources/* target/classes/org/obeliks/
|
|
||||||
jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
## put needed scripts in place
|
## put needed scripts in place
|
||||||
cd scripts
|
cd scripts
|
||||||
ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
|
ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
|
||||||
|
@ -35,7 +27,6 @@ cd ..
|
||||||
|
|
||||||
## put needed resources in place
|
## put needed resources in place
|
||||||
cd resources
|
cd resources
|
||||||
mv ../Obeliks4J/obeliks.jar .
|
|
||||||
ln -s ../nova_slovnica/resources/dict.xml .
|
ln -s ../nova_slovnica/resources/dict.xml .
|
||||||
ln -s ../data_admin/resources/structures.xml .
|
ln -s ../data_admin/resources/structures.xml .
|
||||||
ln -s ../data_admin/resources/structures.xsd .
|
ln -s ../data_admin/resources/structures.xsd .
|
||||||
|
|
Loading…
Reference in New Issue
Block a user