IssueID #1487: switched to Python obeliks implementation
This commit is contained in:
parent
f7900c84e5
commit
7651774914
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,5 +1,5 @@
|
|||
/venv
|
||||
/Obeliks4J
|
||||
/obeliks
|
||||
/data_admin
|
||||
/luscenje_struktur
|
||||
/nova_slovnica
|
||||
|
|
|
@ -6,6 +6,7 @@ future==0.18.2
|
|||
idna==2.10
|
||||
lxml==4.5.2
|
||||
numpy==1.19.1
|
||||
obeliks==1.0.2
|
||||
pkg-resources==0.0.0
|
||||
protobuf==3.13.0
|
||||
psycopg2cffi==2.8.1
|
||||
|
|
|
@ -13,7 +13,6 @@ TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
|
|||
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
||||
|
||||
# resources
|
||||
OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
|
||||
TRANSLATION_FILE_NAME = '../resources/dict.xml'
|
||||
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
|
||||
STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'
|
||||
|
|
|
@ -29,15 +29,15 @@ def run_pipeline(input_file_name, output_file_name):
|
|||
shutil.rmtree(TMP_DIRECTORY, True)
|
||||
os.makedirs(TMP_DIRECTORY, exist_ok=True)
|
||||
shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
|
||||
run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
|
||||
run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
|
||||
tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
|
||||
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
|
||||
run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
|
||||
shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
|
||||
|
||||
def run_obeliks4J(list_file_name, conllu_file_name):
|
||||
def run_obeliks(list_file_name, conllu_file_name):
|
||||
print('Running obeliks ...')
|
||||
obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + list_file_name + ' -o ' + conllu_file_name
|
||||
obeliks_command = ' '.join(['obeliks', '-c', '-if', list_file_name, '-o', conllu_file_name])
|
||||
os.system(obeliks_command)
|
||||
|
||||
def tweak_conllu(input_file_name, output_file_name):
|
||||
|
|
|
@ -11,19 +11,11 @@ python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/cl
|
|||
deactivate
|
||||
|
||||
## get needed repositories
|
||||
git clone https://github.com/clarinsi/Obeliks4J
|
||||
git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
|
||||
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
|
||||
git clone git@gitea.cjvt.si:generic/data_admin.git
|
||||
git clone git@gitea.cjvt.si:generic/xml_schemas.git
|
||||
|
||||
## set up obeliks
|
||||
cd Obeliks4J
|
||||
javac -encoding UTF-8 src/main/java/org/obeliks/*.java -d target/classes
|
||||
cp src/main/resources/* target/classes/org/obeliks/
|
||||
jar -cef org.obeliks.Tokenizer obeliks.jar -C target/classes org
|
||||
cd ..
|
||||
|
||||
## put needed scripts in place
|
||||
cd scripts
|
||||
ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
|
||||
|
@ -35,7 +27,6 @@ cd ..
|
|||
|
||||
## put needed resources in place
|
||||
cd resources
|
||||
mv ../Obeliks4J/obeliks.jar .
|
||||
ln -s ../nova_slovnica/resources/dict.xml .
|
||||
ln -s ../data_admin/resources/structures.xml .
|
||||
ln -s ../data_admin/resources/structures.xsd .
|
||||
|
|
Loading…
Reference in New Issue
Block a user