IssueID #1487: incorporated script for translating JOS MSDs and dependencies

This commit is contained in:
Cyprian Laskowski 2021-01-12 11:07:41 +01:00
parent 7651774914
commit 37d176c477
4 changed files with 15 additions and 4 deletions

3
scripts/.gitignore vendored
View File

@ -3,4 +3,5 @@
/conllu_to_xml.py
/wani.py
/create_structures.py
/tei_to_dictionary.py
/tei_to_dictionary.py
/translate_jos.py

View File

@ -4,6 +4,7 @@ TMP_DIRECTORY = '../tmp/structure_assignment'
# scripts
TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
@ -23,7 +24,8 @@ DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt'
OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu'
CLASSLA_FILE_NAME = TMP_DIRECTORY + '/classla.conllu'
CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu'
CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu'
TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml'
TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml'
TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml'

View File

@ -31,8 +31,9 @@ def run_pipeline(input_file_name, output_file_name):
shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_OUTPUT_FILE_NAME)
run_jos_translation(CLASSLA_OUTPUT_FILE_NAME, CLASSLA_TRANSLATED_FILE_NAME)
run_tei_conversion(CLASSLA_TRANSLATED_FILE_NAME, TEI_INIT_FILE_NAME)
shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
def run_obeliks(list_file_name, conllu_file_name):
@ -54,6 +55,12 @@ def run_classla(obeliks_file_name, classla_file_name):
result = nlp(doc)
result.conll_file.write_conll(classla_file_name)
def run_jos_translation(input_file_name, output_file_name):
    """Run the JOS translation script as a separate Python process.

    The script named by TRANSLATION_SCRIPT_NAME is invoked with
    ``-infile``, ``-dict`` and ``-outfile`` options; the dictionary path is
    taken from TRANSLATION_FILE_NAME.

    NOTE(review): TRANSLATION_FILE_NAME is not among the constants visible
    next to TRANSLATION_SCRIPT_NAME -- confirm it is defined in the
    constants module, otherwise this function raises NameError.

    Args:
        input_file_name: path passed to the script as ``-infile``
            (the pipeline passes the raw classla .conllu output).
        output_file_name: path passed to the script as ``-outfile``.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status, so the pipeline does not continue with a missing or
            stale output file.
    """
    # Local import keeps this block self-contained regardless of which
    # modules the file already imports at the top.
    import subprocess

    print('Translating JOS ...')
    translate_arguments = [
        'python', TRANSLATION_SCRIPT_NAME,
        '-infile', input_file_name,
        '-dict', TRANSLATION_FILE_NAME,
        '-outfile', output_file_name,
    ]
    # Echo the command in the same space-joined form as before, for logging.
    print(' '.join(translate_arguments))
    # Passing a list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check=True surfaces a failed translation.
    subprocess.run(translate_arguments, check=True)
def run_tei_conversion(classla_file_name, tei_file_name):
print('Converting to tei ...')
convert_command = ' '.join(['python', CONLLU_TEI_SCRIPT_NAME, '-o', tei_file_name, classla_file_name])

View File

@ -21,6 +21,7 @@ cd scripts
ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
ln -s ../nova_slovnica/python/scripts/assign_structures.py .
ln -s ../nova_slovnica/python/scripts/create_structures.py .
ln -s ../nova_slovnica/python/scripts/translate_jos.py .
ln -s ../nova_slovnica/python/scripts/tei_to_dictionary.py .
ln -s ../luscenje_struktur/wani.py .
cd ..