From 37d176c4772a39b6735aa29a33ccc31d82bcdede Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Tue, 12 Jan 2021 11:07:41 +0100 Subject: [PATCH] IssueID #1487: incorporated script for translating JOS msds and dependencies --- scripts/.gitignore | 3 ++- scripts/constants.py | 4 +++- scripts/pipeline1.py | 11 +++++++++-- scripts/setup.sh | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/.gitignore b/scripts/.gitignore index 52e9bf3..f353f7b 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -3,4 +3,5 @@ /conllu_to_xml.py /wani.py /create_structures.py -/tei_to_dictionary.py \ No newline at end of file +/tei_to_dictionary.py +/translate_jos.py \ No newline at end of file diff --git a/scripts/constants.py b/scripts/constants.py index 96070ef..fc0e129 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -4,6 +4,7 @@ TMP_DIRECTORY = '../tmp/structure_assignment' # scripts TEI_SPLIT_SCRIPT_NAME = 'split_tei.py' CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py' +TRANSLATION_SCRIPT_NAME = 'translate_jos.py' CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py' MWE_EXTRACTION_SCRIPT_NAME = 'wani.py' STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py' @@ -23,7 +24,8 @@ DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd' STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt' OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu' OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu' -CLASSLA_FILE_NAME = TMP_DIRECTORY + '/classla.conllu' +CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu' +CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu' TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml' TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml' TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml' diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py index 8cd2bb3..956b433 100644 --- a/scripts/pipeline1.py +++ b/scripts/pipeline1.py @@ -31,8 +31,9 @@ def run_pipeline(input_file_name, output_file_name): shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME) run_obeliks(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME) tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME) - run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME) - run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME) + run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_OUTPUT_FILE_NAME) + run_jos_translation(CLASSLA_OUTPUT_FILE_NAME, CLASSLA_TRANSLATED_FILE_NAME) + run_tei_conversion(CLASSLA_TRANSLATED_FILE_NAME, TEI_INIT_FILE_NAME) shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name) def run_obeliks(list_file_name, conllu_file_name): @@ -54,6 +55,12 @@ def run_classla(obeliks_file_name, classla_file_name): result = nlp(doc) result.conll_file.write_conll(classla_file_name) +def run_jos_translation(input_file_name, output_file_name): + print('Translating JOS ...') + translate_command = ' '.join(['python', TRANSLATION_SCRIPT_NAME, '-infile', input_file_name, '-dict', TRANSLATION_FILE_NAME, '-outfile', output_file_name]) + print(translate_command) + os.system(translate_command) + def run_tei_conversion(classla_file_name, tei_file_name): print('Converting to tei ...') convert_command = ' '.join(['python', CONLLU_TEI_SCRIPT_NAME, '-o', tei_file_name, classla_file_name]) diff --git a/scripts/setup.sh b/scripts/setup.sh index ee59aec..9cfe7cc 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -21,6 +21,7 @@ cd scripts ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py . ln -s ../nova_slovnica/python/scripts/assign_structures.py . ln -s ../nova_slovnica/python/scripts/create_structures.py . +ln -s ../nova_slovnica/python/scripts/translate_jos.py . ln -s ../nova_slovnica/python/scripts/tei_to_dictionary.py . ln -s ../luscenje_struktur/wani.py . cd ..