diff --git a/scripts/.gitignore b/scripts/.gitignore index f353f7b..e2c2c4f 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,4 +1,5 @@ *.pyc +/assign_single_structures.py /assign_structures.py /conllu_to_xml.py /wani.py diff --git a/scripts/constants.py b/scripts/constants.py index fc0e129..9be6b9a 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -7,6 +7,7 @@ CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py' TRANSLATION_SCRIPT_NAME = 'translate_jos.py' CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py' MWE_EXTRACTION_SCRIPT_NAME = 'wani.py' +STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py' STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py' STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py' STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py' @@ -28,9 +29,10 @@ CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu' CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu' TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml' TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml' +TEI_SINGLE_STRUCTURE_FILE_NAME = TMP_DIRECTORY + '/tei_single_with_ids.xml' TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml' -TEI_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids1.xml' -TEI_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids2.xml' +TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xml' +TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml' MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv' MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv' STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml' diff --git a/scripts/pipeline2.py b/scripts/pipeline2.py index 16f24b3..583dd6d 100644 --- a/scripts/pipeline2.py +++ b/scripts/pipeline2.py @@ -24,16 +24,17 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME) # single-token units - run_dictionary_conversion(TEI_SINGLE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME) + run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME) + run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME) # multiple-token units run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME) - run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME) - run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) + run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME) + run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) validate_structures(STRUCTURE_NEW_FILE_NAME) run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME) - run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_STRUCTURE_2_FILE_NAME) - run_dictionary_conversion(TEI_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME) + run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME) + run_dictionary_conversion(TEI_MULTIPLE_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME) # merge and finish merge_dictionaries(DICTIONARY_SINGLE_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME, DICTIONARY_FILE_NAME) @@ -58,7 +59,12 @@ def validate_structures(structure_file_name): xml_tree = lxml.parse(structure_file_name) xml_schema.assertValid(xml_tree) -def run_structure_assignment(tei_file_name, mwe_csv_file_name, output_file_name): +def run_structure_single_assignment(input_file_name, structure_file_name, output_file_name): + print('Assigning structure ids ...') + assignment_command = ' '.join(['python', STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME, '-infile', input_file_name, '-structures', structure_file_name, '-outfile', output_file_name]) + os.system(assignment_command) + +def run_structure_multiple_assignment(tei_file_name, mwe_csv_file_name, output_file_name): print('Assigning structure ids ...') assignment_command = ' '.join(['python', STRUCTURE_ASSIGNMENT_SCRIPT_NAME, tei_file_name, mwe_csv_file_name, output_file_name]) os.system(assignment_command)