|
|
|
@ -24,16 +24,17 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure
|
|
|
|
|
split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME)
|
|
|
|
|
|
|
|
|
|
# single-token units
|
|
|
|
|
run_dictionary_conversion(TEI_SINGLE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
|
|
|
|
|
run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME)
|
|
|
|
|
run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
|
|
|
|
|
|
|
|
|
|
# multiple-token units
|
|
|
|
|
run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME)
|
|
|
|
|
run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME)
|
|
|
|
|
run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
|
|
|
|
|
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME)
|
|
|
|
|
run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
|
|
|
|
|
validate_structures(STRUCTURE_NEW_FILE_NAME)
|
|
|
|
|
run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME)
|
|
|
|
|
run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_STRUCTURE_2_FILE_NAME)
|
|
|
|
|
run_dictionary_conversion(TEI_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME)
|
|
|
|
|
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME)
|
|
|
|
|
run_dictionary_conversion(TEI_MULTIPLE_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME)
|
|
|
|
|
|
|
|
|
|
# merge and finish
|
|
|
|
|
merge_dictionaries(DICTIONARY_SINGLE_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME, DICTIONARY_FILE_NAME)
|
|
|
|
@ -58,7 +59,12 @@ def validate_structures(structure_file_name):
|
|
|
|
|
xml_tree = lxml.parse(structure_file_name)
|
|
|
|
|
xml_schema.assertValid(xml_tree)
|
|
|
|
|
|
|
|
|
|
def run_structure_assignment(tei_file_name, mwe_csv_file_name, output_file_name):
|
|
|
|
|
def run_structure_single_assignment(input_file_name, structure_file_name, output_file_name):
    """Run the single-assignment script as a child Python process.

    Prints a progress message, then launches the script named by
    STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME with the three file names
    passed through its ``-infile``, ``-structures`` and ``-outfile``
    options.

    Args:
        input_file_name: Value passed after ``-infile``.
        structure_file_name: Value passed after ``-structures``.
        output_file_name: Value passed after ``-outfile``.

    Raises:
        OSError: If the ``python`` executable cannot be started.
    """
    # Local import keeps this block self-contained; the module's import
    # section is not visible from here.
    import subprocess

    print('Assigning structure ids ...')
    # An argument list (no shell) keeps file names that contain spaces or
    # shell metacharacters intact; a space-joined string handed to the shell
    # would split or interpret them.
    assignment_command = [
        'python',
        STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME,
        '-infile', input_file_name,
        '-structures', structure_file_name,
        '-outfile', output_file_name,
    ]
    # The child's exit status is deliberately not checked, matching the
    # previous behaviour where the status was discarded.
    subprocess.run(assignment_command)
|
|
|
|
|
|
|
|
|
|
def run_structure_multiple_assignment(tei_file_name, mwe_csv_file_name, output_file_name):
    """Run the structure-assignment script as a child Python process.

    Prints a progress message, then launches the script named by
    STRUCTURE_ASSIGNMENT_SCRIPT_NAME with the three file names as
    positional arguments, in the order given here.

    Args:
        tei_file_name: First positional argument passed to the script.
        mwe_csv_file_name: Second positional argument passed to the script.
        output_file_name: Third positional argument passed to the script.

    Raises:
        OSError: If the ``python`` executable cannot be started.
    """
    # Local import keeps this block self-contained; the module's import
    # section is not visible from here.
    import subprocess

    print('Assigning structure ids ...')
    # An argument list (no shell) keeps file names that contain spaces or
    # shell metacharacters intact; a space-joined string handed to the shell
    # would split or interpret them.
    assignment_command = [
        'python',
        STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
        tei_file_name,
        mwe_csv_file_name,
        output_file_name,
    ]
    # The child's exit status is deliberately not checked, matching the
    # previous behaviour where the status was discarded.
    subprocess.run(assignment_command)
|
|
|
|
|