IssueID #1723: assigned structures also for single words
This commit is contained in:
		
							parent
							
								
									04606e3e6a
								
							
						
					
					
						commit
						613ade673a
					
				
							
								
								
									
										1
									
								
								scripts/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								scripts/.gitignore
									
									
									
									
										vendored
									
									
								
							@ -1,4 +1,5 @@
 | 
			
		||||
*.pyc
 | 
			
		||||
/assign_single_structures.py
 | 
			
		||||
/assign_structures.py
 | 
			
		||||
/conllu_to_xml.py
 | 
			
		||||
/wani.py
 | 
			
		||||
 | 
			
		||||
@ -7,6 +7,7 @@ CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
 | 
			
		||||
TRANSLATION_SCRIPT_NAME = 'translate_jos.py'
 | 
			
		||||
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
 | 
			
		||||
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
 | 
			
		||||
STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
 | 
			
		||||
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
 | 
			
		||||
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
 | 
			
		||||
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
 | 
			
		||||
@ -28,9 +29,10 @@ CLASSLA_OUTPUT_FILE_NAME = TMP_DIRECTORY + '/classla_raw.conllu'
 | 
			
		||||
CLASSLA_TRANSLATED_FILE_NAME = TMP_DIRECTORY + '/classla_translated.conllu'
 | 
			
		||||
TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml'
 | 
			
		||||
TEI_SINGLE_FILE_NAME = TMP_DIRECTORY + '/tei_single.xml'
 | 
			
		||||
TEI_SINGLE_STRUCTURE_FILE_NAME = TMP_DIRECTORY + '/tei_single_with_ids.xml'
 | 
			
		||||
TEI_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/tei_multiple.xml'
 | 
			
		||||
TEI_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids1.xml'
 | 
			
		||||
TEI_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids2.xml'
 | 
			
		||||
TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xml'
 | 
			
		||||
TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml'
 | 
			
		||||
MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
 | 
			
		||||
MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
 | 
			
		||||
STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
 | 
			
		||||
 | 
			
		||||
@ -24,16 +24,17 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure
 | 
			
		||||
    split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME)
 | 
			
		||||
 | 
			
		||||
    # single-token units
 | 
			
		||||
    run_dictionary_conversion(TEI_SINGLE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
 | 
			
		||||
    run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME)
 | 
			
		||||
    run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
 | 
			
		||||
 | 
			
		||||
    # multiple-token units
 | 
			
		||||
    run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME)
 | 
			
		||||
    run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME)
 | 
			
		||||
    run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
 | 
			
		||||
    run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME)
 | 
			
		||||
    run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
 | 
			
		||||
    validate_structures(STRUCTURE_NEW_FILE_NAME)
 | 
			
		||||
    run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME)
 | 
			
		||||
    run_structure_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_STRUCTURE_2_FILE_NAME)
 | 
			
		||||
    run_dictionary_conversion(TEI_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME)
 | 
			
		||||
    run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME)
 | 
			
		||||
    run_dictionary_conversion(TEI_MULTIPLE_STRUCTURE_2_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME)
 | 
			
		||||
 | 
			
		||||
    # merge and finish
 | 
			
		||||
    merge_dictionaries(DICTIONARY_SINGLE_FILE_NAME, DICTIONARY_MULTIPLE_FILE_NAME, DICTIONARY_FILE_NAME)
 | 
			
		||||
@ -58,7 +59,12 @@ def validate_structures(structure_file_name):
 | 
			
		||||
    xml_tree = lxml.parse(structure_file_name)
 | 
			
		||||
    xml_schema.assertValid(xml_tree)
 | 
			
		||||
 | 
			
		||||
def run_structure_assignment(tei_file_name, mwe_csv_file_name, output_file_name):
 | 
			
		||||
def run_structure_single_assignment(input_file_name, structure_file_name, output_file_name):
    """Run the single-structure assignment script as a child process.

    Invokes ``python`` on the script named by
    ``STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME`` with the ``-infile``,
    ``-structures`` and ``-outfile`` options.

    Args:
        input_file_name: Path passed to the script as ``-infile``.
        structure_file_name: Path passed to the script as ``-structures``.
        output_file_name: Path passed to the script as ``-outfile``.

    Returns:
        None. The child's exit status is not inspected.
    """
    # Local import so this function does not rely on a module-level import.
    import subprocess

    print('Assigning structure ids ...')
    # Pass the arguments as a list (no shell) so that file names containing
    # spaces or shell metacharacters reach the script as single arguments.
    assignment_command = [
        'python',
        STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME,
        '-infile', input_file_name,
        '-structures', structure_file_name,
        '-outfile', output_file_name,
    ]
    subprocess.call(assignment_command)
 | 
			
		||||
 | 
			
		||||
def run_structure_multiple_assignment(tei_file_name, mwe_csv_file_name, output_file_name):
    """Run the structure assignment script as a child process.

    Invokes ``python`` on the script named by
    ``STRUCTURE_ASSIGNMENT_SCRIPT_NAME`` with three positional arguments,
    in this order: TEI file, MWE CSV file, output file.

    Args:
        tei_file_name: First positional argument passed to the script.
        mwe_csv_file_name: Second positional argument passed to the script.
        output_file_name: Third positional argument passed to the script.

    Returns:
        None. The child's exit status is not inspected.
    """
    # Local import so this function does not rely on a module-level import.
    import subprocess

    print('Assigning structure ids ...')
    # Pass the arguments as a list (no shell) so that file names containing
    # spaces or shell metacharacters reach the script as single arguments.
    assignment_command = [
        'python',
        STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
        tei_file_name,
        mwe_csv_file_name,
        output_file_name,
    ]
    subprocess.call(assignment_command)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user