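IssueID #1571: adapted to structures.xml no longer being in data_admin (the input structure file is now passed to the pipeline via the new -instructures argument instead of being symlinked into resources)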
This commit is contained in:
		
							parent
							
								
									613ade673a
								
							
						
					
					
						commit
						b51bc1b87d
					
				| @ -17,7 +17,6 @@ DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py' | ||||
| # resources | ||||
| TRANSLATION_FILE_NAME = '../resources/dict.xml' | ||||
| CLASSLA_MODELS_DIRECTORY = '../resources/classla' | ||||
| STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml' | ||||
| STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd' | ||||
| DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd' | ||||
| 
 | ||||
| @ -35,6 +34,7 @@ TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xm | ||||
| TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml' | ||||
| MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv' | ||||
| MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv' | ||||
| STRUCTURE_OLD_FILE_NAME = TMP_DIRECTORY + '/structures_old.xml' | ||||
| STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml' | ||||
| DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml' | ||||
| DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml' | ||||
|  | ||||
| @ -7,30 +7,33 @@ from constants import * | ||||
| 
 | ||||
| arg_parser = argparse.ArgumentParser(description='Assign parsed Slovene strings to structures and generate lexicon.') | ||||
| arg_parser.add_argument('-intei', type=str, required=True, help='Parsed and manually edited TEI file') | ||||
| arg_parser.add_argument('-instructures', type=str, required=True, help='Input structure file') | ||||
| arg_parser.add_argument('-outlexicon', type=str, required=True, help='Output lexicon file') | ||||
| arg_parser.add_argument('-outstructures', type=str, required=True, help='Output structure file') | ||||
| arguments = arg_parser.parse_args() | ||||
| 
 | ||||
| input_tei_file_name = arguments.intei | ||||
| input_structure_file_name = arguments.instructures | ||||
| output_lexicon_file_name = arguments.outlexicon | ||||
| output_structure_file_name = arguments.outstructures | ||||
| 
 | ||||
| def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name): | ||||
| def run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name): | ||||
| 
 | ||||
|     # setup and split | ||||
|     shutil.rmtree(TMP_DIRECTORY, True) | ||||
|     os.makedirs(TMP_DIRECTORY, exist_ok=True) | ||||
|     shutil.copyfile(input_tei_file_name, TEI_INIT_FILE_NAME) | ||||
|     shutil.copyfile(input_structure_file_name, STRUCTURE_OLD_FILE_NAME) | ||||
|     split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME) | ||||
| 
 | ||||
|     # single-token units | ||||
|     run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME) | ||||
|     run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_OLD_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME) | ||||
|     run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME) | ||||
| 
 | ||||
|     # multiple-token units | ||||
|     run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME) | ||||
|     run_mwe_extraction(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME) | ||||
|     run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME) | ||||
|     run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) | ||||
|     run_structure_creation(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) | ||||
|     validate_structures(STRUCTURE_NEW_FILE_NAME) | ||||
|     run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME) | ||||
|     run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME) | ||||
| @ -90,4 +93,4 @@ def validate_dictionary(dictionary_file_name): | ||||
|     xml_tree = lxml.parse(dictionary_file_name) | ||||
|     xml_schema.assertValid(xml_tree) | ||||
| 
 | ||||
| run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name) | ||||
| run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name) | ||||
|  | ||||
| @ -30,7 +30,6 @@ cd .. | ||||
| ## put needed resources in place | ||||
| cd resources | ||||
| ln -s ../nova_slovnica/resources/dict.xml . | ||||
| ln -s ../data_admin/resources/structures.xml . | ||||
| ln -s ../data_admin/resources/structures.xsd . | ||||
| ln -s ../xml_schemas/resources/schema/inventory.xsd . | ||||
| ln -s ../xml_schemas/resources/schema/monolingual_dictionaries.xsd . | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user