From b51bc1b87d4d89a0ac1078daacc49b01ba69c0ee Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Wed, 13 Jan 2021 21:27:42 +0100 Subject: [PATCH] IssueID #1571: adapted to structures.xml no longer being in data_admin --- scripts/constants.py | 2 +- scripts/pipeline2.py | 13 ++++++++----- scripts/setup.sh | 1 - 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/constants.py b/scripts/constants.py index 9be6b9a..042e768 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -17,7 +17,6 @@ DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py' # resources TRANSLATION_FILE_NAME = '../resources/dict.xml' CLASSLA_MODELS_DIRECTORY = '../resources/classla' -STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml' STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd' DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd' @@ -35,6 +34,7 @@ TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xm TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml' MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv' MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv' +STRUCTURE_OLD_FILE_NAME = TMP_DIRECTORY + '/structures_old.xml' STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml' DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml' DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml' diff --git a/scripts/pipeline2.py b/scripts/pipeline2.py index 583dd6d..cfd669e 100644 --- a/scripts/pipeline2.py +++ b/scripts/pipeline2.py @@ -7,30 +7,33 @@ from constants import * arg_parser = argparse.ArgumentParser(description='Assign parsed Slovene strings to structures and generate lexicon.') arg_parser.add_argument('-intei', type=str, required=True, help='Parsed and manually edited TEI file') +arg_parser.add_argument('-instructures', type=str, required=True, help='Input structure file') arg_parser.add_argument('-outlexicon', type=str, required=True, help='Output lexicon file') arg_parser.add_argument('-outstructures', type=str, required=True, help='Output structure file') arguments = arg_parser.parse_args() input_tei_file_name = arguments.intei +input_structure_file_name = arguments.instructures output_lexicon_file_name = arguments.outlexicon output_structure_file_name = arguments.outstructures -def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name): +def run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name): # setup and split shutil.rmtree(TMP_DIRECTORY, True) os.makedirs(TMP_DIRECTORY, exist_ok=True) shutil.copyfile(input_tei_file_name, TEI_INIT_FILE_NAME) + shutil.copyfile(input_structure_file_name, STRUCTURE_OLD_FILE_NAME) split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME) # single-token units - run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME) + run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_OLD_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME) run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME) # multiple-token units - run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME) + run_mwe_extraction(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME) run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME) - run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) + run_structure_creation(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME) validate_structures(STRUCTURE_NEW_FILE_NAME) run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME) run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME) @@ -90,4 +93,4 @@ def validate_dictionary(dictionary_file_name): xml_tree = lxml.parse(dictionary_file_name) xml_schema.assertValid(xml_tree) -run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name) +run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name) diff --git a/scripts/setup.sh b/scripts/setup.sh index 04d8fec..e5a4670 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -30,7 +30,6 @@ cd .. ## put needed resources in place cd resources ln -s ../nova_slovnica/resources/dict.xml . -ln -s ../data_admin/resources/structures.xml . ln -s ../data_admin/resources/structures.xsd . ln -s ../xml_schemas/resources/schema/inventory.xsd . ln -s ../xml_schemas/resources/schema/monolingual_dictionaries.xsd .