IssueID #1571: adapted the pipeline to take structures.xml as an explicit input argument (-instructures), since the file is no longer provided by data_admin
This commit is contained in:
parent
613ade673a
commit
b51bc1b87d
|
@ -17,7 +17,6 @@ DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
||||||
# resources
|
# resources
|
||||||
TRANSLATION_FILE_NAME = '../resources/dict.xml'
|
TRANSLATION_FILE_NAME = '../resources/dict.xml'
|
||||||
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
|
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
|
||||||
STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'
|
|
||||||
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
|
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
|
||||||
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
|
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
|
||||||
|
|
||||||
|
@ -35,6 +34,7 @@ TEI_MULTIPLE_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids1.xm
|
||||||
TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml'
|
TEI_MULTIPLE_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_multiple_with_ids2.xml'
|
||||||
MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
|
MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
|
||||||
MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
|
MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
|
||||||
|
STRUCTURE_OLD_FILE_NAME = TMP_DIRECTORY + '/structures_old.xml'
|
||||||
STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
|
STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
|
||||||
DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml'
|
DICTIONARY_SINGLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_single.xml'
|
||||||
DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml'
|
DICTIONARY_MULTIPLE_FILE_NAME = TMP_DIRECTORY + '/dictionary_multiple.xml'
|
||||||
|
|
|
@ -7,30 +7,33 @@ from constants import *
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Assign parsed Slovene strings to structures and generate lexicon.')
|
arg_parser = argparse.ArgumentParser(description='Assign parsed Slovene strings to structures and generate lexicon.')
|
||||||
arg_parser.add_argument('-intei', type=str, required=True, help='Parsed and manually edited TEI file')
|
arg_parser.add_argument('-intei', type=str, required=True, help='Parsed and manually edited TEI file')
|
||||||
|
arg_parser.add_argument('-instructures', type=str, required=True, help='Input structure file')
|
||||||
arg_parser.add_argument('-outlexicon', type=str, required=True, help='Output lexicon file')
|
arg_parser.add_argument('-outlexicon', type=str, required=True, help='Output lexicon file')
|
||||||
arg_parser.add_argument('-outstructures', type=str, required=True, help='Output structure file')
|
arg_parser.add_argument('-outstructures', type=str, required=True, help='Output structure file')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
|
|
||||||
input_tei_file_name = arguments.intei
|
input_tei_file_name = arguments.intei
|
||||||
|
input_structure_file_name = arguments.instructures
|
||||||
output_lexicon_file_name = arguments.outlexicon
|
output_lexicon_file_name = arguments.outlexicon
|
||||||
output_structure_file_name = arguments.outstructures
|
output_structure_file_name = arguments.outstructures
|
||||||
|
|
||||||
def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name):
|
def run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name):
|
||||||
|
|
||||||
# setup and split
|
# setup and split
|
||||||
shutil.rmtree(TMP_DIRECTORY, True)
|
shutil.rmtree(TMP_DIRECTORY, True)
|
||||||
os.makedirs(TMP_DIRECTORY, exist_ok=True)
|
os.makedirs(TMP_DIRECTORY, exist_ok=True)
|
||||||
shutil.copyfile(input_tei_file_name, TEI_INIT_FILE_NAME)
|
shutil.copyfile(input_tei_file_name, TEI_INIT_FILE_NAME)
|
||||||
|
shutil.copyfile(input_structure_file_name, STRUCTURE_OLD_FILE_NAME)
|
||||||
split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME)
|
split_tei_input(TEI_INIT_FILE_NAME, TEI_SINGLE_FILE_NAME, TEI_MULTIPLE_FILE_NAME)
|
||||||
|
|
||||||
# single-token units
|
# single-token units
|
||||||
run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_CURRENT_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME)
|
run_structure_single_assignment(TEI_SINGLE_FILE_NAME, STRUCTURE_OLD_FILE_NAME, TEI_SINGLE_STRUCTURE_FILE_NAME)
|
||||||
run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
|
run_dictionary_conversion(TEI_SINGLE_STRUCTURE_FILE_NAME, DICTIONARY_SINGLE_FILE_NAME)
|
||||||
|
|
||||||
# multiple-token units
|
# multiple-token units
|
||||||
run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME)
|
run_mwe_extraction(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME)
|
||||||
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME)
|
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME)
|
||||||
run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
|
run_structure_creation(STRUCTURE_OLD_FILE_NAME, TEI_MULTIPLE_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
|
||||||
validate_structures(STRUCTURE_NEW_FILE_NAME)
|
validate_structures(STRUCTURE_NEW_FILE_NAME)
|
||||||
run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME)
|
run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME)
|
||||||
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME)
|
run_structure_multiple_assignment(TEI_MULTIPLE_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_MULTIPLE_STRUCTURE_2_FILE_NAME)
|
||||||
|
@ -90,4 +93,4 @@ def validate_dictionary(dictionary_file_name):
|
||||||
xml_tree = lxml.parse(dictionary_file_name)
|
xml_tree = lxml.parse(dictionary_file_name)
|
||||||
xml_schema.assertValid(xml_tree)
|
xml_schema.assertValid(xml_tree)
|
||||||
|
|
||||||
run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure_file_name)
|
run_pipeline(input_tei_file_name, input_structure_file_name, output_lexicon_file_name, output_structure_file_name)
|
||||||
|
|
|
@ -30,7 +30,6 @@ cd ..
|
||||||
## put needed resources in place
|
## put needed resources in place
|
||||||
cd resources
|
cd resources
|
||||||
ln -s ../nova_slovnica/resources/dict.xml .
|
ln -s ../nova_slovnica/resources/dict.xml .
|
||||||
ln -s ../data_admin/resources/structures.xml .
|
|
||||||
ln -s ../data_admin/resources/structures.xsd .
|
ln -s ../data_admin/resources/structures.xsd .
|
||||||
ln -s ../xml_schemas/resources/schema/inventory.xsd .
|
ln -s ../xml_schemas/resources/schema/inventory.xsd .
|
||||||
ln -s ../xml_schemas/resources/schema/monolingual_dictionaries.xsd .
|
ln -s ../xml_schemas/resources/schema/monolingual_dictionaries.xsd .
|
||||||
|
|
Loading…
Reference in New Issue
Block a user