diff --git a/README b/README
index daa1c19..67fcd13 100644
--- a/README
+++ b/README
@@ -1,5 +1,5 @@
-Pipeline for assigning (first creating, if necessary) superbaza
-structure_ids to a file of arbitrary Slovene strings, line by line.
+Pipeline for parsing a file of arbitrary Slovene strings and assigning
+(first creating, if necessary) structure_ids for each string.
 
 Example usage:
 
@@ -8,5 +8,6 @@ $ ./setup.sh
 $ echo "velika miza" > ../tmp/strings.txt
 $ echo "kdo ne more mimo česa" >> ../tmp/strings.txt
 $ echo "pazi, avto!" >> ../tmp/strings.txt
+$ echo "počitnice" >> ../tmp/strings.txt
 $ source ../venv/bin/activate
-$ python pipeline.py ../tmp/strings.txt ../tmp/output.xml
+$ python pipeline.py ../tmp/strings.txt ../tmp/dictionary.xml
diff --git a/scripts/.gitignore b/scripts/.gitignore
index c26afba..ecd1440 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -2,3 +2,4 @@
 /conllu_to_xml.py
 /wani.py
 /create_structures.py
+/tei_to_dictionary.py
diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index b3b74cd..2826ce6 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -17,6 +17,7 @@ CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
 MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
 STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
 STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
+TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
 
 OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
 TRANSLATION_FILE_NAME = '../resources/dict.xml'
@@ -33,6 +34,7 @@ TEI_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids2.xml'
 MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
 MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
 STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
+DICTIONARY_XML_FILE_NAME = TMP_DIRECTORY + '/dictionary.xml'
 
 NLP_CONFIG_MAP = {
     'treebank': 'sl_ssj_jos',
@@ -50,14 +52,14 @@ def run_pipeline(input_file_name, output_file_name):
     run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
     fix_xml_ids(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
     run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
-    convert_to_tei(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
+    run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
     run_mwe_extraction(STRUCTURE_CURRENT_FILE_NAME, TEI_INIT_FILE_NAME, MWE_CSV_1_FILE_NAME)
     run_structure_assignment(STRING_LIST_FILE_NAME, TEI_INIT_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME)
     run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
     run_mwe_extraction(STRUCTURE_NEW_FILE_NAME, TEI_INIT_FILE_NAME, MWE_CSV_2_FILE_NAME)
     run_structure_assignment(STRING_LIST_FILE_NAME, TEI_INIT_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_STRUCTURE_2_FILE_NAME)
-    # run_format_lexicon() # TODO: implement
-    shutil.copyfile(TEI_STRUCTURE_2_FILE_NAME, output_file_name)
+    run_dictionary_conversion(TEI_STRUCTURE_2_FILE_NAME, DICTIONARY_XML_FILE_NAME)
+    shutil.copyfile(DICTIONARY_XML_FILE_NAME, output_file_name)
 
 def run_obeliks4J(obeliks_file_name, classla_file_name):
     print('Running obeliks ...')
@@ -86,7 +88,7 @@ def run_classla(obeliks_file_name, classla_file_name):
     result = nlp(doc)
     result.conll_file.write_conll(classla_file_name)
 
-def convert_to_tei(classla_file_name, tei_file_name):
+def run_tei_conversion(classla_file_name, tei_file_name):
     print('Converting to tei ...')
     convert_command = ' '.join(['python', CONLLU_TEI_SCRIPT_NAME, classla_file_name, tei_file_name, '--translate', TRANSLATION_FILE_NAME])
     os.system(convert_command)
@@ -104,7 +106,12 @@ def run_structure_assignment(input_file_name, tei_file_name, mwe_csv_file_name,
 
 def run_structure_creation(input_file_name, tei_file_name, output_file_name):
     print('Creating missing structures ...')
-    assignment_command = ' '.join(['python', STRUCTURE_CREATION_SCRIPT_NAME, '-infile', input_file_name, '-tei', tei_file_name, '-outfile', output_file_name])
-    os.system(assignment_command)
+    creation_command = ' '.join(['python', STRUCTURE_CREATION_SCRIPT_NAME, '-infile', input_file_name, '-tei', tei_file_name, '-outfile', output_file_name])
+    os.system(creation_command)
 
+def run_dictionary_conversion(tei_file_name, xml_file_name):
+    print('Converting to dictionary xml format ...')
+    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name])
+    os.system(convert_command)
+
 run_pipeline(input_file_name, output_file_name)
diff --git a/scripts/setup.sh b/scripts/setup.sh
index 1c6dbd9..a6b5203 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -28,6 +28,7 @@ cd scripts
 ln -s ../nova_slovnica/python/scripts/conllu_to_xml.py .
 ln -s ../nova_slovnica/python/scripts/assign_structures.py .
 ln -s ../nova_slovnica/python/scripts/create_structures.py .
+ln -s ../nova_slovnica/python/scripts/tei_to_dictionary.py .
 ln -s ../luscenje_struktur/luscenje_struktur/wani.py .
 cd ..
 