From 3c38cdbcae6a94f76453e35efa557c0f49ac198c Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Tue, 29 Jun 2021 21:00:27 +0200 Subject: [PATCH] Redmine #1835: made input structure specification xml into parameter --- README.md | 36 ++++++++++------------- package/structure_assignment/api.py | 35 +++++++++++----------- package/structure_assignment/constants.py | 2 +- package/structure_assignment/pipeline.py | 15 ++++++---- scripts/process.py | 12 ++++---- scripts/setup.sh | 2 -- 6 files changed, 51 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 95d042f..7dab919 100644 --- a/README.md +++ b/README.md @@ -40,21 +40,20 @@ $ python scripts/process.py -mode strings_to_parse -infile /tmp/strings.txt -out The input should be a TEI XML file (in the same particular format as the output of strings_to_parse) and an xml file of structure -specifications (the CJVT structures.xml file, supplemented with -temporary new structures, if needed). It first splits the TEI file -into two files, one with the single-component units and the other with -the multiple-component units. For each, it then assigns each unit to a -syntactic structure from the DDD database and converts the output into -CJVT dictionary XML format. For the single-component units, this is -pretty trivial, but for multiple-component units it is more involved, -and includes two runs of the MWE extraction script +specifications. It first splits the TEI file into two files, one with +the single-component units and the other with the multiple-component +units. For each, it then assigns each unit to a syntactic structure +from the DDD database and converts the output into CJVT dictionary XML +format. For the single-component units, this is pretty trivial, but +for multiple-component units it is more involved, and includes two +runs of the MWE extraction script [wani.py](https://gitea.cjvt.si/ozbolt/luscenje_struktur), generating missing structures in between. 
At the end, the single-component and multiple-component dictionary files are merged into one dictionary file. Example: ``` -$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml +$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml ``` ### strings_to_dictionary @@ -64,7 +63,7 @@ Combines strings_to_parse in parse_to_dictionary into one call between). Example: ``` -$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml +$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml ``` ### all @@ -73,26 +72,23 @@ Same as strings_to_dictionary, but also validates the dictionary and structures outputs, just in case. ``` -$ python scripts/process.py -mode all -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml +$ python scripts/process.py -mode all -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml ``` ## REST API The package provides a REST API with endpoints roughly mirroring the -process.py modes. For the calls accepting strings as input, GET calls -are also supported for single-string input. For the calls resulting in -dictionaries, the results include both the dictionary entries and the -structures they use. If processing resulted in temporary new -structures, their number is recorded in @new_structures. +process.py modes. For most calls, POST is needed, so that input +structures can be easily provided. If processing resulted in temporary +new structures, their number is recorded in @new_structures. 
Example curl calls: ``` $ curl -k https://proc1.cjvt.si/structures/strings_to_parse?string=velika%20miza -$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse -$ curl -k -X POST -F file=@/tmp/parse.xml https://proc1.cjvt.si/structures/parse_to_dictionary -$ curl -k https://proc1.cjvt.si/structures/strings_to_dictionary?string=velika%20miza -$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_dictionary +$ curl -k -X POST -F strings=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse +$ curl -k -X POST -F parsed=@/tmp/parse.xml -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/parse_to_dictionary +$ curl -k -X POST -F strings=@/tmp/strings.txt -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/strings_to_dictionary ``` ## Note diff --git a/package/structure_assignment/api.py b/package/structure_assignment/api.py index 7b1e505..0b7861e 100644 --- a/package/structure_assignment/api.py +++ b/package/structure_assignment/api.py @@ -29,7 +29,7 @@ def strings_to_parse(): string_file.write(string + '\n') string_file.close() elif (request.method == 'POST'): - file_data = request.files['file'] + file_data = request.files['strings'] file_data.save(string_file_name) try: @@ -49,19 +49,22 @@ def parse_to_dictionary(): tmp_directory = tempfile.mkdtemp() parsed_file_name = tmp_directory + '/input_parsed.xml' dictionary_file_name = tmp_directory + '/output_dictionary.xml' - structure_file_name = tmp_directory + '/output_structures.xml' + input_structure_file_name = tmp_directory + '/input_structures.xml' + output_structure_file_name = tmp_directory + '/output_structures.xml' try: - file_data = request.files['file'] - file_data.save(parsed_file_name) + parsed_file_data = request.files['parsed'] + parsed_file_data.save(parsed_file_name) + structure_file_data = request.files['structures'] + structure_file_data.save(input_structure_file_name) - 
runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, structure_file_name) + runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name) root = lxml.Element('response') dictionary_root = lxml.parse(dictionary_file_name).getroot() root.append(dictionary_root) - structure_root = lxml.parse(structure_file_name).getroot() + structure_root = lxml.parse(output_structure_file_name).getroot() new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]')) root.set('new_structures', str(new_structure_count)) structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id')) @@ -79,30 +82,28 @@ def parse_to_dictionary(): return Response(message, mimetype='text/xml') -@app.route(api_prefix + '/strings_to_dictionary', methods=['GET', 'POST']) +@app.route(api_prefix + '/strings_to_dictionary', methods=['POST']) def strings_to_dictionary(): tmp_directory = tempfile.mkdtemp() string_file_name = tmp_directory + '/input_string.txt' dictionary_file_name = tmp_directory + '/output_dictionary.xml' - structure_file_name = tmp_directory + '/output_structures.xml' + input_structure_file_name = tmp_directory + '/input_structures.xml' + output_structure_file_name = tmp_directory + '/output_structures.xml' try: - if (request.method == 'GET'): - string = request.args.get('string') - with open(string_file_name, 'w') as string_file: - string_file.write(string + '\n') - elif (request.method == 'POST'): - file_data = request.files['file'] - file_data.save(string_file_name) + string_file_data = request.files['strings'] + string_file_data.save(string_file_name) + structure_file_data = request.files['structures'] + structure_file_data.save(input_structure_file_name) - runner.strings_to_dictionary(string_file_name, dictionary_file_name, structure_file_name) + runner.strings_to_dictionary(string_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name) root = 
lxml.Element('response') dictionary_root = lxml.parse(dictionary_file_name).getroot() root.append(dictionary_root) - structure_root = lxml.parse(structure_file_name).getroot() + structure_root = lxml.parse(output_structure_file_name).getroot() new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]')) root.set('new_structures', str(new_structure_count)) structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id')) diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py index 118bfbb..3bad673 100644 --- a/package/structure_assignment/constants.py +++ b/package/structure_assignment/constants.py @@ -13,7 +13,7 @@ FILE_MAP = {'strings-list': 'strings.txt', 'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml', 'mwes-1': 'mwes1.csv', 'mwes-2': 'mwes2.csv', - 'structures-old': 'structures.xml', + 'structures-old': 'structures_old.xml', 'structures-new': 'structures_new.xml', 'dictionary-single': 'dictionary_single.xml', 'dictionary-multiple': 'dictionary_multiple.xml', diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py index fa08597..9e9cf93 100644 --- a/package/structure_assignment/pipeline.py +++ b/package/structure_assignment/pipeline.py @@ -25,24 +25,26 @@ class Runner: NLP_CONFIG_MAP['dir'] = resource_directory + '/classla' self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP) - def run_all(self, input_file_name, output_file_name, structure_file_name): + def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): pipeline = Pipeline(self.resource_directory, self.nlp) pipeline.import_file(input_file_name, 'strings-list') + pipeline.import_file(input_structure_file_name, 'structures-old') self._strings_to_parse_sequence(pipeline) self._parse_to_dictionary_sequence(pipeline) pipeline.do_validate_structures() - pipeline.export_file(structure_file_name, 'structures-new') + 
pipeline.export_file(output_structure_file_name, 'structures-new') pipeline.do_validate_dictionary() pipeline.export_file(output_file_name, 'dictionary') pipeline.cleanup() - def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name): + def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): pipeline = Pipeline(self.resource_directory, self.nlp) pipeline.import_file(input_file_name, 'strings-list') + pipeline.import_file(input_structure_file_name, 'structures-old') self._strings_to_parse_sequence(pipeline) self._parse_to_dictionary_sequence(pipeline) pipeline.export_file(output_file_name, 'dictionary') - pipeline.export_file(structure_file_name, 'structures-new') + pipeline.export_file(output_structure_file_name, 'structures-new') pipeline.cleanup() def strings_to_parse(self, input_file_name, output_file_name): @@ -52,12 +54,13 @@ class Runner: pipeline.export_file(output_file_name, 'tei-initial') pipeline.cleanup() - def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name): + def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): pipeline = Pipeline(self.resource_directory) pipeline.import_file(input_file_name, 'tei-initial') + pipeline.import_file(input_structure_file_name, 'structures-old') self._parse_to_dictionary_sequence(pipeline) pipeline.export_file(output_file_name, 'dictionary') - pipeline.export_file(structure_file_name, 'structures-new') + pipeline.export_file(output_structure_file_name, 'structures-new') pipeline.cleanup() def validate_structures(self, input_file_name): diff --git a/scripts/process.py b/scripts/process.py index 0e6c7a0..d3c91e6 100644 --- a/scripts/process.py +++ b/scripts/process.py @@ -10,25 +10,27 @@ if (__name__ == '__main__'): arg_parser.add_argument('-mode', type=str, help='Mode') arg_parser.add_argument('-infile', type=str, help='Input 
file') arg_parser.add_argument('-outfile', type=str, help='Output file') - arg_parser.add_argument('-structures', type=str, help='Updated structure file') + arg_parser.add_argument('-instructs', type=str, help='Input structure file') + arg_parser.add_argument('-outstructs', type=str, help='Output structure file') arguments = arg_parser.parse_args() mode = arguments.mode input_file_name = arguments.infile output_file_name = arguments.outfile - structure_file_name = arguments.structures + input_structure_file_name = arguments.instructs + output_structure_file_name = arguments.outstructs nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'} runner = Runner(resource_directory, nlp_needed) if (mode == 'strings_to_parse'): runner.strings_to_parse(input_file_name, output_file_name) elif (mode == 'strings_to_dictionary'): - runner.strings_to_dictionary(input_file_name, output_file_name, structure_file_name) + runner.strings_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name) elif (mode == 'parse_to_dictionary'): - runner.parse_to_dictionary(input_file_name, output_file_name, structure_file_name) + runner.parse_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name) elif (mode == 'validate_structures'): runner.validate_structures(input_file_name) elif (mode == 'validate_dictionary'): runner.validate_dictionary(input_file_name) elif (mode == 'all'): - runner.run_all(input_file_name, output_file_name, structure_file_name) + runner.run_all(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name) diff --git a/scripts/setup.sh b/scripts/setup.sh index 60be9fd..8ef485e 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -11,7 +11,6 @@ git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git git clone git@gitea.cjvt.si:generic/data_admin.git git clone 
git@gitea.cjvt.si:generic/xml_schemas.git -scp -P 3022 proc1.cjvt.si:/net/nas/resources/miscellaneous/structures/structures.xml . cd .. ## prepare python environment @@ -35,5 +34,4 @@ ln -s ../lib/nova_slovnica/resources/dict.xml . ln -s ../lib/data_admin/resources/structures.xsd . ln -s ../lib/xml_schemas/resources/schema/inventory.xsd . ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd . -mv ../lib/structures.xml . cd ..