diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py
index cae1d4b..e1c54f0 100644
--- a/package/structure_assignment/constants.py
+++ b/package/structure_assignment/constants.py
@@ -1,20 +1,10 @@
-# scripts
-MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
-STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
-STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
-DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
-
-# resources
-TRANSLATION_FILE_NAME = '../resources/dict.xml'
-STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
-DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
-
-# temporary outputs
 FILE_MAP = {'strings-list': 'strings.txt',
             'obeliks-tokenised': 'obeliks_raw.conllu',
             'obeliks-tweaked': 'obeliks_tweaked.conllu',
             'classla-parsed': 'classla_raw.conllu',
             'classla-translated': 'classla_translated.conllu',
+            'dict': 'dict.xml',
+            'structure-schema': 'structures.xsd',
             'tei-initial': 'tei_initial.xml',
             'tei-single': 'tei_single.xml',
             'tei-single-ids': 'tei_single_with_ids.xml',
@@ -27,7 +17,8 @@ FILE_MAP = {'strings-list': 'strings.txt',
             'structures-new': 'structures_new.xml',
             'dictionary-single': 'dictionary_single.xml',
             'dictionary-multiple': 'dictionary_multiple.xml',
-            'dictionary': 'dictionary.xml'
+            'dictionary': 'dictionary.xml',
+            'dictionary-schema': 'monolingual_dictionaries.xsd'
 }
 
 NLP_CONFIG_MAP = {
diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index 6a5bbff..11a1cfc 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -2,6 +2,7 @@ import os
 import shutil
 import tempfile
 from types import SimpleNamespace
 
+import lxml.etree as lxml
 import obeliks
 
@@ -152,8 +153,30 @@ class Pipeline:
         output_file_name = self.file_map['dictionary']
         merge_dictionaries(single_file_name, multiple_file_name, output_file_name)
 
+    def _do_validate(self, schema_file_name, xml_file_name):
+        """Validate an XML file against an XML Schema (XSD) file.
+
+        Raises lxml.etree.DocumentInvalid (from assertValid) if the
+        document does not conform to the schema.
+        """
+        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
+        xml_tree = lxml.parse(xml_file_name)
+        xml_schema.assertValid(xml_tree)
+
+    def do_validate_structures(self):
+        """Validate the 'structures-new' file against 'structure-schema'."""
+        schema_file_name = self.file_map['structure-schema']
+        xml_file_name = self.file_map['structures-new']
+        self._do_validate(schema_file_name, xml_file_name)
+
+    def do_validate_dictionary(self):
+        """Validate the 'dictionary' file against 'dictionary-schema'."""
+        schema_file_name = self.file_map['dictionary-schema']
+        xml_file_name = self.file_map['dictionary']
+        self._do_validate(schema_file_name, xml_file_name)
+
     def export_file(self, file_name, file_key):
         shutil.copyfile(self.file_map[file_key], file_name)
 
     def cleanup(self):
         shutil.rmtree(self.tmp_directory, True)