Redmine #1835: added back schema validation

This commit is contained in:
Cyprian Laskowski 2021-03-15 09:38:20 +01:00
parent f5d4a009ea
commit f7b9aaf210
2 changed files with 21 additions and 13 deletions

View File

@ -1,20 +1,10 @@
# scripts: file names of the helper scripts (bare names, no directory part)
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
# resources: relative paths to the translation file and the two XSD schema
# files. NOTE(review): the directory these paths are resolved against is not
# visible here -- confirm against the code that reads them.
TRANSLATION_FILE_NAME = '../resources/dict.xml'
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'
# temporary outputs
FILE_MAP = {'strings-list': 'strings.txt', FILE_MAP = {'strings-list': 'strings.txt',
'obeliks-tokenised': 'obeliks_raw.conllu', 'obeliks-tokenised': 'obeliks_raw.conllu',
'obeliks-tweaked': 'obeliks_tweaked.conllu', 'obeliks-tweaked': 'obeliks_tweaked.conllu',
'classla-parsed': 'classla_raw.conllu', 'classla-parsed': 'classla_raw.conllu',
'classla-translated': 'classla_translated.conllu', 'classla-translated': 'classla_translated.conllu',
'dict': 'dict.xml',
'structure-schema': 'structures.xsd',
'tei-initial': 'tei_initial.xml', 'tei-initial': 'tei_initial.xml',
'tei-single': 'tei_single.xml', 'tei-single': 'tei_single.xml',
'tei-single-ids': 'tei_single_with_ids.xml', 'tei-single-ids': 'tei_single_with_ids.xml',
@ -27,7 +17,8 @@ FILE_MAP = {'strings-list': 'strings.txt',
'structures-new': 'structures_new.xml', 'structures-new': 'structures_new.xml',
'dictionary-single': 'dictionary_single.xml', 'dictionary-single': 'dictionary_single.xml',
'dictionary-multiple': 'dictionary_multiple.xml', 'dictionary-multiple': 'dictionary_multiple.xml',
'dictionary': 'dictionary.xml' 'dictionary': 'dictionary.xml',
'dictionary-schema': 'monolingual_dictionaries.xsd'
} }
NLP_CONFIG_MAP = { NLP_CONFIG_MAP = {

View File

@ -2,6 +2,7 @@ import os
import shutil import shutil
import tempfile import tempfile
from types import SimpleNamespace from types import SimpleNamespace
import lxml.etree as lxml
import obeliks import obeliks
@ -152,8 +153,24 @@ class Pipeline:
output_file_name = self.file_map['dictionary'] output_file_name = self.file_map['dictionary']
merge_dictionaries(single_file_name, multiple_file_name, output_file_name) merge_dictionaries(single_file_name, multiple_file_name, output_file_name)
def _do_validate(self, schema_file_name, xml_file_name):
    """Validate an XML file against an XML Schema (XSD) file.

    Args:
        schema_file_name: Path of the XSD file to build the validator from.
        xml_file_name: Path of the XML document to check.

    Raises:
        Whatever ``lxml`` raises while parsing either file or compiling the
        schema, and the error raised by ``XMLSchema.assertValid`` when the
        document does not conform to the schema.
    """
    # Build the validator first so a broken schema is reported before the
    # document is even read (same order as the schema/document parse below).
    schema_document = lxml.parse(schema_file_name)
    validator = lxml.XMLSchema(schema_document)
    document = lxml.parse(xml_file_name)
    validator.assertValid(document)
def do_validate_structures(self):
    """Validate the 'structures-new' file against the 'structure-schema' file.

    Both paths are looked up in ``self.file_map`` and handed to
    ``self._do_validate``; any exception from the lookup or from the
    validation propagates to the caller.
    """
    files = self.file_map
    self._do_validate(files['structure-schema'], files['structures-new'])
def do_validate_dictionary(self):
    """Validate the 'dictionary' file against the 'dictionary-schema' file.

    Both paths are looked up in ``self.file_map`` and handed to
    ``self._do_validate``; any exception from the lookup or from the
    validation propagates to the caller.
    """
    files = self.file_map
    self._do_validate(files['dictionary-schema'], files['dictionary'])
def export_file(self, file_name, file_key):
    """Copy one of the pipeline's files to a caller-chosen location.

    Args:
        file_name: Destination path the file is copied to.
        file_key: Key in ``self.file_map`` identifying the file to copy.
    """
    source_path = self.file_map[file_key]
    shutil.copyfile(source_path, file_name)
def cleanup(self):
    """Delete the pipeline's temporary directory and everything in it.

    Removal is best-effort: errors (including the directory already being
    gone) are ignored, so calling this more than once is harmless.
    """
    shutil.rmtree(self.tmp_directory, ignore_errors=True)