From bb3c673e29e9a8bc7864aaca5ab8a8086898b78e Mon Sep 17 00:00:00 2001 From: Luka Dragar Date: Wed, 15 Apr 2026 08:23:06 +0200 Subject: [PATCH] Specifications are loaded on import --- conversion_utils/jos_msds_and_properties.py | 100 +++++++++++--------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index e4332b2..6de383d 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -4,7 +4,7 @@ import lxml.etree as lxml from collections import defaultdict from importlib_resources import files -from enum import Enum +from enum import IntEnum from conversion_utils.utils import xpath_find, get_xml_id @@ -62,10 +62,10 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'), ('pronoun', 8, 'se'), ('zaimek', 8, 'se'), ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')} -class MsdState(Enum): - FULL = 1 - PARTIAL = 2 - UNKNOWN = 3 +class MsdState(IntEnum): + UNKNOWN = -1 + PARTIAL = 1 + FULL = 2 class MsdException(Exception): pass @@ -266,9 +266,35 @@ class UD: class Msd: """JOS msd.""" - def __init__(self, code, language): + class State(IntEnum): + UNKNOWN = -1 + PARTIAL = 1 + FULL = 2 + + def __init__(self, code, language, expected_state=State.FULL, require_valid=False): self.code = code self.language = language + self.expected_state = expected_state + self.require_valid = require_valid + self.state = self._validate_and_get_state() + + def _validate_and_get_state(self): + states = set() + if self.code in DEFAULT_SPECIFICATIONS.codes_map[self.language]: + states.add(self.State.FULL) + if self.code in DEFAULT_SPECIFICATIONS.partial_codes_map[self.language]: + states.add(self.State.PARTIAL) + if len(states) == 0: + states.add(self.State.UNKNOWN) + if self.expected_state not in states: + if self.require_valid: + raise MsdException(f"Given msd '{self.codecode}' is '{self.state.name}', but expected state is '{self.expected_state.name}'.") + else: + if self.state == self.State.UNKNOWN: + print(f"[WARN] The Msd '{self.code}' is unknown.") + else: + print(f"[WARN] The Msd '{self.code}' is unknown for expected state '{self.expected_state.name}'.") + return max(states) def __str__(self): return 'code={code}, language={language}'.format(code=self.code, language=self.language) @@ -282,15 +308,7 @@ class Converter: def __init__(self, xml_file_name=None): if (xml_file_name is None): - resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE) - if (resource.is_file()): - try: - with resource.open('rb') as pickle_file: - self.specifications = pickle.load(pickle_file) - except: - exit('Could not parse specifications pickle file installed.') - else: - exit('No pickle installed or xml provided.') + self.specifications = DEFAULT_SPECIFICATIONS else: parser = SpecificationsParser() try: @@ -313,7 +331,7 @@ class Converter: form_feature_map={}, language=msd.language ) - return self.properties_to_msd(properties, msd.language).code + return self.properties_to_msd(properties, msd.language, expected_state=Msd.State.PARTIAL).code def _parse_msd_ud_conversion(self, file_name): """Parse file with direct conversions from English Msd to Universal Dependencies.""" @@ -336,32 +354,7 @@ class Converter: all_rules[priority].append(current_rules) return all_rules - def is_valid_msd(self, msd): - """Verify if the Msd code is in the standard JOS set.""" - return msd.code in self.specifications.codes_map[msd.language] - - def get_msd_state(self, msd): - """Determine if the Msd code is full, partial or unknown.""" - if msd.code in self.specifications.codes_map[msd.language]: - return MsdState.FULL - elif msd.code in self.specifications.partial_codes_map[msd.language]: - return MsdState.PARTIAL - else: - return MsdState.UNKNOWN - - def check_valid_msd(self, msd, require_valid_flag, allow_partial=True): - """If the Msd code is not valid, raise an exception or give a warning.""" - msd_state = self.get_msd_state(msd) - if msd_state == MsdState.UNKNOWN: - message = f"The msd '{msd.code}' is unknown" - if require_valid_flag: - raise MsdException(message) - else: - print('[WARN] ' + message) - if msd_state == MsdState.PARTIAL and not allow_partial: - raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.") - - def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False): + def msd_to_properties(self, msd, language, lemma=None, warn_level_flag=False): """Convert Msd to Properties. The language of the generated Properties is specified and can differ from the Msd language. @@ -387,7 +380,6 @@ class Converter: Properties: the result of the conversion of the Msd in the language requested """ - self.check_valid_msd(msd, require_valid_flag) category_char = msd.code[0].lower() value_chars = msd.code[1:] category = self.specifications.find_category_by_code(category_char, msd.language) @@ -413,7 +405,7 @@ class Converter: form_feature_map[feature_name] = feature_value return Properties(category_name, lexeme_feature_map, form_feature_map, language) - def properties_to_msd(self, properties, language, require_valid_flag=False): + def properties_to_msd(self, properties, language, expected_state=Msd.State.FULL): """Convert Properties to Msd. The language of the generated Msd is specified and can differ from the Properties language. @@ -445,8 +437,7 @@ class Converter: msd_code += '-' i += 1 msd_code += position_map[position] - msd = Msd(msd_code, language) - self.check_valid_msd(msd, require_valid_flag) + msd = Msd(msd_code, language, expected_state=expected_state) return msd def msd_to_ud(self, msd, lemma): @@ -459,7 +450,8 @@ class Converter: lemma(str): the lemma of the word form with the MSD """ - self.check_valid_msd(msd, False, allow_partial=False) + if msd.state != Msd.State.FULL: + raise MsdException(f"Msd must be full to be converted to UD.") upos_category, *upos_features = self.mte_to_ud_features[msd.code].split() final_upos = "" @@ -496,3 +488,17 @@ class Converter: def translate_properties(self, properties, language): return self.msd_to_properties(self.properties_to_msd(properties, language), language) + +def _load_default_specifications(): + global DEFAULT_SPECIFICATIONS + resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE) + if resource.is_file(): + try: + with resource.open('rb') as pickle_file: + DEFAULT_SPECIFICATIONS = pickle.load(pickle_file) + except Exception as e: + exit('Could not parse specifications pickle file installed.') + else: + exit("Default specifications not found.") + +_load_default_specifications() \ No newline at end of file