Specifications are loaded on import

This commit is contained in:
2026-04-15 08:23:06 +02:00
parent aef9a3698f
commit bb3c673e29

View File

@@ -4,7 +4,7 @@ import lxml.etree as lxml
from collections import defaultdict from collections import defaultdict
from importlib_resources import files from importlib_resources import files
from enum import Enum from enum import IntEnum
from conversion_utils.utils import xpath_find, get_xml_id from conversion_utils.utils import xpath_find, get_xml_id
@@ -62,10 +62,10 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 8, 'se'), ('zaimek', 8, 'se'), ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')} ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
class MsdState(Enum): class MsdState(IntEnum):
FULL = 1 UNKNOWN = -1
PARTIAL = 2 PARTIAL = 1
UNKNOWN = 3 FULL = 2
class MsdException(Exception): class MsdException(Exception):
pass pass
@@ -266,9 +266,35 @@ class UD:
class Msd: class Msd:
"""JOS msd.""" """JOS msd."""
def __init__(self, code, language): class State(IntEnum):
UNKNOWN = -1
PARTIAL = 1
FULL = 2
def __init__(self, code, language, expected_state=State.FULL, require_valid=False):
self.code = code self.code = code
self.language = language self.language = language
self.expected_state = expected_state
self.require_valid = require_valid
self.state = self._validate_and_get_state()
def _validate_and_get_state(self):
    """Classify this Msd code and check it against the expected state.

    The code is looked up for ``self.language`` in both the full and the
    partial code maps of ``DEFAULT_SPECIFICATIONS``. A code may appear in
    both maps, so every matching state is collected; a code found in
    neither map is ``State.UNKNOWN``.

    Returns:
        Msd.State: the highest-ranked matching state (``State`` is an
        ``IntEnum`` with UNKNOWN < PARTIAL < FULL).

    Raises:
        MsdException: if ``self.expected_state`` is not among the matching
            states and ``self.require_valid`` is true.
    """
    specifications = DEFAULT_SPECIFICATIONS
    states = set()
    if self.code in specifications.codes_map[self.language]:
        states.add(self.State.FULL)
    if self.code in specifications.partial_codes_map[self.language]:
        states.add(self.State.PARTIAL)
    if not states:
        states.add(self.State.UNKNOWN)

    # Resolve the final state up front: this method runs while __init__ is
    # still computing ``self.state``, so that attribute does not exist yet
    # and must not be read here.
    state = max(states)

    if self.expected_state not in states:
        if self.require_valid:
            raise MsdException(
                f"Given msd '{self.code}' is '{state.name}', "
                f"but expected state is '{self.expected_state.name}'."
            )
        if state == self.State.UNKNOWN:
            print(f"[WARN] The Msd '{self.code}' is unknown.")
        else:
            print(f"[WARN] The Msd '{self.code}' is unknown for expected state '{self.expected_state.name}'.")
    return state
def __str__(self): def __str__(self):
return 'code={code}, language={language}'.format(code=self.code, language=self.language) return 'code={code}, language={language}'.format(code=self.code, language=self.language)
@@ -282,15 +308,7 @@ class Converter:
def __init__(self, xml_file_name=None): def __init__(self, xml_file_name=None):
if (xml_file_name is None): if (xml_file_name is None):
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE) self.specifications = DEFAULT_SPECIFICATIONS
if (resource.is_file()):
try:
with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file)
except:
exit('Could not parse specifications pickle file installed.')
else:
exit('No pickle installed or xml provided.')
else: else:
parser = SpecificationsParser() parser = SpecificationsParser()
try: try:
@@ -313,7 +331,7 @@ class Converter:
form_feature_map={}, form_feature_map={},
language=msd.language language=msd.language
) )
return self.properties_to_msd(properties, msd.language).code return self.properties_to_msd(properties, msd.language, expected_state=Msd.State.PARTIAL).code
def _parse_msd_ud_conversion(self, file_name): def _parse_msd_ud_conversion(self, file_name):
"""Parse file with direct conversions from English Msd to Universal Dependencies.""" """Parse file with direct conversions from English Msd to Universal Dependencies."""
@@ -336,32 +354,7 @@ class Converter:
all_rules[priority].append(current_rules) all_rules[priority].append(current_rules)
return all_rules return all_rules
def is_valid_msd(self, msd): def msd_to_properties(self, msd, language, lemma=None, warn_level_flag=False):
"""Verify if the Msd code is in the standard JOS set."""
return msd.code in self.specifications.codes_map[msd.language]
def get_msd_state(self, msd):
    """Classify the Msd code as full, partial or unknown.

    The code is first looked up among the full codes of the Msd's language
    in ``self.specifications``; only if it is absent there are the partial
    codes consulted.

    Returns:
        MsdState: FULL, PARTIAL, or UNKNOWN when neither map has the code.
    """
    code, language = msd.code, msd.language
    if code in self.specifications.codes_map[language]:
        return MsdState.FULL
    if code in self.specifications.partial_codes_map[language]:
        return MsdState.PARTIAL
    return MsdState.UNKNOWN
def check_valid_msd(self, msd, require_valid_flag, allow_partial=True):
    """Raise or warn when the Msd code is not acceptable.

    Arguments:
        msd: the Msd whose code is checked via ``get_msd_state``
        require_valid_flag: when true, an unknown code raises instead of
            only printing a warning
        allow_partial: when false, a partial code raises

    Raises:
        MsdException: for an unknown code with ``require_valid_flag`` set,
            or for a partial code with ``allow_partial`` unset.
    """
    state = self.get_msd_state(msd)

    # The partial and unknown cases are mutually exclusive, so each one
    # can be settled on its own and return early.
    if state == MsdState.PARTIAL:
        if not allow_partial:
            raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.")
        return

    if state != MsdState.UNKNOWN:
        return

    message = f"The msd '{msd.code}' is unknown"
    if require_valid_flag:
        raise MsdException(message)
    print('[WARN] ' + message)
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
"""Convert Msd to Properties. """Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language. The language of the generated Properties is specified and can differ from the Msd language.
@@ -387,7 +380,6 @@ class Converter:
Properties: the result of the conversion of the Msd in the language requested Properties: the result of the conversion of the Msd in the language requested
""" """
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower() category_char = msd.code[0].lower()
value_chars = msd.code[1:] value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language) category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -413,7 +405,7 @@ class Converter:
form_feature_map[feature_name] = feature_value form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language) return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language, require_valid_flag=False): def properties_to_msd(self, properties, language, expected_state=Msd.State.FULL):
"""Convert Properties to Msd. """Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language. The language of the generated Msd is specified and can differ from the Properties language.
@@ -445,8 +437,7 @@ class Converter:
msd_code += '-' msd_code += '-'
i += 1 i += 1
msd_code += position_map[position] msd_code += position_map[position]
msd = Msd(msd_code, language) msd = Msd(msd_code, language, expected_state=expected_state)
self.check_valid_msd(msd, require_valid_flag)
return msd return msd
def msd_to_ud(self, msd, lemma): def msd_to_ud(self, msd, lemma):
@@ -459,7 +450,8 @@ class Converter:
lemma(str): the lemma of the word form with the MSD lemma(str): the lemma of the word form with the MSD
""" """
self.check_valid_msd(msd, False, allow_partial=False) if msd.state != Msd.State.FULL:
raise MsdException(f"Msd must be full to be converted to UD.")
upos_category, *upos_features = self.mte_to_ud_features[msd.code].split() upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()
final_upos = "" final_upos = ""
@@ -496,3 +488,17 @@ class Converter:
def translate_properties(self, properties, language): def translate_properties(self, properties, language):
return self.msd_to_properties(self.properties_to_msd(properties, language), language) return self.msd_to_properties(self.properties_to_msd(properties, language), language)
def _load_default_specifications():
    """Load the packaged specifications pickle into ``DEFAULT_SPECIFICATIONS``.

    The pickle is looked up as ``JOS_SPECIFICATIONS_PICKLE_RESOURCE`` inside
    the ``conversion_utils.resources`` package and the unpickled object is
    stored in the module-level ``DEFAULT_SPECIFICATIONS``.

    Raises:
        SystemExit: if the resource is missing or cannot be unpickled.
    """
    global DEFAULT_SPECIFICATIONS
    resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
    if not resource.is_file():
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive use and is not guaranteed to exist.
        raise SystemExit("Default specifications not found.")
    try:
        with resource.open('rb') as pickle_file:
            # NOTE(review): unpickling can execute arbitrary code; this
            # presumably only ever reads the pickle shipped with the
            # package - confirm it is never replaced by external data.
            DEFAULT_SPECIFICATIONS = pickle.load(pickle_file)
    except Exception as e:
        # Keep the underlying error attached as the cause for debugging.
        raise SystemExit('Could not parse specifications pickle file installed.') from e


# Runs at import time so that DEFAULT_SPECIFICATIONS is populated before any
# Msd or Converter in this module reads it.
_load_default_specifications()