diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index d6fbcd5..9badc0a 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -230,7 +230,10 @@ class Msd: return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language -class ConverterException(Exception): +class CustomException(Exception): + pass + +class MsdException(CustomException): pass class Converter: @@ -253,17 +256,46 @@ class Converter: except: exit('Could not parse specifications xml file provided.') - def msd_to_properties(self, msd, language, lemma=None): - """Convert Msd to Properties (possibly in the other language). + def is_valid_msd(self, msd): + """Verify if the Msd code is in the standard JOS set.""" + return msd.code in self.specifications.codes_map[msd.language] - The level (lexeme vs form) of certain reflexive msd features - depends on the lemma, so set the lemma if you need accurate - level information. - """ + def check_valid_msd(self, msd, require_valid_flag): + """If the Msd code is not valid, raise an exception or give a warning.""" + if (not self.is_valid_msd(msd)): + message = 'The msd {} is unknown'.format(msd.code) + if (require_valid_flag): + raise MsdException(message) + else: + print('[WARN] ' + message) + + def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False): + """Convert Msd to Properties. + + The language of the generated Properties is specified and can differ from the Msd language. + + If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard + JOS set. Otherwise only a warning is given. + + If you care about accurate level information (i.e., which properties are lexeme-level and + which are form-level), note that some features depend on the particular lemma. For such + features, if lemma is not provided and warn_level_flag is True, a warning will be given. 
- if (msd.code not in self.specifications.codes_map[msd.language]): - raise ConverterException('The msd {} is unknown'.format(msd.code)) + If an MSD has dashes in place of letters for certain features, they are skipped, so that + these features are not included in the generated Properties object. + Parameters: + msd(Msd): the JOS MSD to convert + language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene) + lemma(str): the lemma of the word form with the MSD + require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided + warn_level_flag(boolean): whether to warn if the level of a property cannot be determined + + Returns: + Properties: the result of the conversion of the Msd in the language requested + + """ + self.check_valid_msd(msd, require_valid_flag) category_char = msd.code[0].lower() value_chars = msd.code[1:] category = self.specifications.find_category_by_code(category_char, msd.language) @@ -277,8 +309,8 @@ class Converter: value = feature.find_value_by_char(value_char, msd.language) feature_name = feature.names.get(language) feature_value = value.names.get(language) - if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]): - print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.' + if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]): + print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.' 
.format(category=category_name, position=index)) level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag @@ -289,8 +321,21 @@ class Converter: form_feature_map[feature_name] = feature_value return Properties(category_name, lexeme_feature_map, form_feature_map, language) - def properties_to_msd(self, properties, language): - """Convert Properties to msd (possibly in the other language).""" + def properties_to_msd(self, properties, language, require_valid_flag=False): + """Convert Properties to Msd. + + The language of the generated Msd is specified and can differ from the Properties language. + + If require_valid_flag is True, a MsdException is raised if the generated MSD is not in + the standard JOS set. Otherwise only a warning is given. + + Any skipped positions among the Properties are represented as dashes in the MSD. + + Parameters: + properties(Properties): the properties to convert + language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene) + require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated + """ category = self.specifications.find_category_by_name(properties.category, properties.language) category_char = category.codes.get(language).upper() feature_map = properties.lexeme_feature_map.copy() @@ -308,7 +353,9 @@ class Converter: msd_code += '-' i += 1 msd_code += position_map[position] - return Msd(msd_code, language) + msd = Msd(msd_code, language) + self.check_valid_msd(msd, require_valid_flag) + return msd def translate_msd(self, msd, language): return self.properties_to_msd(self.msd_to_properties(msd, language), language) diff --git a/conversion_utils/tests/test_jos_msd_to_properties.py b/conversion_utils/tests/test_jos_msd_to_properties.py index f2f9694..df8bcb8 100644 --- 
a/conversion_utils/tests/test_jos_msd_to_properties.py +++ b/conversion_utils/tests/test_jos_msd_to_properties.py @@ -1,6 +1,6 @@ import unittest -from conversion_utils.jos_msds_and_properties import Converter, Msd +from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException class JosMsdToPropertiesTestCase(unittest.TestCase): @@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase): self.assertEqual(properties.category, 'punctuation') self.assertEqual(properties.lexeme_feature_map, {}) self.assertEqual(properties.form_feature_map, {}) + + def test_good_msd_with_require_valid(self): + properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en', require_valid_flag=True) + self.assertEqual(properties.language, 'en') + self.assertEqual(properties.category, 'noun') + self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'}) + self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'}) + + def test_bad_msd(self): + properties = self.converter.msd_to_properties(Msd('N---d', 'en'), 'en') + self.assertEqual(properties.language, 'en') + self.assertEqual(properties.category, 'noun') + self.assertEqual(properties.lexeme_feature_map, {}) + self.assertEqual(properties.form_feature_map, {'case':'dative'}) + + def test_bad_msd_with_require_valid(self): + try: + self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True) + fails = False + except MsdException: + fails = True + self.assertEqual(fails, True) diff --git a/conversion_utils/tests/test_jos_properties_to_msd.py b/conversion_utils/tests/test_jos_properties_to_msd.py index cb6230d..16ee493 100644 --- a/conversion_utils/tests/test_jos_properties_to_msd.py +++ b/conversion_utils/tests/test_jos_properties_to_msd.py @@ -1,6 +1,6 @@ import unittest -from conversion_utils.jos_msds_and_properties import Converter, Properties +from conversion_utils.jos_msds_and_properties import Converter, Properties, 
MsdException class JosPropertiesToMsdTestCase(unittest.TestCase): @@ -41,3 +41,21 @@ class JosPropertiesToMsdTestCase(unittest.TestCase): msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl') self.assertEqual(msd.language, 'sl') self.assertEqual(msd.code, 'U') + + def test_good_msd_with_require_valid(self): + msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en', require_valid_flag=True) + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Ncfdn') + + def test_bad_msd(self): + msd = self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en') + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Nc-d') + + def test_bad_msd_with_require_valid(self): + try: + self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True) + fails = False + except MsdException: + fails = True + self.assertEqual(fails, True)