diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index 24bf56f..0c599a3 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -53,12 +53,16 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'), class Specifications: """JOS specifications with list of all word categories.""" - def __init__(self, *categories): - self.categories = list(categories) + def __init__(self): + self.categories = [] + self.codes_map = {'en':set(), 'sl':set()} def add_category(self, category): self.categories.append(category) + def add_code(self, code, language): + self.codes_map[language].add(code) + def find_category_by_code(self, char, language): return next((category for category in self.categories if category.codes.get(language) == char), None) @@ -145,10 +149,17 @@ class SpecificationsParser: def parse(self, file_name): root = lxml.parse(file_name).getroot() - specifications = Specifications() div_elements = xpath_find(root, 'tei:div') + specifications = Specifications() for div_element in div_elements: - if (re.match(r'^msd\..-sl', get_xml_id(div_element))): + xml_id = get_xml_id(div_element) + if (xml_id == 'msd.msds-sl'): + msd_elements = xpath_find(div_element, 'tei:table/tei:row[@role="msd"]') + for msd_element in msd_elements: + msd_codes = self.get_cell_pair(msd_element, 'msd') + specifications.add_code(msd_codes.get('en').capitalize(), 'en') + specifications.add_code(msd_codes.get('sl').capitalize(), 'sl') + elif (re.match(r'^msd\..-sl', xml_id)): category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0] category_names = self.get_cell_pair(category_element, 'value') category_codes = self.get_cell_pair(category_element, 'code') @@ -214,6 +225,9 @@ class Converter: def msd_to_properties(self, msd, language, lemma=None): """Convert Msd to Properties (possibly in the other language).""" + if (msd.code not in self.specifications.codes_map[msd.language]): + exit('[ERROR] msd {} is unknown'.format(msd.code)) + category_char = msd.code[0].lower() value_chars = msd.code[1:] category = self.specifications.find_category_by_code(category_char, msd.language) diff --git a/tests/test_jos_msd_to_properties.py b/tests/test_jos_msd_to_properties.py index f4afb49..42faa72 100644 --- a/tests/test_jos_msd_to_properties.py +++ b/tests/test_jos_msd_to_properties.py @@ -39,23 +39,23 @@ class JosMsdToPropertiesTestCase(unittest.TestCase): self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'}) self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) - def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti') + def test_exception_feature_level(self): properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'zaimek') self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'}) self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}) - def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test - properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi') + def test_normal_feature_level(self): + properties = self.converter.msd_to_properties(Msd('Px------y', 'en'), 'sl', 'jst') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'zaimek') - self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}) - self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) + self.assertEqual(properties.lexeme_feature_map, {'vrsta':'povratni', 'naslonskost':'klitična'}) + self.assertEqual(properties.form_feature_map, {}) def test_featureless(self): - properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en') + properties = self.converter.msd_to_properties(Msd('U', 'sl'), 'en') self.assertEqual(properties.language, 'en') - self.assertEqual(properties.category, 'conjunction') + self.assertEqual(properties.category, 'punctuation') self.assertEqual(properties.lexeme_feature_map, {}) self.assertEqual(properties.form_feature_map, {}) diff --git a/tests/test_jos_properties_to_msd.py b/tests/test_jos_properties_to_msd.py index 5728bdd..13f609d 100644 --- a/tests/test_jos_properties_to_msd.py +++ b/tests/test_jos_properties_to_msd.py @@ -37,12 +37,11 @@ class JosPropertiesToMsdTestCase(unittest.TestCase): self.assertEqual(msd.code, 'Pp2-sd--y') def test_normal_feature_level(self): - msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl'), 'en') + msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'povratni', 'naslonskost':'klitična'}, {}, 'sl'), 'en') self.assertEqual(msd.language, 'en') - self.assertEqual(msd.code, 'Pp2-pd--y') + self.assertEqual(msd.code, 'Px------y') def test_featureless(self): - msd = self.converter.properties_to_msd(Properties('conjunction', {}, {}, 'en'), 'sl') + msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl') self.assertEqual(msd.language, 'sl') - self.assertEqual(msd.code, 'V') - \ No newline at end of file + self.assertEqual(msd.code, 'U')