Check whether MSDs are valid

This commit is contained in:
Cyprian Laskowski 2021-09-10 16:27:57 +02:00
parent 67382bb74f
commit eca02ebdd3
3 changed files with 29 additions and 16 deletions

View File

@ -53,12 +53,16 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
class Specifications: class Specifications:
"""JOS specifications with list of all word categories.""" """JOS specifications with list of all word categories."""
def __init__(self, *categories): def __init__(self):
self.categories = list(categories) self.categories = []
self.codes_map = {'en':set(), 'sl':set()}
def add_category(self, category): def add_category(self, category):
self.categories.append(category) self.categories.append(category)
def add_code(self, code, language):
self.codes_map[language].add(code)
def find_category_by_code(self, char, language): def find_category_by_code(self, char, language):
return next((category for category in self.categories if category.codes.get(language) == char), None) return next((category for category in self.categories if category.codes.get(language) == char), None)
@ -145,10 +149,17 @@ class SpecificationsParser:
def parse(self, file_name): def parse(self, file_name):
root = lxml.parse(file_name).getroot() root = lxml.parse(file_name).getroot()
specifications = Specifications()
div_elements = xpath_find(root, 'tei:div') div_elements = xpath_find(root, 'tei:div')
specifications = Specifications()
for div_element in div_elements: for div_element in div_elements:
if (re.match(r'^msd\..-sl', get_xml_id(div_element))): xml_id = get_xml_id(div_element)
if (xml_id == 'msd.msds-sl'):
msd_elements = xpath_find(div_element, 'tei:table/tei:row[@role="msd"]')
for msd_element in msd_elements:
msd_codes = self.get_cell_pair(msd_element, 'msd')
specifications.add_code(msd_codes.get('en').capitalize(), 'en')
specifications.add_code(msd_codes.get('sl').capitalize(), 'sl')
elif (re.match(r'^msd\..-sl', xml_id)):
category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0] category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
category_names = self.get_cell_pair(category_element, 'value') category_names = self.get_cell_pair(category_element, 'value')
category_codes = self.get_cell_pair(category_element, 'code') category_codes = self.get_cell_pair(category_element, 'code')
@ -214,6 +225,9 @@ class Converter:
def msd_to_properties(self, msd, language, lemma=None): def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language).""" """Convert Msd to Properties (possibly in the other language)."""
if (msd.code not in self.specifications.codes_map[msd.language]):
exit('[ERROR] msd {} is unknown'.format(msd.code))
category_char = msd.code[0].lower() category_char = msd.code[0].lower()
value_chars = msd.code[1:] value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language) category = self.specifications.find_category_by_code(category_char, msd.language)

View File

@ -39,23 +39,23 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'}) self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti') def test_exception_feature_level(self):
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti') properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
self.assertEqual(properties.language, 'sl') self.assertEqual(properties.language, 'sl')
self.assertEqual(properties.category, 'zaimek') self.assertEqual(properties.category, 'zaimek')
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'}) self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}) self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test def test_normal_feature_level(self):
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi') properties = self.converter.msd_to_properties(Msd('Px------y', 'en'), 'sl', 'jst')
self.assertEqual(properties.language, 'sl') self.assertEqual(properties.language, 'sl')
self.assertEqual(properties.category, 'zaimek') self.assertEqual(properties.category, 'zaimek')
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}) self.assertEqual(properties.lexeme_feature_map, {'vrsta':'povratni', 'naslonskost':'klitična'})
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) self.assertEqual(properties.form_feature_map, {})
def test_featureless(self): def test_featureless(self):
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en') properties = self.converter.msd_to_properties(Msd('U', 'sl'), 'en')
self.assertEqual(properties.language, 'en') self.assertEqual(properties.language, 'en')
self.assertEqual(properties.category, 'conjunction') self.assertEqual(properties.category, 'punctuation')
self.assertEqual(properties.lexeme_feature_map, {}) self.assertEqual(properties.lexeme_feature_map, {})
self.assertEqual(properties.form_feature_map, {}) self.assertEqual(properties.form_feature_map, {})

View File

@ -37,12 +37,11 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
self.assertEqual(msd.code, 'Pp2-sd--y') self.assertEqual(msd.code, 'Pp2-sd--y')
def test_normal_feature_level(self): def test_normal_feature_level(self):
msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl'), 'en') msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'povratni', 'naslonskost':'klitična'}, {}, 'sl'), 'en')
self.assertEqual(msd.language, 'en') self.assertEqual(msd.language, 'en')
self.assertEqual(msd.code, 'Pp2-pd--y') self.assertEqual(msd.code, 'Px------y')
def test_featureless(self): def test_featureless(self):
msd = self.converter.properties_to_msd(Properties('conjunction', {}, {}, 'en'), 'sl') msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
self.assertEqual(msd.language, 'sl') self.assertEqual(msd.language, 'sl')
self.assertEqual(msd.code, 'V') self.assertEqual(msd.code, 'U')