Check that MSDs are valid (reject MSD codes not listed in the specifications)
This commit is contained in:
parent
67382bb74f
commit
eca02ebdd3
|
@ -53,12 +53,16 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
|||
class Specifications:
|
||||
"""JOS specifications with list of all word categories."""
|
||||
|
||||
def __init__(self, *categories):
|
||||
self.categories = list(categories)
|
||||
def __init__(self):
|
||||
self.categories = []
|
||||
self.codes_map = {'en':set(), 'sl':set()}
|
||||
|
||||
def add_category(self, category):
    """Register a word category with these specifications.

    The category is appended to ``self.categories``; insertion order is
    kept, and no de-duplication is performed.
    """
    self.categories.append(category)
|
||||
|
||||
def add_code(self, code, language):
    """Record *code* as a known MSD code for *language*.

    ``self.codes_map`` maps a language key to a set of codes, so adding
    the same code twice has no further effect.

    Raises:
        KeyError: if *language* is not already a key of ``self.codes_map``.
    """
    self.codes_map[language].add(code)
|
||||
|
||||
def find_category_by_code(self, char, language):
    """Return the first category whose code for *language* equals *char*.

    Each entry of ``self.categories`` is expected to expose a ``codes``
    mapping that is queried with ``codes.get(language)``.

    Returns:
        The first matching category in list order, or ``None`` when no
        category matches (including when a category has no code for
        *language*).
    """
    for category in self.categories:
        if category.codes.get(language) == char:
            return category
    return None
|
||||
|
||||
|
@ -145,10 +149,17 @@ class SpecificationsParser:
|
|||
|
||||
def parse(self, file_name):
|
||||
root = lxml.parse(file_name).getroot()
|
||||
specifications = Specifications()
|
||||
div_elements = xpath_find(root, 'tei:div')
|
||||
specifications = Specifications()
|
||||
for div_element in div_elements:
|
||||
if (re.match(r'^msd\..-sl', get_xml_id(div_element))):
|
||||
xml_id = get_xml_id(div_element)
|
||||
if (xml_id == 'msd.msds-sl'):
|
||||
msd_elements = xpath_find(div_element, 'tei:table/tei:row[@role="msd"]')
|
||||
for msd_element in msd_elements:
|
||||
msd_codes = self.get_cell_pair(msd_element, 'msd')
|
||||
specifications.add_code(msd_codes.get('en').capitalize(), 'en')
|
||||
specifications.add_code(msd_codes.get('sl').capitalize(), 'sl')
|
||||
elif (re.match(r'^msd\..-sl', xml_id)):
|
||||
category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
|
||||
category_names = self.get_cell_pair(category_element, 'value')
|
||||
category_codes = self.get_cell_pair(category_element, 'code')
|
||||
|
@ -214,6 +225,9 @@ class Converter:
|
|||
def msd_to_properties(self, msd, language, lemma=None):
|
||||
"""Convert Msd to Properties (possibly in the other language)."""
|
||||
|
||||
if (msd.code not in self.specifications.codes_map[msd.language]):
|
||||
exit('[ERROR] msd {} is unknown'.format(msd.code))
|
||||
|
||||
category_char = msd.code[0].lower()
|
||||
value_chars = msd.code[1:]
|
||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||
|
|
|
@ -39,23 +39,23 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
|||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||
|
||||
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti')
|
||||
def test_exception_feature_level(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'zaimek')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
||||
|
||||
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi')
|
||||
def test_normal_feature_level(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Px------y', 'en'), 'sl', 'jst')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'zaimek')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'povratni', 'naslonskost':'klitična'})
|
||||
self.assertEqual(properties.form_feature_map, {})
|
||||
|
||||
def test_featureless(self):
|
||||
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en')
|
||||
properties = self.converter.msd_to_properties(Msd('U', 'sl'), 'en')
|
||||
self.assertEqual(properties.language, 'en')
|
||||
self.assertEqual(properties.category, 'conjunction')
|
||||
self.assertEqual(properties.category, 'punctuation')
|
||||
self.assertEqual(properties.lexeme_feature_map, {})
|
||||
self.assertEqual(properties.form_feature_map, {})
|
||||
|
|
|
@ -37,12 +37,11 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
|
|||
self.assertEqual(msd.code, 'Pp2-sd--y')
|
||||
|
||||
def test_normal_feature_level(self):
|
||||
msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl'), 'en')
|
||||
msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'povratni', 'naslonskost':'klitična'}, {}, 'sl'), 'en')
|
||||
self.assertEqual(msd.language, 'en')
|
||||
self.assertEqual(msd.code, 'Pp2-pd--y')
|
||||
self.assertEqual(msd.code, 'Px------y')
|
||||
|
||||
def test_featureless(self):
|
||||
msd = self.converter.properties_to_msd(Properties('conjunction', {}, {}, 'en'), 'sl')
|
||||
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
||||
self.assertEqual(msd.language, 'sl')
|
||||
self.assertEqual(msd.code, 'V')
|
||||
|
||||
self.assertEqual(msd.code, 'U')
|
||||
|
|
Loading…
Reference in New Issue
Block a user