Checked whether MSDs are valid
This commit is contained in:
parent
67382bb74f
commit
eca02ebdd3
|
@ -53,12 +53,16 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
||||||
class Specifications:
|
class Specifications:
|
||||||
"""JOS specifications with list of all word categories."""
|
"""JOS specifications with list of all word categories."""
|
||||||
|
|
||||||
def __init__(self, *categories):
|
def __init__(self):
|
||||||
self.categories = list(categories)
|
self.categories = []
|
||||||
|
self.codes_map = {'en':set(), 'sl':set()}
|
||||||
|
|
||||||
def add_category(self, category):
|
def add_category(self, category):
|
||||||
self.categories.append(category)
|
self.categories.append(category)
|
||||||
|
|
||||||
|
def add_code(self, code, language):
|
||||||
|
self.codes_map[language].add(code)
|
||||||
|
|
||||||
def find_category_by_code(self, char, language):
|
def find_category_by_code(self, char, language):
|
||||||
return next((category for category in self.categories if category.codes.get(language) == char), None)
|
return next((category for category in self.categories if category.codes.get(language) == char), None)
|
||||||
|
|
||||||
|
@ -145,10 +149,17 @@ class SpecificationsParser:
|
||||||
|
|
||||||
def parse(self, file_name):
|
def parse(self, file_name):
|
||||||
root = lxml.parse(file_name).getroot()
|
root = lxml.parse(file_name).getroot()
|
||||||
specifications = Specifications()
|
|
||||||
div_elements = xpath_find(root, 'tei:div')
|
div_elements = xpath_find(root, 'tei:div')
|
||||||
|
specifications = Specifications()
|
||||||
for div_element in div_elements:
|
for div_element in div_elements:
|
||||||
if (re.match(r'^msd\..-sl', get_xml_id(div_element))):
|
xml_id = get_xml_id(div_element)
|
||||||
|
if (xml_id == 'msd.msds-sl'):
|
||||||
|
msd_elements = xpath_find(div_element, 'tei:table/tei:row[@role="msd"]')
|
||||||
|
for msd_element in msd_elements:
|
||||||
|
msd_codes = self.get_cell_pair(msd_element, 'msd')
|
||||||
|
specifications.add_code(msd_codes.get('en').capitalize(), 'en')
|
||||||
|
specifications.add_code(msd_codes.get('sl').capitalize(), 'sl')
|
||||||
|
elif (re.match(r'^msd\..-sl', xml_id)):
|
||||||
category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
|
category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
|
||||||
category_names = self.get_cell_pair(category_element, 'value')
|
category_names = self.get_cell_pair(category_element, 'value')
|
||||||
category_codes = self.get_cell_pair(category_element, 'code')
|
category_codes = self.get_cell_pair(category_element, 'code')
|
||||||
|
@ -214,6 +225,9 @@ class Converter:
|
||||||
def msd_to_properties(self, msd, language, lemma=None):
|
def msd_to_properties(self, msd, language, lemma=None):
|
||||||
"""Convert Msd to Properties (possibly in the other language)."""
|
"""Convert Msd to Properties (possibly in the other language)."""
|
||||||
|
|
||||||
|
if (msd.code not in self.specifications.codes_map[msd.language]):
|
||||||
|
exit('[ERROR] msd {} is unknown'.format(msd.code))
|
||||||
|
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||||
|
|
|
@ -39,23 +39,23 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||||
|
|
||||||
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti')
|
def test_exception_feature_level(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
|
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'zaimek')
|
self.assertEqual(properties.category, 'zaimek')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
||||||
|
|
||||||
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test
|
def test_normal_feature_level(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi')
|
properties = self.converter.msd_to_properties(Msd('Px------y', 'en'), 'sl', 'jst')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'zaimek')
|
self.assertEqual(properties.category, 'zaimek')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'povratni', 'naslonskost':'klitična'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
self.assertEqual(properties.form_feature_map, {})
|
||||||
|
|
||||||
def test_featureless(self):
|
def test_featureless(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en')
|
properties = self.converter.msd_to_properties(Msd('U', 'sl'), 'en')
|
||||||
self.assertEqual(properties.language, 'en')
|
self.assertEqual(properties.language, 'en')
|
||||||
self.assertEqual(properties.category, 'conjunction')
|
self.assertEqual(properties.category, 'punctuation')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {})
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
self.assertEqual(properties.form_feature_map, {})
|
self.assertEqual(properties.form_feature_map, {})
|
||||||
|
|
|
@ -37,12 +37,11 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
|
||||||
self.assertEqual(msd.code, 'Pp2-sd--y')
|
self.assertEqual(msd.code, 'Pp2-sd--y')
|
||||||
|
|
||||||
def test_normal_feature_level(self):
|
def test_normal_feature_level(self):
|
||||||
msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl'), 'en')
|
msd = self.converter.properties_to_msd(Properties('zaimek', {'vrsta':'povratni', 'naslonskost':'klitična'}, {}, 'sl'), 'en')
|
||||||
self.assertEqual(msd.language, 'en')
|
self.assertEqual(msd.language, 'en')
|
||||||
self.assertEqual(msd.code, 'Pp2-pd--y')
|
self.assertEqual(msd.code, 'Px------y')
|
||||||
|
|
||||||
def test_featureless(self):
|
def test_featureless(self):
|
||||||
msd = self.converter.properties_to_msd(Properties('conjunction', {}, {}, 'en'), 'sl')
|
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
||||||
self.assertEqual(msd.language, 'sl')
|
self.assertEqual(msd.language, 'sl')
|
||||||
self.assertEqual(msd.code, 'V')
|
self.assertEqual(msd.code, 'U')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user