Added conversion from properties to msd

This commit is contained in:
Cyprian Laskowski 2021-09-10 15:10:10 +02:00
parent c078765774
commit 5c5b2c20cc
2 changed files with 135 additions and 49 deletions

View File

@ -18,35 +18,35 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
'residual':{1}, 'residual':{1},
'punctuation':set()} 'punctuation':set()}
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 2, 'medme'), ('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
('pronoun', 2, 'nadme'), ('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
('pronoun', 2, 'name'), ('pronoun', 2, 'name'), ('zaimek', 2, 'name'),
('pronoun', 2, 'obme'), ('pronoun', 2, 'obme'), ('zaimek', 2, 'obme'),
('pronoun', 2, 'podme'), ('pronoun', 2, 'podme'), ('zaimek', 2, 'podme'),
('pronoun', 2, 'pome'), ('pronoun', 2, 'pome'), ('zaimek', 2, 'pome'),
('pronoun', 2, 'predme'), ('pronoun', 2, 'predme'), ('zaimek', 2, 'predme'),
('pronoun', 2, 'skozme'), ('pronoun', 2, 'skozme'), ('zaimek', 2, 'skozme'),
('pronoun', 2, 'vame'), ('pronoun', 2, 'vame'), ('zaimek', 2, 'vame'),
('pronoun', 2, 'zame'), ('pronoun', 2, 'zame'), ('zaimek', 2, 'zame'),
('pronoun', 3, 'tadva'), ('pronoun', 3, 'tadva'), ('zaimek', 3, 'tadva'),
('pronoun', 4, 'tadva'), ('pronoun', 4, 'tadva'), ('zaimek', 4, 'tadva'),
('pronoun', 5, 'čezme'), ('pronoun', 5, 'čezme'), ('zaimek', 5, 'čezme'),
('pronoun', 5, 'medme'), ('pronoun', 5, 'medme'), ('zaimek', 5, 'medme'),
('pronoun', 5, 'nadme'), ('pronoun', 5, 'nadme'), ('zaimek', 5, 'nadme'),
('pronoun', 5, 'name'), ('pronoun', 5, 'name'), ('zaimek', 5, 'name'),
('pronoun', 5, 'obme'), ('pronoun', 5, 'obme'), ('zaimek', 5, 'obme'),
('pronoun', 5, 'podme'), ('pronoun', 5, 'podme'), ('zaimek', 5, 'podme'),
('pronoun', 5, 'pome'), ('pronoun', 5, 'pome'), ('zaimek', 5, 'pome'),
('pronoun', 5, 'predme'), ('pronoun', 5, 'predme'), ('zaimek', 5, 'predme'),
('pronoun', 5, 'skozme'), ('pronoun', 5, 'skozme'), ('zaimek', 5, 'skozme'),
('pronoun', 5, 'vame'), ('pronoun', 5, 'vame'), ('zaimek', 5, 'vame'),
('pronoun', 5, 'zame'), ('pronoun', 5, 'zame'), ('zaimek', 5, 'zame'),
('pronoun', 7, 'njegov'), ('pronoun', 7, 'njegov'), ('zaimek', 7, 'njegov'),
('pronoun', 8, 'jaz'), ('pronoun', 8, 'jaz'), ('zaimek', 8, 'jaz'),
('pronoun', 8, 'on'), ('pronoun', 8, 'on'), ('zaimek', 8, 'on'),
('pronoun', 8, 'se'), ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
('pronoun', 8, 'ti')} ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
class Specifications: class Specifications:
@ -57,9 +57,12 @@ class Specifications:
def add_category(self, category): def add_category(self, category):
self.categories.append(category) self.categories.append(category)
def find_category(self, char, language): def find_category_by_char(self, char, language):
return next((category for category in self.categories if category.char_pair.get(language) == char), None) return next((category for category in self.categories if category.char_pair.get(language) == char), None)
def find_category_by_name(self, name, language):
    """Return the first category whose name in ``language`` equals ``name``.

    Every entry of ``self.categories`` is inspected in stored order and its
    ``string_pair`` is queried with ``.get(language)``. ``None`` is returned
    when no category matches.
    """
    for candidate in self.categories:
        if candidate.string_pair.get(language) == name:
            return candidate
    return None
def __str__(self): def __str__(self):
return 'categories:{categories}'.format(categories=self.categories) return 'categories:{categories}'.format(categories=self.categories)
@ -87,9 +90,12 @@ class Category:
def add_feature(self, feature): def add_feature(self, feature):
self.features.append(feature) self.features.append(feature)
def find_feature(self, position): def find_feature_by_position(self, position):
return next((feature for feature in self.features if feature.position == position), None) return next((feature for feature in self.features if feature.position == position), None)
def find_feature_by_name(self, name, language):
    """Return the first feature whose name in ``language`` equals ``name``.

    Every entry of ``self.features`` is inspected in stored order and its
    ``string_pair`` is queried with ``.get(language)``. ``None`` is returned
    when no feature matches.
    """
    for candidate in self.features:
        if candidate.string_pair.get(language) == name:
            return candidate
    return None
def __str__(self): def __str__(self):
return 'strings:{strings}, chars:{chars}, features:{features}'.\ return 'strings:{strings}, chars:{chars}, features:{features}'.\
format(strings=self.string_pair, chars=self.char_pair, features=self.features) format(strings=self.string_pair, chars=self.char_pair, features=self.features)
@ -106,9 +112,12 @@ class Feature:
def add_value(self, value): def add_value(self, value):
self.values.append(value) self.values.append(value)
def find_value(self, char, language): def find_value_by_char(self, char, language):
return next((value for value in self.values if value.char_pair.get(language) == char), None) return next((value for value in self.values if value.char_pair.get(language) == char), None)
def find_value_by_name(self, name, language):
    """Return the first value whose name in ``language`` equals ``name``.

    Every entry of ``self.values`` is inspected in stored order and its
    ``string_pair`` is queried with ``.get(language)``. ``None`` is returned
    when no value matches.
    """
    for candidate in self.values:
        if candidate.string_pair.get(language) == name:
            return candidate
    return None
def __str__(self): def __str__(self):
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\ return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
format(strings=self.string_pair, position=self.position, level='level' if self.lexeme_level_flag else 'form', values=self.values) format(strings=self.string_pair, position=self.position, level='level' if self.lexeme_level_flag else 'form', values=self.values)
@ -166,20 +175,11 @@ class SpecificationsBuilder:
class Properties: class Properties:
def __init__(self, lemma, category, feature_value_list, language): def __init__(self, category, lexeme_feature_map, form_feature_map, language):
self.category = category
self.lexeme_feature_map = lexeme_feature_map
self.form_feature_map = form_feature_map
self.language = language self.language = language
self.category = category.string_pair.get(language)
self.lexeme_feature_map = {}
self.form_feature_map = {}
for (feature, value) in feature_value_list:
feature_name = feature.string_pair.get(language)
feature_value = value.string_pair.get(language)
level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
if (lexeme_level_flag):
self.lexeme_feature_map[feature_name] = feature_value
else:
self.form_feature_map[feature_name] = feature_value
def __str__(self): def __str__(self):
return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.\ return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.\
@ -204,11 +204,48 @@ class MsdToProperties:
def convert(self, msd, lemma, properties_language): def convert(self, msd, lemma, properties_language):
category_char = msd.code[0].lower() category_char = msd.code[0].lower()
value_chars = msd.code[1:] value_chars = msd.code[1:]
category = self.specifications.find_category(category_char, msd.language) category = self.specifications.find_category_by_char(category_char, msd.language)
category_name = category.string_pair.get(properties_language)
feature_value_list = [] feature_value_list = []
lexeme_feature_map = {}
form_feature_map = {}
for (index, value_char) in enumerate(value_chars, start=1): for (index, value_char) in enumerate(value_chars, start=1):
if (value_char != '-'): if (value_char != '-'):
feature = category.find_feature(index) feature = category.find_feature_by_position(index)
value = feature.find_value(value_char, msd.language) value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.string_pair.get(properties_language)
feature_value = value.string_pair.get(properties_language)
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
feature_value_list.append((feature, value)) feature_value_list.append((feature, value))
return Properties(lemma, category, feature_value_list, properties_language) if (lexeme_level_flag):
lexeme_feature_map[feature_name] = feature_value
else:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
class PropertiesToMsd:
    """Convert a properties object back into a positional MSD code.

    Category, feature and value names carried by the properties are resolved
    against the supplied specifications, and the matching characters are
    assembled into an MSD string for the requested MSD language.
    """

    def __init__(self, specifications):
        """Store the specifications used to resolve names to MSD characters."""
        self.specifications = specifications

    @staticmethod
    def _resolved(found, kind, name, language):
        """Return ``found`` unchanged, or raise ``ValueError`` if it is ``None``."""
        if found is None:
            raise ValueError('{kind} {name!r} not found for language {language!r}'.format(
                kind=kind, name=name, language=language))
        return found

    def convert(self, properties, msd_language):
        """Build the ``Msd`` that corresponds to ``properties``.

        Args:
            properties: object exposing ``category``, ``language``,
                ``lexeme_feature_map`` and ``form_feature_map``. All names
                are looked up in ``properties.language``.
            msd_language: language key used to select the category and
                value characters of the resulting code.

        Returns:
            ``Msd(code, msd_language)``, where ``code`` is the upper-cased
            category character followed by one character per feature
            position, with ``'-'`` filling every unset position that
            precedes a set one.

        Raises:
            ValueError: if the category, a feature name, a feature value, or
                one of their MSD characters cannot be found. Without these
                checks the failure surfaced as an attribute or type error
                on ``None``.
        """
        language = properties.language
        category = self._resolved(
            self.specifications.find_category_by_name(properties.category, language),
            'category', properties.category, language)
        category_char = self._resolved(
            category.char_pair.get(msd_language),
            'MSD character of category', properties.category, msd_language).upper()

        # Form-level entries win over lexeme-level ones on a name clash.
        feature_map = dict(properties.lexeme_feature_map)
        feature_map.update(properties.form_feature_map)

        position_chars = {}
        for feature_name, value_name in feature_map.items():
            feature = self._resolved(
                category.find_feature_by_name(feature_name, language),
                'feature', feature_name, language)
            value = self._resolved(
                feature.find_value_by_name(value_name, language),
                'value', value_name, language)
            position_chars[feature.position] = self._resolved(
                value.char_pair.get(msd_language),
                'MSD character of value', value_name, msd_language)

        parts = [category_char]
        emitted = 0  # number of feature slots written so far
        for position in sorted(position_chars):
            # Pad the skipped positions so each character lands in its slot.
            gap = max(position - emitted - 1, 0)
            parts.append('-' * gap)
            parts.append(position_chars[position])
            emitted += gap + 1
        return Msd(''.join(parts), msd_language)

View File

@ -0,0 +1,49 @@
import os.path
import lxml.etree as lxml
import unittest
from conversion_utils.jos_msds_and_properties import SpecificationsBuilder, PropertiesToMsd, Properties
class JosPropertiesToMsdTestCase(unittest.TestCase):
    """Exercise ``PropertiesToMsd`` with specifications built from ``msd-sl.spc.xml``."""

    def setUp(self):
        """Build a fresh converter from the specifications file next to the tests."""
        spec_path = os.path.join(os.path.dirname(__file__), '../resources/msd-sl.spc.xml')
        self.converter = PropertiesToMsd(SpecificationsBuilder().build(spec_path))

    def _assert_converts(self, properties, msd_language, expected_code):
        """Convert ``properties`` and check the MSD language first, then its code."""
        msd = self.converter.convert(properties, msd_language)
        self.assertEqual(msd.language, msd_language)
        self.assertEqual(msd.code, expected_code)

    def test_en_en(self):
        """English property names to an English MSD code."""
        properties = Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en')
        self._assert_converts(properties, 'en', 'Ncfdn')

    def test_en_sl(self):
        """English property names to a Slovene MSD code."""
        properties = Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en')
        self._assert_converts(properties, 'sl', 'Sozdi')

    def test_sl_en(self):
        """Slovene property names to an English MSD code."""
        properties = Properties('samostalnik', {'vrsta':'občno_ime', 'spol':'ženski'}, {'število':'dvojina', 'sklon':'imenovalnik'}, 'sl')
        self._assert_converts(properties, 'en', 'Ncfdn')

    def test_sl_sl(self):
        """Slovene property names to a Slovene MSD code."""
        properties = Properties('samostalnik', {'vrsta':'občno_ime', 'spol':'ženski'}, {'število':'dvojina', 'sklon':'imenovalnik'}, 'sl')
        self._assert_converts(properties, 'sl', 'Sozdi')

    def test_exception_feature_level(self):
        """Pronoun whose 'naslonskost' entry is passed in the form-level map."""
        properties = Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga'}, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}, 'sl')
        self._assert_converts(properties, 'en', 'Pp2-sd--y')

    def test_normal_feature_level(self):
        """Pronoun whose 'naslonskost' entry is passed in the lexeme-level map."""
        properties = Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl')
        self._assert_converts(properties, 'en', 'Pp2-pd--y')

    def test_featureless(self):
        """A category with no features yields the bare category character."""
        properties = Properties('conjunction', {}, {}, 'en')
        self._assert_converts(properties, 'sl', 'V')