Added conversion from properties to msd
This commit is contained in:
parent
c078765774
commit
5c5b2c20cc
|
@ -18,35 +18,35 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||||
'residual':{1},
|
'residual':{1},
|
||||||
'punctuation':set()}
|
'punctuation':set()}
|
||||||
|
|
||||||
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'),
|
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
||||||
('pronoun', 2, 'medme'),
|
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
|
||||||
('pronoun', 2, 'nadme'),
|
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
|
||||||
('pronoun', 2, 'name'),
|
('pronoun', 2, 'name'), ('zaimek', 2, 'name'),
|
||||||
('pronoun', 2, 'obme'),
|
('pronoun', 2, 'obme'), ('zaimek', 2, 'obme'),
|
||||||
('pronoun', 2, 'podme'),
|
('pronoun', 2, 'podme'), ('zaimek', 2, 'podme'),
|
||||||
('pronoun', 2, 'pome'),
|
('pronoun', 2, 'pome'), ('zaimek', 2, 'pome'),
|
||||||
('pronoun', 2, 'predme'),
|
('pronoun', 2, 'predme'), ('zaimek', 2, 'predme'),
|
||||||
('pronoun', 2, 'skozme'),
|
('pronoun', 2, 'skozme'), ('zaimek', 2, 'skozme'),
|
||||||
('pronoun', 2, 'vame'),
|
('pronoun', 2, 'vame'), ('zaimek', 2, 'vame'),
|
||||||
('pronoun', 2, 'zame'),
|
('pronoun', 2, 'zame'), ('zaimek', 2, 'zame'),
|
||||||
('pronoun', 3, 'tadva'),
|
('pronoun', 3, 'tadva'), ('zaimek', 3, 'tadva'),
|
||||||
('pronoun', 4, 'tadva'),
|
('pronoun', 4, 'tadva'), ('zaimek', 4, 'tadva'),
|
||||||
('pronoun', 5, 'čezme'),
|
('pronoun', 5, 'čezme'), ('zaimek', 5, 'čezme'),
|
||||||
('pronoun', 5, 'medme'),
|
('pronoun', 5, 'medme'), ('zaimek', 5, 'medme'),
|
||||||
('pronoun', 5, 'nadme'),
|
('pronoun', 5, 'nadme'), ('zaimek', 5, 'nadme'),
|
||||||
('pronoun', 5, 'name'),
|
('pronoun', 5, 'name'), ('zaimek', 5, 'name'),
|
||||||
('pronoun', 5, 'obme'),
|
('pronoun', 5, 'obme'), ('zaimek', 5, 'obme'),
|
||||||
('pronoun', 5, 'podme'),
|
('pronoun', 5, 'podme'), ('zaimek', 5, 'podme'),
|
||||||
('pronoun', 5, 'pome'),
|
('pronoun', 5, 'pome'), ('zaimek', 5, 'pome'),
|
||||||
('pronoun', 5, 'predme'),
|
('pronoun', 5, 'predme'), ('zaimek', 5, 'predme'),
|
||||||
('pronoun', 5, 'skozme'),
|
('pronoun', 5, 'skozme'), ('zaimek', 5, 'skozme'),
|
||||||
('pronoun', 5, 'vame'),
|
('pronoun', 5, 'vame'), ('zaimek', 5, 'vame'),
|
||||||
('pronoun', 5, 'zame'),
|
('pronoun', 5, 'zame'), ('zaimek', 5, 'zame'),
|
||||||
('pronoun', 7, 'njegov'),
|
('pronoun', 7, 'njegov'), ('zaimek', 7, 'njegov'),
|
||||||
('pronoun', 8, 'jaz'),
|
('pronoun', 8, 'jaz'), ('zaimek', 8, 'jaz'),
|
||||||
('pronoun', 8, 'on'),
|
('pronoun', 8, 'on'), ('zaimek', 8, 'on'),
|
||||||
('pronoun', 8, 'se'),
|
('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
|
||||||
('pronoun', 8, 'ti')}
|
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
|
||||||
|
|
||||||
|
|
||||||
class Specifications:
|
class Specifications:
|
||||||
|
@ -57,9 +57,12 @@ class Specifications:
|
||||||
def add_category(self, category):
|
def add_category(self, category):
|
||||||
self.categories.append(category)
|
self.categories.append(category)
|
||||||
|
|
||||||
def find_category(self, char, language):
|
def find_category_by_char(self, char, language):
|
||||||
return next((category for category in self.categories if category.char_pair.get(language) == char), None)
|
return next((category for category in self.categories if category.char_pair.get(language) == char), None)
|
||||||
|
|
||||||
|
def find_category_by_name(self, name, language):
|
||||||
|
return next((category for category in self.categories if category.string_pair.get(language) == name), None)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'categories:{categories}'.format(categories=self.categories)
|
return 'categories:{categories}'.format(categories=self.categories)
|
||||||
|
|
||||||
|
@ -87,9 +90,12 @@ class Category:
|
||||||
def add_feature(self, feature):
|
def add_feature(self, feature):
|
||||||
self.features.append(feature)
|
self.features.append(feature)
|
||||||
|
|
||||||
def find_feature(self, position):
|
def find_feature_by_position(self, position):
|
||||||
return next((feature for feature in self.features if feature.position == position), None)
|
return next((feature for feature in self.features if feature.position == position), None)
|
||||||
|
|
||||||
|
def find_feature_by_name(self, name, language):
|
||||||
|
return next((feature for feature in self.features if feature.string_pair.get(language) == name), None)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'strings:{strings}, chars:{chars}, features:{features}'.\
|
return 'strings:{strings}, chars:{chars}, features:{features}'.\
|
||||||
format(strings=self.string_pair, chars=self.char_pair, features=self.features)
|
format(strings=self.string_pair, chars=self.char_pair, features=self.features)
|
||||||
|
@ -106,9 +112,12 @@ class Feature:
|
||||||
def add_value(self, value):
|
def add_value(self, value):
|
||||||
self.values.append(value)
|
self.values.append(value)
|
||||||
|
|
||||||
def find_value(self, char, language):
|
def find_value_by_char(self, char, language):
|
||||||
return next((value for value in self.values if value.char_pair.get(language) == char), None)
|
return next((value for value in self.values if value.char_pair.get(language) == char), None)
|
||||||
|
|
||||||
|
def find_value_by_name(self, name, language):
|
||||||
|
return next((value for value in self.values if value.string_pair.get(language) == name), None)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
|
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
|
||||||
format(strings=self.string_pair, position=self.position, level='level' if self.lexeme_level_flag else 'form', values=self.values)
|
format(strings=self.string_pair, position=self.position, level='level' if self.lexeme_level_flag else 'form', values=self.values)
|
||||||
|
@ -166,20 +175,11 @@ class SpecificationsBuilder:
|
||||||
|
|
||||||
class Properties:
|
class Properties:
|
||||||
|
|
||||||
def __init__(self, lemma, category, feature_value_list, language):
|
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
|
||||||
|
self.category = category
|
||||||
|
self.lexeme_feature_map = lexeme_feature_map
|
||||||
|
self.form_feature_map = form_feature_map
|
||||||
self.language = language
|
self.language = language
|
||||||
self.category = category.string_pair.get(language)
|
|
||||||
self.lexeme_feature_map = {}
|
|
||||||
self.form_feature_map = {}
|
|
||||||
for (feature, value) in feature_value_list:
|
|
||||||
feature_name = feature.string_pair.get(language)
|
|
||||||
feature_value = value.string_pair.get(language)
|
|
||||||
level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS
|
|
||||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
|
||||||
if (lexeme_level_flag):
|
|
||||||
self.lexeme_feature_map[feature_name] = feature_value
|
|
||||||
else:
|
|
||||||
self.form_feature_map[feature_name] = feature_value
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.\
|
return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.\
|
||||||
|
@ -204,11 +204,48 @@ class MsdToProperties:
|
||||||
def convert(self, msd, lemma, properties_language):
|
def convert(self, msd, lemma, properties_language):
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = self.specifications.find_category(category_char, msd.language)
|
category = self.specifications.find_category_by_char(category_char, msd.language)
|
||||||
|
category_name = category.string_pair.get(properties_language)
|
||||||
feature_value_list = []
|
feature_value_list = []
|
||||||
|
lexeme_feature_map = {}
|
||||||
|
form_feature_map = {}
|
||||||
for (index, value_char) in enumerate(value_chars, start=1):
|
for (index, value_char) in enumerate(value_chars, start=1):
|
||||||
if (value_char != '-'):
|
if (value_char != '-'):
|
||||||
feature = category.find_feature(index)
|
feature = category.find_feature_by_position(index)
|
||||||
value = feature.find_value(value_char, msd.language)
|
value = feature.find_value_by_char(value_char, msd.language)
|
||||||
|
feature_name = feature.string_pair.get(properties_language)
|
||||||
|
feature_value = value.string_pair.get(properties_language)
|
||||||
|
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||||
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||||
feature_value_list.append((feature, value))
|
feature_value_list.append((feature, value))
|
||||||
return Properties(lemma, category, feature_value_list, properties_language)
|
if (lexeme_level_flag):
|
||||||
|
lexeme_feature_map[feature_name] = feature_value
|
||||||
|
else:
|
||||||
|
form_feature_map[feature_name] = feature_value
|
||||||
|
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
|
||||||
|
|
||||||
|
|
||||||
|
class PropertiesToMsd:
    """Convert a ``Properties`` object into an ``Msd`` code.

    The conversion is the inverse of splitting an MSD into named features:
    the category name and every feature/value name are looked up in the
    specifications and replaced by their one-character codes.
    """

    def __init__(self, specifications):
        """Keep the specifications used for category/feature/value lookups."""
        self.specifications = specifications

    def convert(self, properties, msd_language):
        """Build the MSD that corresponds to ``properties``.

        Args:
            properties: object exposing ``category``, ``language``,
                ``lexeme_feature_map`` and ``form_feature_map``; the names in
                it are interpreted in ``properties.language``.
            msd_language: language key used to pick the characters of the
                resulting MSD code.

        Returns:
            ``Msd(code, msd_language)`` where ``code`` is the upper-cased
            category character followed by one character per feature
            position; positions without a value before the last set position
            are written as ``'-'`` and nothing is appended after it.

        Raises:
            ValueError: if the category, a feature or a value is not found in
                the specifications, or has no character for ``msd_language``.
        """
        source_language = properties.language

        category = self.specifications.find_category_by_name(properties.category, source_language)
        if category is None:
            raise ValueError('Unknown category {category!r} for language {language!r}'.
                             format(category=properties.category, language=source_language))
        category_char = category.char_pair.get(msd_language)
        if category_char is None:
            raise ValueError('Category {category!r} has no character for language {language!r}'.
                             format(category=properties.category, language=msd_language))

        # Lexeme-level and form-level features share one position space in the
        # MSD; form-level entries are applied last, so they win on a name clash.
        feature_map = dict(properties.lexeme_feature_map)
        feature_map.update(properties.form_feature_map)

        position_map = {}
        for (feature_name, value_name) in feature_map.items():
            feature = category.find_feature_by_name(feature_name, source_language)
            if feature is None:
                raise ValueError('Unknown feature {feature!r} of category {category!r} for language {language!r}'.
                                 format(feature=feature_name, category=properties.category, language=source_language))
            value = feature.find_value_by_name(value_name, source_language)
            if value is None:
                raise ValueError('Unknown value {value!r} of feature {feature!r} for language {language!r}'.
                                 format(value=value_name, feature=feature_name, language=source_language))
            value_char = value.char_pair.get(msd_language)
            if value_char is None:
                raise ValueError('Value {value!r} of feature {feature!r} has no character for language {language!r}'.
                                 format(value=value_name, feature=feature_name, language=msd_language))
            position_map[feature.position] = value_char

        # Positions are assumed to be 1-based integers (TODO confirm against
        # the specifications); gaps up to the last set position become '-'.
        last_position = max(position_map) if position_map else 0
        value_chars = [position_map.get(position, '-') for position in range(1, last_position + 1)]
        msd_code = category_char.upper() + ''.join(value_chars)
        return Msd(msd_code, msd_language)
|
||||||
|
|
49
tests/test_jos_properties_to_msd.py
Normal file
49
tests/test_jos_properties_to_msd.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
import os.path
|
||||||
|
import lxml.etree as lxml
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from conversion_utils.jos_msds_and_properties import SpecificationsBuilder, PropertiesToMsd, Properties
|
||||||
|
|
||||||
|
class JosPropertiesToMsdTestCase(unittest.TestCase):
    """Checks PropertiesToMsd against the JOS specifications resource file."""

    def setUp(self):
        # The specifications file is resolved relative to this test module.
        tests_directory = os.path.dirname(__file__)
        specifications_path = os.path.join(tests_directory, '../resources/msd-sl.spc.xml')
        self.converter = PropertiesToMsd(SpecificationsBuilder().build(specifications_path))

    def _assert_msd(self, properties, msd_language, expected_code):
        # Convert once, then check the language before the code.
        msd = self.converter.convert(properties, msd_language)
        self.assertEqual(msd.language, msd_language)
        self.assertEqual(msd.code, expected_code)

    def test_en_en(self):
        lexeme_features = {'type':'common', 'gender':'feminine'}
        form_features = {'number':'dual', 'case':'nominative'}
        self._assert_msd(Properties('noun', lexeme_features, form_features, 'en'), 'en', 'Ncfdn')

    def test_en_sl(self):
        lexeme_features = {'type':'common', 'gender':'feminine'}
        form_features = {'number':'dual', 'case':'nominative'}
        self._assert_msd(Properties('noun', lexeme_features, form_features, 'en'), 'sl', 'Sozdi')

    def test_sl_en(self):
        lexeme_features = {'vrsta':'občno_ime', 'spol':'ženski'}
        form_features = {'število':'dvojina', 'sklon':'imenovalnik'}
        self._assert_msd(Properties('samostalnik', lexeme_features, form_features, 'sl'), 'en', 'Ncfdn')

    def test_sl_sl(self):
        lexeme_features = {'vrsta':'občno_ime', 'spol':'ženski'}
        form_features = {'število':'dvojina', 'sklon':'imenovalnik'}
        self._assert_msd(Properties('samostalnik', lexeme_features, form_features, 'sl'), 'sl', 'Sozdi')

    def test_exception_feature_level(self):
        # Here 'naslonskost' is supplied among the form-level features.
        lexeme_features = {'vrsta':'osebni', 'oseba':'druga'}
        form_features = {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}
        self._assert_msd(Properties('zaimek', lexeme_features, form_features, 'sl'), 'en', 'Pp2-sd--y')

    def test_normal_feature_level(self):
        # Here 'naslonskost' is supplied among the lexeme-level features.
        lexeme_features = {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}
        form_features = {'število':'množina', 'sklon':'dajalnik'}
        self._assert_msd(Properties('zaimek', lexeme_features, form_features, 'sl'), 'en', 'Pp2-pd--y')

    def test_featureless(self):
        # A category without any features yields just the category character.
        self._assert_msd(Properties('conjunction', {}, {}, 'en'), 'sl', 'V')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user