From 5c5b2c20cc9b4ab78c0d2454f3fdf03250603687 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 10 Sep 2021 15:10:10 +0200 Subject: [PATCH] Added conversion from properties to msd --- conversion_utils/jos_msds_and_properties.py | 135 +++++++++++++------- tests/test_jos_properties_to_msd.py | 49 +++++++ 2 files changed, 135 insertions(+), 49 deletions(-) create mode 100644 tests/test_jos_properties_to_msd.py diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index f9a7769..cccd4b9 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -18,35 +18,35 @@ LEXEME_FEATURE_MAP = {'noun':{1,2}, 'residual':{1}, 'punctuation':set()} -LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), - ('pronoun', 2, 'medme'), - ('pronoun', 2, 'nadme'), - ('pronoun', 2, 'name'), - ('pronoun', 2, 'obme'), - ('pronoun', 2, 'podme'), - ('pronoun', 2, 'pome'), - ('pronoun', 2, 'predme'), - ('pronoun', 2, 'skozme'), - ('pronoun', 2, 'vame'), - ('pronoun', 2, 'zame'), - ('pronoun', 3, 'tadva'), - ('pronoun', 4, 'tadva'), - ('pronoun', 5, 'čezme'), - ('pronoun', 5, 'medme'), - ('pronoun', 5, 'nadme'), - ('pronoun', 5, 'name'), - ('pronoun', 5, 'obme'), - ('pronoun', 5, 'podme'), - ('pronoun', 5, 'pome'), - ('pronoun', 5, 'predme'), - ('pronoun', 5, 'skozme'), - ('pronoun', 5, 'vame'), - ('pronoun', 5, 'zame'), - ('pronoun', 7, 'njegov'), - ('pronoun', 8, 'jaz'), - ('pronoun', 8, 'on'), - ('pronoun', 8, 'se'), - ('pronoun', 8, 'ti')} +LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'), + ('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'), + ('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'), + ('pronoun', 2, 'name'), ('zaimek', 2, 'name'), + ('pronoun', 2, 'obme'), ('zaimek', 2, 'obme'), + ('pronoun', 2, 'podme'), ('zaimek', 2, 'podme'), + ('pronoun', 2, 'pome'), ('zaimek', 2, 'pome'), + ('pronoun', 2, 'predme'), ('zaimek', 2, 'predme'), + ('pronoun', 2, 'skozme'), ('zaimek', 2, 'skozme'), + ('pronoun', 2, 'vame'), ('zaimek', 2, 'vame'), + ('pronoun', 2, 'zame'), ('zaimek', 2, 'zame'), + ('pronoun', 3, 'tadva'), ('zaimek', 3, 'tadva'), + ('pronoun', 4, 'tadva'), ('zaimek', 4, 'tadva'), + ('pronoun', 5, 'čezme'), ('zaimek', 5, 'čezme'), + ('pronoun', 5, 'medme'), ('zaimek', 5, 'medme'), + ('pronoun', 5, 'nadme'), ('zaimek', 5, 'nadme'), + ('pronoun', 5, 'name'), ('zaimek', 5, 'name'), + ('pronoun', 5, 'obme'), ('zaimek', 5, 'obme'), + ('pronoun', 5, 'podme'), ('zaimek', 5, 'podme'), + ('pronoun', 5, 'pome'), ('zaimek', 5, 'pome'), + ('pronoun', 5, 'predme'), ('zaimek', 5, 'predme'), + ('pronoun', 5, 'skozme'), ('zaimek', 5, 'skozme'), + ('pronoun', 5, 'vame'), ('zaimek', 5, 'vame'), + ('pronoun', 5, 'zame'), ('zaimek', 5, 'zame'), + ('pronoun', 7, 'njegov'), ('zaimek', 7, 'njegov'), + ('pronoun', 8, 'jaz'), ('zaimek', 8, 'jaz'), + ('pronoun', 8, 'on'), ('zaimek', 8, 'on'), + ('pronoun', 8, 'se'), ('zaimek', 8, 'se'), + ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')} class Specifications: @@ -57,9 +57,12 @@ class Specifications: def add_category(self, category): self.categories.append(category) - def find_category(self, char, language): + def find_category_by_char(self, char, language): return next((category for category in self.categories if category.char_pair.get(language) == char), None) + def find_category_by_name(self, name, language): + return next((category for category in self.categories if category.string_pair.get(language) == name), None) + def __str__(self): return 'categories:{categories}'.format(categories=self.categories) @@ -87,9 +90,12 @@ class Category: def add_feature(self, feature): self.features.append(feature) - def find_feature(self, position): + def find_feature_by_position(self, position): return next((feature for feature in self.features if feature.position == position), None) + def find_feature_by_name(self, name, language): + return next((feature for feature in self.features if feature.string_pair.get(language) == name), None) + def __str__(self): return 'strings:{strings}, chars:{chars}, features:{features}'.\ format(strings=self.string_pair, chars=self.char_pair, features=self.features) @@ -106,9 +112,12 @@ class Feature: def add_value(self, value): self.values.append(value) - def find_value(self, char, language): + def find_value_by_char(self, char, language): return next((value for value in self.values if value.char_pair.get(language) == char), None) + def find_value_by_name(self, name, language): + return next((value for value in self.values if value.string_pair.get(language) == name), None) + def __str__(self): return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\ format(strings=self.string_pair, position=self.position, level='level' if self.lexeme_level_flag else 'form', values=self.values) @@ -166,20 +175,11 @@ class SpecificationsBuilder: class Properties: - def __init__(self, lemma, category, feature_value_list, language): + def __init__(self, category, lexeme_feature_map, form_feature_map, language): + self.category = category + self.lexeme_feature_map = lexeme_feature_map + self.form_feature_map = form_feature_map self.language = language - self.category = category.string_pair.get(language) - self.lexeme_feature_map = {} - self.form_feature_map = {} - for (feature, value) in feature_value_list: - feature_name = feature.string_pair.get(language) - feature_value = value.string_pair.get(language) - level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS - lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag - if (lexeme_level_flag): - self.lexeme_feature_map[feature_name] = feature_value - else: - self.form_feature_map[feature_name] = feature_value def __str__(self): return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.\ @@ -204,11 +204,48 @@ class MsdToProperties: def convert(self, msd, lemma, properties_language): category_char = msd.code[0].lower() value_chars = msd.code[1:] - category = self.specifications.find_category(category_char, msd.language) + category = self.specifications.find_category_by_char(category_char, msd.language) + category_name = category.string_pair.get(properties_language) feature_value_list = [] + lexeme_feature_map = {} + form_feature_map = {} for (index, value_char) in enumerate(value_chars, start=1): if (value_char != '-'): - feature = category.find_feature(index) - value = feature.find_value(value_char, msd.language) + feature = category.find_feature_by_position(index) + value = feature.find_value_by_char(value_char, msd.language) + feature_name = feature.string_pair.get(properties_language) + feature_value = value.string_pair.get(properties_language) + level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS + lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag feature_value_list.append((feature, value)) - return Properties(lemma, category, feature_value_list, properties_language) + if (lexeme_level_flag): + lexeme_feature_map[feature_name] = feature_value + else: + form_feature_map[feature_name] = feature_value + return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language) + + +class PropertiesToMsd: + + def __init__(self, specifications): + self.specifications = specifications + + def convert(self, properties, msd_language): + category = self.specifications.find_category_by_name(properties.category, properties.language) + category_char = category.char_pair.get(msd_language).upper() + feature_map = properties.lexeme_feature_map.copy() + feature_map.update(properties.form_feature_map.copy()) + position_map = {} + for (name, value) in feature_map.items(): + feature = category.find_feature_by_name(name, properties.language) + value = feature.find_value_by_name(value, properties.language) + position_map[feature.position] = value.char_pair.get(msd_language) + msd_code = category_char + i = 0 + for position in sorted(position_map.keys()): + i += 1 + while (i < position): + msd_code += '-' + i += 1 + msd_code += position_map[position] + return Msd(msd_code, msd_language) diff --git a/tests/test_jos_properties_to_msd.py b/tests/test_jos_properties_to_msd.py new file mode 100644 index 0000000..23c1cfe --- /dev/null +++ b/tests/test_jos_properties_to_msd.py @@ -0,0 +1,49 @@ +import os.path +import lxml.etree as lxml +import unittest + +from conversion_utils.jos_msds_and_properties import SpecificationsBuilder, PropertiesToMsd, Properties + +class JosPropertiesToMsdTestCase(unittest.TestCase): + + def setUp(self): + specifications_file_name = os.path.join(os.path.dirname(__file__), '../resources/msd-sl.spc.xml') + builder = SpecificationsBuilder() + specifications = builder.build(specifications_file_name) + self.converter = PropertiesToMsd(specifications) + + def test_en_en(self): + msd = self.converter.convert(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en') + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Ncfdn') + + def test_en_sl(self): + msd = self.converter.convert(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'sl') + self.assertEqual(msd.language, 'sl') + self.assertEqual(msd.code, 'Sozdi') + + def test_sl_en(self): + msd = self.converter.convert(Properties('samostalnik', {'vrsta':'občno_ime', 'spol':'ženski'}, {'število':'dvojina', 'sklon':'imenovalnik'}, 'sl'), 'en') + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Ncfdn') + + def test_sl_sl(self): + msd = self.converter.convert(Properties('samostalnik', {'vrsta':'občno_ime', 'spol':'ženski'}, {'število':'dvojina', 'sklon':'imenovalnik'}, 'sl'), 'sl') + self.assertEqual(msd.language, 'sl') + self.assertEqual(msd.code, 'Sozdi') + + def test_exception_feature_level(self): + msd = self.converter.convert(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga'}, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}, 'sl'), 'en') + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Pp2-sd--y') + + def test_normal_feature_level(self): + msd = self.converter.convert(Properties('zaimek', {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}, {'število':'množina', 'sklon':'dajalnik'}, 'sl'), 'en') + self.assertEqual(msd.language, 'en') + self.assertEqual(msd.code, 'Pp2-pd--y') + + def test_featureless(self): + msd = self.converter.convert(Properties('conjunction', {}, {}, 'en'), 'sl') + self.assertEqual(msd.language, 'sl') + self.assertEqual(msd.code, 'V') + \ No newline at end of file