From 30bafe09ac021c494d6e9ab63fd9c438690b31d1 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 10 Sep 2021 15:52:11 +0200 Subject: [PATCH] Made non-pronoun lemmas optional, added some docstrings --- conversion_utils/jos_msds_and_properties.py | 61 +++++++++++++-------- tests/test_jos_msd_to_properties.py | 15 +++-- tests/test_jos_properties_to_msd.py | 1 - 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index 64c12c0..d43c6f2 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -1,9 +1,9 @@ import lxml.etree as lxml import re -import sys from conversion_utils.utils import xpath_find, get_xml_id +## Positions of lexeme-level features for each category LEXEME_FEATURE_MAP = {'noun':{1,2}, 'verb':{1,2}, 'adjective':{1}, @@ -18,6 +18,7 @@ LEXEME_FEATURE_MAP = {'noun':{1,2}, 'residual':{1}, 'punctuation':set()} +## Exceptions to feature levels specified in LEXEME_FEATURE_MAP LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'), ('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'), ('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'), @@ -50,6 +51,7 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'), class Specifications: + """JOS specifications with list of all word categories.""" def __init__(self, *categories): self.categories = list(categories) @@ -67,20 +69,8 @@ class Specifications: return 'categories:{categories}'.format(categories=self.categories) -class Pair: - - def __init__(self, en, sl): - self.en = en - self.sl = sl - - def get(self, language): - return getattr(self, language) - - def __str__(self): - return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl) - - class Category: + """JOS word category, including list of supported features.""" def __init__(self, names, codes, *features): self.names = names @@ -102,6 +92,7 @@ class Category: class Feature: + """JOS category-dependent features, including list of supported values.""" def __init__(self, names, position, lexeme_level_flag, *values): self.names = names @@ -124,6 +115,7 @@ class Feature: class Value: + """JOS feature-dependent values.""" def __init__(self, names, codes): self.codes = codes @@ -134,7 +126,22 @@ class Value: format(codes=self.codes, names=self.names) +class Pair: + """Generic pair of English and Slovene strings.""" + + def __init__(self, en, sl): + self.en = en + self.sl = sl + + def get(self, language): + return getattr(self, language) + + def __str__(self): + return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl) + + class SpecificationsParser: + """Parser of JOS TEI specifications, yielding Specifications.""" def parse(self, file_name): root = lxml.parse(file_name).getroot() @@ -174,6 +181,7 @@ class SpecificationsParser: class Properties: + """Representation of properties encoded in msds.""" def __init__(self, category, lexeme_feature_map, form_feature_map, language): self.category = category @@ -187,6 +195,7 @@ class Properties: class Msd: + """JOS msd.""" def __init__(self, code, language): self.code = code @@ -197,15 +206,18 @@ class Msd: class Converter: + """Converter between Msd and Properties objects.""" def __init__(self, specifications): self.specifications = specifications - def msd_to_properties(self, msd, lemma, properties_language): + def msd_to_properties(self, msd, language, lemma=None): + """Convert Msd to Properties (possibly in the other language).""" + category_char = msd.code[0].lower() value_chars = msd.code[1:] category = self.specifications.find_category_by_code(category_char, msd.language) - category_name = category.names.get(properties_language) + category_name = category.names.get(language) feature_value_list = [] lexeme_feature_map = {} form_feature_map = {} @@ -213,8 +225,10 @@ class Converter: if (value_char != '-'): feature = category.find_feature_by_position(index) value = feature.find_value_by_char(value_char, msd.language) - feature_name = feature.names.get(properties_language) - feature_value = value.names.get(properties_language) + feature_name = feature.names.get(language) + feature_value = value.names.get(language) + if (lemma is None and category_name in [level_exception[0] for level_exception in LEVEL_EXCEPTIONS]): + exit('[ERROR] lemma is None but feature levels depend on lemma for category {}'.format(category_name)) level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag feature_value_list.append((feature, value)) @@ -222,18 +236,19 @@ class Converter: lexeme_feature_map[feature_name] = feature_value else: form_feature_map[feature_name] = feature_value - return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language) + return Properties(category_name, lexeme_feature_map, form_feature_map, language) - def properties_to_msd(self, properties, msd_language): + def properties_to_msd(self, properties, language): + """Convert Properties to msd (possibly in the other language).""" category = self.specifications.find_category_by_name(properties.category, properties.language) - category_char = category.codes.get(msd_language).upper() + category_char = category.codes.get(language).upper() feature_map = properties.lexeme_feature_map.copy() feature_map.update(properties.form_feature_map.copy()) position_map = {} for (name, value) in feature_map.items(): feature = category.find_feature_by_name(name, properties.language) value = feature.find_value_by_name(value, properties.language) - position_map[feature.position] = value.codes.get(msd_language) + position_map[feature.position] = value.codes.get(language) msd_code = category_char i = 0 for position in sorted(position_map.keys()): @@ -242,4 +257,4 @@ class Converter: msd_code += '-' i += 1 msd_code += position_map[position] - return Msd(msd_code, msd_language) + return Msd(msd_code, language) diff --git a/tests/test_jos_msd_to_properties.py b/tests/test_jos_msd_to_properties.py index 4f0bcc3..f4afb49 100644 --- a/tests/test_jos_msd_to_properties.py +++ b/tests/test_jos_msd_to_properties.py @@ -1,5 +1,4 @@ import os.path -import lxml.etree as lxml import unittest from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Msd @@ -13,49 +12,49 @@ class JosMsdToPropertiesTestCase(unittest.TestCase): self.converter = Converter(specifications) def test_en_en(self): - properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'en') + properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en') self.assertEqual(properties.language, 'en') self.assertEqual(properties.category, 'noun') self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'}) self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'}) def test_en_sl(self): - properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'sl') + properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'sl') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'samostalnik') self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'}) self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) def test_sl_en(self): - properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'en') + properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'en') self.assertEqual(properties.language, 'en') self.assertEqual(properties.category, 'noun') self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'}) self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'}) def test_sl_sl(self): - properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'sl') + properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'sl') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'samostalnik') self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'}) self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti') - properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'ti', 'sl') + properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'zaimek') self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'}) self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'}) def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test - properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'vi', 'sl') + properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi') self.assertEqual(properties.language, 'sl') self.assertEqual(properties.category, 'zaimek') self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'}) self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'}) def test_featureless(self): - properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'in', 'en') + properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en') self.assertEqual(properties.language, 'en') self.assertEqual(properties.category, 'conjunction') self.assertEqual(properties.lexeme_feature_map, {}) diff --git a/tests/test_jos_properties_to_msd.py b/tests/test_jos_properties_to_msd.py index 32a5b41..5728bdd 100644 --- a/tests/test_jos_properties_to_msd.py +++ b/tests/test_jos_properties_to_msd.py @@ -1,5 +1,4 @@ import os.path -import lxml.etree as lxml import unittest from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Properties