From f14879f619b837b5932a4a01f9f5fd0394382305 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 10 Sep 2021 14:09:50 +0200 Subject: [PATCH] Improved language handling --- conversion_utils/jos_msds_and_properties.py | 47 ++++++++++++--------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index 2751ec1..ff82c9e 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -4,8 +4,6 @@ import sys from conversion_utils.utils import xpath_find, get_xml_id -LANGUAGES = ['en', 'sl'] - LEXEME_FEATURE_MAP = {'noun':{1,2}, 'verb':{1,2}, 'adjective':{1}, @@ -59,13 +57,26 @@ class Specifications: def add_category(self, category): self.categories.append(category) - def find_category(self, char, language_index): - return next((category for category in self.categories if category.char_pair[language_index] == char), None) + def find_category(self, char, language): + return next((category for category in self.categories if category.char_pair.get(language) == char), None) def __str__(self): return 'categories:{categories}'.format(categories=self.categories) +class Pair: + + def __init__(self, en, sl): + self.en = en + self.sl = sl + + def get(self, language): + return getattr(self, language) + + def __str__(self): + return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl) + + class Category: def __init__(self, string_pair, char_pair, *features): @@ -95,8 +106,8 @@ class Feature: def add_value(self, value): self.values.append(value) - def find_value(self, char, language_index): - return next((value for value in self.values if value.char_pair[language_index] == char), None) + def find_value(self, char, language): + return next((value for value in self.values if value.char_pair.get(language) == char), None) def __str__(self): return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\ @@ -131,7 +142,7 @@ class SpecificationsBuilder: for feature_element in feature_elements: feature_string_pair = self.get_cell_pair(feature_element, 'name') feature_position = int(self.get_cell(feature_element, 'position')) - lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair[0]] + lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair.get('en')] feature = Feature(feature_string_pair, feature_position, lexeme_level_flag) category.add_feature(feature) value_elements = xpath_find(feature_element, 'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]') @@ -150,20 +161,20 @@ class SpecificationsBuilder: return text def get_cell_pair(self, row, role): - return (self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl')) + return Pair(self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl')) class Properties: - def __init__(self, lemma, category, feature_value_list, language_index): - self.language = LANGUAGES[language_index] - self.category = category.string_pair[language_index] + def __init__(self, lemma, category, feature_value_list, language): + self.language = language + self.category = category.string_pair.get(language) self.lexeme_feature_map = {} self.form_feature_map = {} for (feature, value) in feature_value_list: - feature_name = feature.string_pair[language_index] - feature_value = value.string_pair[language_index] - level_exception_flag = (category.string_pair[0], feature.position, lemma) in LEVEL_EXCEPTIONS + feature_name = feature.string_pair.get(language) + feature_value = value.string_pair.get(language) + level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag if (lexeme_level_flag): self.lexeme_feature_map[feature_name] = feature_value @@ -188,15 +199,13 @@ class Msd: class MsdToProperties: def convert(self, specifications, msd, lemma, properties_language): - msd_language_index = LANGUAGES.index(msd.language) - properties_language_index = LANGUAGES.index(properties_language) category_char = msd.code[0].lower() value_chars = msd.code[1:] - category = specifications.find_category(category_char, msd_language_index) + category = specifications.find_category(category_char, msd.language) feature_value_list = [] for (index, value_char) in enumerate(value_chars, start=1): if (value_char != '-'): feature = category.find_feature(index) - value = feature.find_value(value_char, msd_language_index) + value = feature.find_value(value_char, msd.language) feature_value_list.append((feature, value)) - return Properties(lemma, category, feature_value_list, properties_language_index) + return Properties(lemma, category, feature_value_list, properties_language)