"""Convert MSD tags into named morphosyntactic properties.

A specifications XML file is parsed into a small object model
(Specifications -> Category -> Feature -> Value).  Each element carries a
pair of human-readable names and, where applicable, a pair of one-character
codes; both pairs are ordered (English, Slovene) as fixed by
LANGUAGE_INDEX_MAP.  MsdToProperties then uses that model to decode an MSD
string into a Properties object.
"""

import re
import sys

import lxml.etree as lxml

from conversion_utils.utils import xpath_find, get_xml_id

# Position of each language inside every (en, sl) pair used in this module.
LANGUAGE_INDEX_MAP = {'en': 0, 'sl': 1}

# For each category (English name), the feature positions that are treated
# as lexeme-level; every other position is form-level.
LEXEME_FEATURE_MAP = {
    'noun': {1, 2},
    'verb': {1, 2},
    'adjective': {1},
    'adverb': {1},
    'pronoun': {1, 2, 6, 7, 8},
    'numeral': {1, 2},
    'preposition': {1},
    'conjunction': {1},
    'particle': set(),
    'interjection': set(),
    'abbreviation': set(),
    'residual': {1},
    'punctuation': set(),
}

# (English category name, feature position, lemma) triples for which the
# level given by LEXEME_FEATURE_MAP is inverted (see Properties.__init__).
LEVEL_EXCEPTIONS = {
    ('pronoun', 2, 'čezme'),
    ('pronoun', 2, 'medme'),
    ('pronoun', 2, 'nadme'),
    ('pronoun', 2, 'name'),
    ('pronoun', 2, 'obme'),
    ('pronoun', 2, 'podme'),
    ('pronoun', 2, 'pome'),
    ('pronoun', 2, 'predme'),
    ('pronoun', 2, 'skozme'),
    ('pronoun', 2, 'vame'),
    ('pronoun', 2, 'zame'),
    ('pronoun', 3, 'tadva'),
    ('pronoun', 4, 'tadva'),
    ('pronoun', 5, 'čezme'),
    ('pronoun', 5, 'medme'),
    ('pronoun', 5, 'nadme'),
    ('pronoun', 5, 'name'),
    ('pronoun', 5, 'obme'),
    ('pronoun', 5, 'podme'),
    ('pronoun', 5, 'pome'),
    ('pronoun', 5, 'predme'),
    ('pronoun', 5, 'skozme'),
    ('pronoun', 5, 'vame'),
    ('pronoun', 5, 'zame'),
    ('pronoun', 7, 'njegov'),
    ('pronoun', 8, 'jaz'),
    ('pronoun', 8, 'on'),
    ('pronoun', 8, 'se'),
    ('pronoun', 8, 'ti'),
}


class Specifications:
    """Top-level container holding all known categories."""

    def __init__(self, *categories):
        self.categories = list(categories)

    def add_category(self, category):
        """Append a Category to the specifications."""
        self.categories.append(category)

    def find_category(self, char, language_index):
        """Return the first category whose code in the given language equals
        ``char``, or None when there is no such category."""
        for category in self.categories:
            if category.char_pair[language_index] == char:
                return category
        return None

    def __str__(self):
        return 'categories:{categories}'.format(categories=self.categories)


class Category:
    """A category with its (en, sl) names, (en, sl) codes and features."""

    def __init__(self, string_pair, char_pair, *features):
        self.string_pair = string_pair
        self.char_pair = char_pair
        self.features = list(features)

    def add_feature(self, feature):
        """Append a Feature to this category."""
        self.features.append(feature)

    def find_feature(self, position):
        """Return the first feature at ``position``, or None if absent."""
        for feature in self.features:
            if feature.position == position:
                return feature
        return None

    def __str__(self):
        return 'strings:{strings}, chars:{chars}, features:{features}'.format(
            strings=self.string_pair,
            chars=self.char_pair,
            features=self.features)


class Feature:
    """A feature of a category: (en, sl) names, its position in the MSD,
    whether it is lexeme-level, and its possible values."""

    def __init__(self, string_pair, position, lexeme_level_flag, *values):
        self.string_pair = string_pair
        self.position = position
        self.lexeme_level_flag = lexeme_level_flag
        self.values = list(values)

    def add_value(self, value):
        """Append a Value to this feature."""
        self.values.append(value)

    def find_value(self, char, language_index):
        """Return the first value whose code in the given language equals
        ``char``, or None when there is no such value."""
        for value in self.values:
            if value.char_pair[language_index] == char:
                return value
        return None

    def __str__(self):
        level = 'level' if self.lexeme_level_flag else 'form'
        return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.format(
            strings=self.string_pair,
            position=self.position,
            level=level,
            values=self.values)


class Value:
    """A feature value with its (en, sl) names and (en, sl) codes."""

    def __init__(self, string_pair, char_pair):
        self.char_pair = char_pair
        self.string_pair = string_pair

    def __str__(self):
        # Fixed: this used to read the non-existent attribute `strings_pair`,
        # so str(value) always raised AttributeError.
        return 'chars:{chars}, strings:{strings}'.format(
            chars=self.char_pair,
            strings=self.string_pair)


class SpecificationsBuilder:
    """Builds a Specifications object from a specifications XML file."""

    def build(self, file_name):
        """Parse ``file_name`` and return the resulting Specifications.

        Only ``tei:div`` elements whose xml id matches ``^msd\\..-sl`` are
        read; each contributes one category with its features and values.
        """
        root = lxml.parse(file_name).getroot()
        specifications = Specifications()

        for div_element in xpath_find(root, 'tei:div'):
            if not re.match(r'^msd\..-sl', get_xml_id(div_element)):
                continue

            category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
            category_string_pair = self.get_cell_pair(category_element, 'value')
            category_char_pair = self.get_cell_pair(category_element, 'code')
            category = Category(category_string_pair, category_char_pair)
            specifications.add_category(category)

            # Lexeme-level positions are looked up by the English name.
            lexeme_positions = LEXEME_FEATURE_MAP[category_string_pair[0]]

            for feature_element in xpath_find(div_element, 'tei:table/tei:row[@role="attribute"]'):
                feature_string_pair = self.get_cell_pair(feature_element, 'name')
                feature_position = int(self.get_cell(feature_element, 'position'))
                lexeme_level_flag = feature_position in lexeme_positions
                feature = Feature(feature_string_pair, feature_position, lexeme_level_flag)
                category.add_feature(feature)

                value_elements = xpath_find(
                    feature_element,
                    'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
                for value_element in value_elements:
                    # 'name' cells hold the readable names, 'code' cells the
                    # one-character codes (the locals used to be named the
                    # other way round, although the call itself was correct).
                    value_string_pair = self.get_cell_pair(value_element, 'name')
                    value_char_pair = self.get_cell_pair(value_element, 'code')
                    feature.add_value(Value(value_string_pair, value_char_pair))

        return specifications

    def get_cell(self, row, role, language=None):
        """Return the lower-cased text of the first ``tei:cell`` in ``row``
        with the given ``role`` (and ``xml:lang`` when ``language`` is set).

        The text 'adposition' is mapped to 'preposition', the name used as
        a key in LEXEME_FEATURE_MAP.
        """
        language_condition = ''
        if language is not None:
            language_condition = ' and @xml:lang="' + language + '"'
        expression = 'tei:cell[@role="' + role + '"' + language_condition + ']'
        text = xpath_find(row, expression)[0].text.lower()
        if text == 'adposition':
            text = 'preposition'
        return text

    def get_cell_pair(self, row, role):
        """Return the (English, Slovene) texts of the ``role`` cell."""
        return (self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl'))


class Properties:
    """Decoded properties of one MSD, split into lexeme-level and
    form-level feature maps, with names in a single output language."""

    def __init__(self, lemma, category, feature_value_list, language_index):
        """Collect the names of ``category`` and of each (feature, value)
        pair in the language at ``language_index``.

        A feature goes to ``lexeme_feature_map`` or ``form_feature_map``
        according to its ``lexeme_level_flag``; the flag is inverted when
        (English category name, feature position, lemma) is listed in
        LEVEL_EXCEPTIONS.
        """
        self.language = next(
            name for (name, index) in LANGUAGE_INDEX_MAP.items() if index == language_index)
        self.category = category.string_pair[language_index]
        self.lexeme_feature_map = {}
        self.form_feature_map = {}

        category_name_en = category.string_pair[0]
        for (feature, value) in feature_value_list:
            feature_name = feature.string_pair[language_index]
            feature_value = value.string_pair[language_index]
            level_exception_flag = (category_name_en, feature.position, lemma) in LEVEL_EXCEPTIONS
            if level_exception_flag:
                lexeme_level_flag = not feature.lexeme_level_flag
            else:
                lexeme_level_flag = feature.lexeme_level_flag
            if lexeme_level_flag:
                self.lexeme_feature_map[feature_name] = feature_value
            else:
                self.form_feature_map[feature_name] = feature_value

    def __str__(self):
        return 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'.format(
            language=self.language,
            category=self.category,
            lexeme_features=str(self.lexeme_feature_map),
            form_features=str(self.form_feature_map))


class MsdToProperties:
    """Decodes MSD strings into Properties using a Specifications model."""

    def convert(self, specifications, lemma, msd, msd_language, properties_language):
        """Decode ``msd`` and return the corresponding Properties.

        The first character of ``msd`` (lower-cased) selects the category;
        each following character is the value code of the feature at that
        1-based position, with '-' meaning the feature is not set.

        ``msd_language`` is the language of the codes in ``msd`` and
        ``properties_language`` the language of the names in the result;
        both must be keys of LANGUAGE_INDEX_MAP (KeyError otherwise).

        No validation is performed: an empty ``msd`` raises IndexError, and
        an unknown category, feature position or value code surfaces as an
        AttributeError on None.
        """
        msd_language_index = LANGUAGE_INDEX_MAP[msd_language]
        properties_language_index = LANGUAGE_INDEX_MAP[properties_language]

        category_char = msd[0].lower()
        value_chars = msd[1:]
        category = specifications.find_category(category_char, msd_language_index)

        feature_value_list = []
        for (index, value_char) in enumerate(value_chars, start=1):
            if value_char == '-':
                continue
            feature = category.find_feature(index)
            value = feature.find_value(value_char, msd_language_index)
            feature_value_list.append((feature, value))

        return Properties(lemma, category, feature_value_list, properties_language_index)