You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
conversion_utils/conversion_utils/jos_msds_and_properties.py

252 lines
11 KiB

import lxml.etree as lxml
import re
import sys
from conversion_utils.utils import xpath_find, get_xml_id
# For each category (keyed by its English name), the feature positions that
# SpecificationsBuilder.build flags as lexeme-level; every other position of
# that category is treated as form-level. Positions count from 1, i.e. the
# first character after the category character of an MSD code.
LEXEME_FEATURE_MAP = {
    'noun': {1, 2},
    'verb': {1, 2},
    'adjective': {1},
    'adverb': {1},
    'pronoun': {1, 2, 6, 7, 8},
    'numeral': {1, 2},
    'preposition': {1},
    'conjunction': {1},
    'particle': set(),
    'interjection': set(),
    'abbreviation': set(),
    'residual': {1},
    'punctuation': set(),
}
# (category name, feature position, lemma) triples for which
# MsdToProperties.convert inverts the feature's default lexeme/form level.
# Every exception is registered under both category names 'pronoun' and
# 'zaimek' (presumably the Slovene name of the same category), so the lookup
# works whichever properties language is requested.
LEVEL_EXCEPTIONS = {
    (category_name, position, lemma)
    for category_name in ('pronoun', 'zaimek')
    for position, lemmas in (
        (2, ('čezme', 'medme', 'nadme', 'name', 'obme', 'podme',
             'pome', 'predme', 'skozme', 'vame', 'zame')),
        (3, ('tadva',)),
        (4, ('tadva',)),
        (5, ('čezme', 'medme', 'nadme', 'name', 'obme', 'podme',
             'pome', 'predme', 'skozme', 'vame', 'zame')),
        (7, ('njegov',)),
        (8, ('jaz', 'on', 'se', 'ti')),
    )
    for lemma in lemmas
}
class Specifications:
    """Ordered collection of category descriptions.

    Each category is expected to expose ``char_pair`` and ``string_pair``
    attributes whose ``get(language)`` method returns the category's code
    and name in that language.
    """

    def __init__(self, *categories):
        """Store the given categories, in order, in a fresh list."""
        self.categories = list(categories)

    def add_category(self, category):
        """Append ``category`` to the collection."""
        self.categories.append(category)

    def find_category_by_char(self, char, language):
        """Return the first category whose code in ``language`` equals ``char``, else None."""
        return next((category for category in self.categories
                     if category.char_pair.get(language) == char), None)

    def find_category_by_name(self, name, language):
        """Return the first category whose name in ``language`` equals ``name``, else None."""
        return next((category for category in self.categories
                     if category.string_pair.get(language) == name), None)

    def __str__(self):
        # Formatting the list directly would show each element through
        # repr(), i.e. as "<... object at 0x...>"; join the elements'
        # str() forms so nested categories are actually readable.
        return 'categories:[{categories}]'.format(
            categories=', '.join(str(category) for category in self.categories))
class Pair:
    """Two parallel values stored under the language keys ``en`` and ``sl``."""

    def __init__(self, en, sl):
        self.en, self.sl = en, sl

    def get(self, language):
        """Return the value stored under ``language`` (the attribute of that name)."""
        return getattr(self, language)

    def __str__(self):
        template = 'en:{en}, sl:{sl}'
        return template.format(en=self.en, sl=self.sl)
class Category:
    """A category with its names, codes and ordered list of features.

    ``string_pair`` and ``char_pair`` hold the category's name and code per
    language. Each feature is expected to expose ``position`` and a
    ``string_pair`` with a ``get(language)`` method.
    """

    def __init__(self, string_pair, char_pair, *features):
        """Store the name/code pairs and the given features (in a fresh list)."""
        self.string_pair = string_pair
        self.char_pair = char_pair
        self.features = list(features)

    def add_feature(self, feature):
        """Append ``feature`` to this category."""
        self.features.append(feature)

    def find_feature_by_position(self, position):
        """Return the first feature whose ``position`` equals ``position``, else None."""
        return next((feature for feature in self.features
                     if feature.position == position), None)

    def find_feature_by_name(self, name, language):
        """Return the first feature whose name in ``language`` equals ``name``, else None."""
        return next((feature for feature in self.features
                     if feature.string_pair.get(language) == name), None)

    def __str__(self):
        # A list formatted directly shows its elements via repr(), which for
        # plain objects is "<... object at 0x...>"; render each feature with
        # str() instead so the nested description is readable.
        return 'strings:{strings}, chars:{chars}, features:[{features}]'.format(
            strings=self.string_pair,
            chars=self.char_pair,
            features=', '.join(str(feature) for feature in self.features))
class Feature:
    """A feature of a category: its names, position, level flag and values.

    ``position`` is the feature's position number within its category.
    ``lexeme_level_flag`` is truthy for lexeme-level features and falsy for
    form-level ones. Each value is expected to expose ``char_pair`` and
    ``string_pair`` with a ``get(language)`` method.
    """

    def __init__(self, string_pair, position, lexeme_level_flag, *values):
        """Store the feature description and the given values (in a fresh list)."""
        self.string_pair = string_pair
        self.position = position
        self.lexeme_level_flag = lexeme_level_flag
        self.values = list(values)

    def add_value(self, value):
        """Append ``value`` to this feature."""
        self.values.append(value)

    def find_value_by_char(self, char, language):
        """Return the first value whose code in ``language`` equals ``char``, else None."""
        return next((value for value in self.values
                     if value.char_pair.get(language) == char), None)

    def find_value_by_name(self, name, language):
        """Return the first value whose name in ``language`` equals ``name``, else None."""
        return next((value for value in self.values
                     if value.string_pair.get(language) == name), None)

    def __str__(self):
        # A list formatted directly shows its elements via repr(), which for
        # plain objects is "<... object at 0x...>"; render each value with
        # str() instead so the nested description is readable.
        return 'strings:{strings}, position:{position}, level:{level}, values:[{values}]'.format(
            strings=self.string_pair,
            position=self.position,
            level='level' if self.lexeme_level_flag else 'form',
            values=', '.join(str(value) for value in self.values))
class Value:
    """A single feature value: its name pair and its code pair."""

    def __init__(self, string_pair, char_pair):
        """Store the value's names (``string_pair``) and codes (``char_pair``)."""
        self.char_pair = char_pair
        self.string_pair = string_pair

    def __str__(self):
        # The attribute is ``string_pair``; reading ``strings_pair`` here
        # used to raise AttributeError on every call.
        return 'chars:{chars}, strings:{strings}'.format(
            chars=self.char_pair, strings=self.string_pair)
class SpecificationsBuilder:
    """Reads a TEI XML specification file into a Specifications object."""

    def build(self, file_name):
        """Parse ``file_name`` and return the Specifications built from it.

        Only ``tei:div`` children of the root whose xml id matches the
        ``msd.<char>-sl`` pattern are used. Each contributes one Category
        (from the row with role "type"), its Features (rows with role
        "attribute") and their Values (nested rows with role "value").
        """
        root = lxml.parse(file_name).getroot()
        specifications = Specifications()
        for div_element in xpath_find(root, 'tei:div'):
            if not re.match(r'^msd\..-sl', get_xml_id(div_element)):
                continue

            type_row = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
            category_names = self.get_cell_pair(type_row, 'value')
            category_codes = self.get_cell_pair(type_row, 'code')
            category = Category(category_names, category_codes)
            specifications.add_category(category)

            attribute_rows = xpath_find(div_element, 'tei:table/tei:row[@role="attribute"]')
            for attribute_row in attribute_rows:
                feature_names = self.get_cell_pair(attribute_row, 'name')
                position = int(self.get_cell(attribute_row, 'position'))
                # A feature is lexeme-level when its position is listed for
                # the category's English name in LEXEME_FEATURE_MAP.
                is_lexeme_level = position in LEXEME_FEATURE_MAP[category_names.get('en')]
                feature = Feature(feature_names, position, is_lexeme_level)
                category.add_feature(feature)

                value_rows = xpath_find(
                    attribute_row,
                    'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
                for value_row in value_rows:
                    # The "name" cell holds the value's names and the "code"
                    # cell its codes; Value takes (string_pair, char_pair).
                    value_names = self.get_cell_pair(value_row, 'name')
                    value_codes = self.get_cell_pair(value_row, 'code')
                    feature.add_value(Value(value_names, value_codes))
        return specifications

    def get_cell(self, row, role, language=None):
        """Return the lower-cased text of the first matching cell of ``row``.

        The cell is selected by its ``role`` attribute and, when ``language``
        is given, also by its ``xml:lang`` attribute. The text 'adposition'
        is reported as 'preposition'.
        """
        if language is None:
            language_condition = ''
        else:
            language_condition = ' and @xml:lang="' + language + '"'
        expression = ''.join(['tei:cell[@role="', role, '"', language_condition, ']'])
        text = xpath_find(row, expression)[0].text.lower()
        return 'preposition' if text == 'adposition' else text

    def get_cell_pair(self, row, role):
        """Return a Pair with the 'en' and 'sl' texts of the ``role`` cell of ``row``."""
        english = self.get_cell(row, role, 'en')
        slovene = self.get_cell(row, role, 'sl')
        return Pair(english, slovene)
class Properties:
    """A category name plus its lexeme-level and form-level feature maps.

    Both maps go from feature name to value name; ``language`` is the
    language in which the category, feature and value names are expressed.
    """

    def __init__(self, category, lexeme_feature_map, form_feature_map, language):
        self.category = category
        self.lexeme_feature_map = lexeme_feature_map
        self.form_feature_map = form_feature_map
        self.language = language

    def __str__(self):
        template = 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'
        fields = {
            'language': self.language,
            'category': self.category,
            'lexeme_features': str(self.lexeme_feature_map),
            'form_features': str(self.form_feature_map),
        }
        return template.format(**fields)
class Msd:
    """An MSD code string together with the language its characters are in."""

    def __init__(self, code, language):
        self.code, self.language = code, language

    def __str__(self):
        template = 'code={code}, language={language}'
        return template.format(code=self.code, language=self.language)
class MsdToProperties:
    """Converts an Msd into a Properties object using given specifications."""

    def __init__(self, specifications):
        self.specifications = specifications

    def convert(self, msd, lemma, properties_language):
        """Return the Properties described by ``msd`` for the word ``lemma``.

        The first character of ``msd.code`` (lower-cased) selects the
        category; each following character that is not '-' selects a value of
        the feature at that position, counting from 1. Names in the result
        are given in ``properties_language``. A feature goes into the lexeme
        map or the form map according to its ``lexeme_level_flag``, inverted
        when (category name, position, lemma) is listed in LEVEL_EXCEPTIONS.
        """
        code = msd.code
        category = self.specifications.find_category_by_char(code[0].lower(), msd.language)
        category_name = category.string_pair.get(properties_language)

        lexeme_feature_map = {}
        form_feature_map = {}
        for position, value_char in enumerate(code[1:], start=1):
            if value_char == '-':
                # '-' marks a position that carries no value.
                continue
            feature = category.find_feature_by_position(position)
            value = feature.find_value_by_char(value_char, msd.language)
            feature_name = feature.string_pair.get(properties_language)
            feature_value = value.string_pair.get(properties_language)

            is_exception = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
            # An exception flips the feature's default level.
            on_lexeme_level = bool(feature.lexeme_level_flag) != is_exception
            target_map = lexeme_feature_map if on_lexeme_level else form_feature_map
            target_map[feature_name] = feature_value
        return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
class PropertiesToMsd:
    """Converts a Properties object back into an Msd using given specifications."""

    def __init__(self, specifications):
        self.specifications = specifications

    def convert(self, properties, msd_language):
        """Return the Msd encoding ``properties`` with characters in ``msd_language``.

        The code starts with the upper-cased category character, followed by
        one value character per feature at the feature's position; positions
        before a present feature that have no value are filled with '-'.
        """
        source_language = properties.language
        category = self.specifications.find_category_by_name(properties.category, source_language)
        category_char = category.char_pair.get(msd_language).upper()

        # Form-level entries override lexeme-level ones on a name clash.
        merged_features = properties.lexeme_feature_map.copy()
        merged_features.update(properties.form_feature_map)

        char_by_position = {}
        for feature_name, value_name in merged_features.items():
            feature = category.find_feature_by_name(feature_name, source_language)
            value = feature.find_value_by_name(value_name, source_language)
            char_by_position[feature.position] = value.char_pair.get(msd_language)

        pieces = [category_char]
        cursor = 0
        for position in sorted(char_by_position):
            cursor += 1
            if cursor < position:
                # Pad the skipped positions so the value lands in its slot.
                pieces.append('-' * (position - cursor))
                cursor = position
            pieces.append(char_by_position[position])
        return Msd(''.join(pieces), msd_language)