Improved language handling
This commit is contained in:
parent
88f678fa1f
commit
f14879f619
|
@ -4,8 +4,6 @@ import sys
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id
|
from conversion_utils.utils import xpath_find, get_xml_id
|
||||||
|
|
||||||
LANGUAGES = ['en', 'sl']
|
|
||||||
|
|
||||||
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||||
'verb':{1,2},
|
'verb':{1,2},
|
||||||
'adjective':{1},
|
'adjective':{1},
|
||||||
|
@ -59,13 +57,26 @@ class Specifications:
|
||||||
def add_category(self, category):
|
def add_category(self, category):
|
||||||
self.categories.append(category)
|
self.categories.append(category)
|
||||||
|
|
||||||
def find_category(self, char, language_index):
|
def find_category(self, char, language):
|
||||||
return next((category for category in self.categories if category.char_pair[language_index] == char), None)
|
return next((category for category in self.categories if category.char_pair.get(language) == char), None)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'categories:{categories}'.format(categories=self.categories)
|
return 'categories:{categories}'.format(categories=self.categories)
|
||||||
|
|
||||||
|
|
||||||
|
class Pair:
    """Hold the ``en`` and ``sl`` language variants of a single item.

    The two variants are available as the attributes ``en`` and ``sl`` and
    through :meth:`get`, which selects a variant by its language code.
    """

    # The only names ``get`` may resolve; each one is an instance attribute
    # assigned in ``__init__``.
    LANGUAGE_CODES = ('en', 'sl')

    def __init__(self, en, sl):
        """Store the ``en`` and ``sl`` variants unchanged."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored for ``language``.

        Args:
            language: one of the codes in ``LANGUAGE_CODES``.

        Returns:
            The value passed to the constructor for that language.

        Raises:
            TypeError: if ``language`` is not a string.
            AttributeError: if ``language`` is not a supported code.
        """
        if not isinstance(language, str):
            # A bare getattr() also rejects non-string names with TypeError;
            # keep that exception type, but with a clearer message.
            raise TypeError(
                'language must be a string, got {type_name}'.format(
                    type_name=type(language).__name__))
        if language not in self.LANGUAGE_CODES:
            # Without this guard, names such as 'get' or '__class__' would
            # resolve to methods/internals and be returned as if they were
            # language variants.
            raise AttributeError(
                'unsupported language {language!r}; expected one of {codes}'.format(
                    language=language, codes=', '.join(self.LANGUAGE_CODES)))
        return getattr(self, language)

    def __str__(self):
        """Return both variants formatted as ``en:<en>, sl:<sl>``."""
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
||||||
|
|
||||||
|
|
||||||
class Category:
|
class Category:
|
||||||
|
|
||||||
def __init__(self, string_pair, char_pair, *features):
|
def __init__(self, string_pair, char_pair, *features):
|
||||||
|
@ -95,8 +106,8 @@ class Feature:
|
||||||
def add_value(self, value):
|
def add_value(self, value):
|
||||||
self.values.append(value)
|
self.values.append(value)
|
||||||
|
|
||||||
def find_value(self, char, language_index):
|
def find_value(self, char, language):
|
||||||
return next((value for value in self.values if value.char_pair[language_index] == char), None)
|
return next((value for value in self.values if value.char_pair.get(language) == char), None)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
|
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
|
||||||
|
@ -131,7 +142,7 @@ class SpecificationsBuilder:
|
||||||
for feature_element in feature_elements:
|
for feature_element in feature_elements:
|
||||||
feature_string_pair = self.get_cell_pair(feature_element, 'name')
|
feature_string_pair = self.get_cell_pair(feature_element, 'name')
|
||||||
feature_position = int(self.get_cell(feature_element, 'position'))
|
feature_position = int(self.get_cell(feature_element, 'position'))
|
||||||
lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair[0]]
|
lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair.get('en')]
|
||||||
feature = Feature(feature_string_pair, feature_position, lexeme_level_flag)
|
feature = Feature(feature_string_pair, feature_position, lexeme_level_flag)
|
||||||
category.add_feature(feature)
|
category.add_feature(feature)
|
||||||
value_elements = xpath_find(feature_element, 'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
|
value_elements = xpath_find(feature_element, 'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
|
||||||
|
@ -150,20 +161,20 @@ class SpecificationsBuilder:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def get_cell_pair(self, row, role):
|
def get_cell_pair(self, row, role):
|
||||||
return (self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl'))
|
return Pair(self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl'))
|
||||||
|
|
||||||
|
|
||||||
class Properties:
|
class Properties:
|
||||||
|
|
||||||
def __init__(self, lemma, category, feature_value_list, language_index):
|
def __init__(self, lemma, category, feature_value_list, language):
|
||||||
self.language = LANGUAGES[language_index]
|
self.language = language
|
||||||
self.category = category.string_pair[language_index]
|
self.category = category.string_pair.get(language)
|
||||||
self.lexeme_feature_map = {}
|
self.lexeme_feature_map = {}
|
||||||
self.form_feature_map = {}
|
self.form_feature_map = {}
|
||||||
for (feature, value) in feature_value_list:
|
for (feature, value) in feature_value_list:
|
||||||
feature_name = feature.string_pair[language_index]
|
feature_name = feature.string_pair.get(language)
|
||||||
feature_value = value.string_pair[language_index]
|
feature_value = value.string_pair.get(language)
|
||||||
level_exception_flag = (category.string_pair[0], feature.position, lemma) in LEVEL_EXCEPTIONS
|
level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||||
if (lexeme_level_flag):
|
if (lexeme_level_flag):
|
||||||
self.lexeme_feature_map[feature_name] = feature_value
|
self.lexeme_feature_map[feature_name] = feature_value
|
||||||
|
@ -188,15 +199,13 @@ class Msd:
|
||||||
class MsdToProperties:
|
class MsdToProperties:
|
||||||
|
|
||||||
def convert(self, specifications, msd, lemma, properties_language):
|
def convert(self, specifications, msd, lemma, properties_language):
|
||||||
msd_language_index = LANGUAGES.index(msd.language)
|
|
||||||
properties_language_index = LANGUAGES.index(properties_language)
|
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = specifications.find_category(category_char, msd_language_index)
|
category = specifications.find_category(category_char, msd.language)
|
||||||
feature_value_list = []
|
feature_value_list = []
|
||||||
for (index, value_char) in enumerate(value_chars, start=1):
|
for (index, value_char) in enumerate(value_chars, start=1):
|
||||||
if (value_char != '-'):
|
if (value_char != '-'):
|
||||||
feature = category.find_feature(index)
|
feature = category.find_feature(index)
|
||||||
value = feature.find_value(value_char, msd_language_index)
|
value = feature.find_value(value_char, msd.language)
|
||||||
feature_value_list.append((feature, value))
|
feature_value_list.append((feature, value))
|
||||||
return Properties(lemma, category, feature_value_list, properties_language_index)
|
return Properties(lemma, category, feature_value_list, properties_language)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user