Improved language handling

multiple_files_conllu_to_tei
Cyprian Laskowski 3 years ago
parent 88f678fa1f
commit f14879f619

@ -4,8 +4,6 @@ import sys
from conversion_utils.utils import xpath_find, get_xml_id
# Language codes; a code's position in this list is the "language index"
# used by index-based lookups (LANGUAGES[i] / LANGUAGES.index(code)).
LANGUAGES = ['en', 'sl']
LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2},
'adjective':{1},
@ -59,13 +57,26 @@ class Specifications:
def add_category(self, category):
    """Append *category* to this object's ``categories`` collection."""
    registered = self.categories
    registered.append(category)
def find_category(self, char, language):
    """Return the first category whose character code in *language* is *char*.

    Args:
        char: category character to look for.
        language: language code handed to each category's ``char_pair.get``.

    Returns:
        The first matching category, or ``None`` when no category matches.
    """
    for category in self.categories:
        if category.char_pair.get(language) == char:
            return category
    return None
def __str__(self):
    """Render this object as ``categories:<categories>``."""
    template = 'categories:{categories}'
    return template.format(categories=self.categories)
class Pair:
    """Hold the 'en' and 'sl' variants of a single string or character code.

    The two variants are plain attributes (``pair.en``, ``pair.sl``);
    ``get`` selects one of them by language code.
    """

    # Only these attribute names are language variants; anything else passed
    # to get() (e.g. 'get' or '__class__') must not be resolved via getattr.
    _LANGUAGES = ('en', 'sl')

    def __init__(self, en, sl):
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored for *language*.

        Args:
            language: language code, either ``'en'`` or ``'sl'``.

        Raises:
            AttributeError: if *language* is not one of the two codes.
        """
        if language not in self._LANGUAGES:
            raise AttributeError(
                'unsupported language {language!r}; expected one of {expected}'.format(
                    language=language, expected=self._LANGUAGES))
        return getattr(self, language)

    def __str__(self):
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)

    def __repr__(self):
        return 'Pair(en={en!r}, sl={sl!r})'.format(en=self.en, sl=self.sl)
class Category:
def __init__(self, string_pair, char_pair, *features):
@ -95,8 +106,8 @@ class Feature:
def add_value(self, value):
    """Append *value* to this object's ``values`` collection."""
    known_values = self.values
    known_values.append(value)
def find_value(self, char, language):
    """Return the first value whose character code in *language* is *char*.

    Args:
        char: value character to look for.
        language: language code handed to each value's ``char_pair.get``.

    Returns:
        The first matching value, or ``None`` when no value matches.
    """
    for value in self.values:
        if value.char_pair.get(language) == char:
            return value
    return None
def __str__(self):
return 'strings:{strings}, position:{position}, level:{level}, values:{values}'.\
@ -131,7 +142,7 @@ class SpecificationsBuilder:
for feature_element in feature_elements:
feature_string_pair = self.get_cell_pair(feature_element, 'name')
feature_position = int(self.get_cell(feature_element, 'position'))
lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair[0]]
lexeme_level_flag = feature_position in LEXEME_FEATURE_MAP[category_string_pair.get('en')]
feature = Feature(feature_string_pair, feature_position, lexeme_level_flag)
category.add_feature(feature)
value_elements = xpath_find(feature_element, 'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
@ -150,20 +161,20 @@ class SpecificationsBuilder:
return text
def get_cell_pair(self, row, role):
    """Return the 'en' and 'sl' cells of *row* for *role* as a ``Pair``.

    Both cells are read through ``self.get_cell`` ('en' first, then 'sl'),
    so callers can select a variant with ``pair.get(language)``.
    """
    en_cell = self.get_cell(row, role, 'en')
    sl_cell = self.get_cell(row, role, 'sl')
    return Pair(en_cell, sl_cell)
class Properties:
def __init__(self, lemma, category, feature_value_list, language_index):
self.language = LANGUAGES[language_index]
self.category = category.string_pair[language_index]
def __init__(self, lemma, category, feature_value_list, language):
self.language = language
self.category = category.string_pair.get(language)
self.lexeme_feature_map = {}
self.form_feature_map = {}
for (feature, value) in feature_value_list:
feature_name = feature.string_pair[language_index]
feature_value = value.string_pair[language_index]
level_exception_flag = (category.string_pair[0], feature.position, lemma) in LEVEL_EXCEPTIONS
feature_name = feature.string_pair.get(language)
feature_value = value.string_pair.get(language)
level_exception_flag = (category.string_pair.get('en'), feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
if (lexeme_level_flag):
self.lexeme_feature_map[feature_name] = feature_value
@ -188,15 +199,13 @@ class Msd:
class MsdToProperties:
    """Translate an MSD code into a ``Properties`` object via a specifications table."""

    def convert(self, specifications, msd, lemma, properties_language):
        """Build the ``Properties`` described by *msd* for *lemma*.

        Args:
            specifications: object providing ``find_category(char, language)``.
            msd: object with ``code`` (the MSD string) and ``language`` (the
                language code the MSD characters are written in).
            lemma: lemma forwarded unchanged to ``Properties``.
            properties_language: language code forwarded to ``Properties``.

        Returns:
            ``Properties(lemma, category, feature_value_list, properties_language)``.
        """
        # The first character (lower-cased) selects the category; each later
        # character is the value of the feature at that 1-based position.
        category_char = msd.code[0].lower()
        value_chars = msd.code[1:]
        category = specifications.find_category(category_char, msd.language)
        feature_value_list = []
        for index, value_char in enumerate(value_chars, start=1):
            # '-' marks a position with no value, so it contributes nothing.
            if value_char != '-':
                feature = category.find_feature(index)
                value = feature.find_value(value_char, msd.language)
                feature_value_list.append((feature, value))
        return Properties(lemma, category, feature_value_list, properties_language)

Loading…
Cancel
Save