2021-09-10 08:40:59 +00:00
|
|
|
import importlib.resources as pkg_resources
import pickle
import re
import sys

import lxml.etree as lxml

from conversion_utils.utils import xpath_find, get_xml_id
|
|
|
|
|
2021-09-29 22:22:43 +00:00
|
|
|
## Name of the pickled specifications resource looked up in the
## 'conversion_utils.resources' package when no xml file is given to Converter
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
|
|
|
|
|
2021-09-10 13:52:11 +00:00
|
|
|
## For each (English) category name, the msd positions whose features are
## lexeme-level; every other position of that category is form-level
LEXEME_FEATURE_MAP = {
    'noun': {1, 2},
    'verb': {1, 2},
    'adjective': {1},
    'adverb': {1},
    'pronoun': {1, 2, 6, 7, 8},
    'numeral': {1, 2},
    'preposition': {1},
    'conjunction': {1},
    'particle': set(),
    'interjection': set(),
    'abbreviation': set(),
    'residual': {1},
    'punctuation': set(),
}
|
|
|
|
|
2021-09-10 13:52:11 +00:00
|
|
|
## Exceptions to feature levels specified in LEXEME_FEATURE_MAP.
## Each entry is (category name, feature position, lemma); every exception is
## listed under both the English ('pronoun') and Slovene ('zaimek') category name.
LEVEL_EXCEPTIONS = {
    (category_name, position, lemma)
    for position, lemmas in (
        (2, ('čezme', 'medme', 'nadme', 'name', 'obme', 'podme',
             'pome', 'predme', 'skozme', 'vame', 'zame')),
        (3, ('tadva',)),
        (4, ('tadva',)),
        (5, ('čezme', 'medme', 'nadme', 'name', 'obme', 'podme',
             'pome', 'predme', 'skozme', 'vame', 'zame')),
        (7, ('njegov',)),
        (8, ('jaz', 'on', 'se', 'ti')),
    )
    for lemma in lemmas
    for category_name in ('pronoun', 'zaimek')
}
|
2021-09-10 08:40:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Specifications:
    """Holder of the JOS specifications: word categories and known msd codes."""

    def __init__(self):
        # Category objects, in insertion order.
        self.categories = []
        # Known msd codes, one set per language key.
        self.codes_map = {'en': set(), 'sl': set()}

    def add_category(self, category):
        """Append `category` to the list of categories."""
        self.categories.append(category)

    def add_code(self, code, language):
        """Record msd `code` under `language` ('en' or 'sl')."""
        self.codes_map[language].add(code)

    def find_category_by_code(self, char, language):
        """Return the first category whose code in `language` equals `char`, or None."""
        for candidate in self.categories:
            if candidate.codes.get(language) == char:
                return candidate
        return None

    def find_category_by_name(self, name, language):
        """Return the first category whose name in `language` equals `name`, or None."""
        for candidate in self.categories:
            if candidate.names.get(language) == name:
                return candidate
        return None

    def __str__(self):
        return f'categories:{self.categories}'
|
|
|
|
|
|
|
|
|
|
|
|
class Category:
    """JOS word category, including list of supported features."""

    def __init__(self, names, codes, *features):
        """Create a category.

        names and codes are objects answering .get(language) (Pair instances
        in this module); any extra positional arguments are the initial
        features.
        """
        self.names = names
        self.codes = codes
        self.features = list(features)

    def add_feature(self, feature):
        """Append `feature` to the list of supported features."""
        self.features.append(feature)

    def find_feature_by_position(self, position):
        """Return the first feature at msd `position`, or None if there is none."""
        return next((feature for feature in self.features if feature.position == position), None)

    def find_feature_by_name(self, name, language):
        """Return the first feature whose name in `language` equals `name`, or None."""
        return next((feature for feature in self.features if feature.names.get(language) == name), None)

    def __str__(self):
        # The keyword names must match the placeholders; the previous
        # strings=/chars= keywords made every str() call raise KeyError.
        return 'names:{names}, codes:{codes}, features:{features}'.\
            format(names=self.names, codes=self.codes, features=self.features)
|
2021-09-10 08:40:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Feature:
    """JOS category-dependent features, including list of supported values."""

    def __init__(self, names, position, lexeme_level_flag, *values):
        """Create a feature.

        names answers .get(language); position is the feature's position in
        the msd; lexeme_level_flag tells whether the feature is lexeme-level
        (otherwise form-level); extra positional arguments are the initial
        values.
        """
        self.names = names
        self.position = position
        self.lexeme_level_flag = lexeme_level_flag
        self.values = list(values)

    def add_value(self, value):
        """Append `value` to the list of supported values."""
        self.values.append(value)

    def find_value_by_char(self, char, language):
        """Return the first value whose code in `language` equals `char`, or None."""
        return next((value for value in self.values if value.codes.get(language) == char), None)

    def find_value_by_name(self, name, language):
        """Return the first value whose name in `language` equals `name`, or None."""
        return next((value for value in self.values if value.names.get(language) == name), None)

    def __str__(self):
        # names= must match the {names} placeholder (strings= raised KeyError).
        # The level is reported as 'lexeme' or 'form'; the earlier 'level'
        # label produced the meaningless text "level:level".
        return 'names:{names}, position:{position}, level:{level}, values:{values}'.\
            format(names=self.names, position=self.position,
                   level='lexeme' if self.lexeme_level_flag else 'form', values=self.values)
|
2021-09-10 08:40:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Value:
    """A single value a JOS feature can take, with its names and codes."""

    def __init__(self, names, codes):
        # Both arguments are stored as given; elsewhere in the module they
        # are queried with .get(language).
        self.codes = codes
        self.names = names

    def __str__(self):
        return f'codes:{self.codes}, names:{self.names}'
|
2021-09-10 08:40:59 +00:00
|
|
|
|
|
|
|
|
2021-09-10 13:52:11 +00:00
|
|
|
class Pair:
    """Two parallel strings: the English one (en) and the Slovene one (sl)."""

    def __init__(self, en, sl):
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the attribute named by `language` (i.e. 'en' or 'sl')."""
        return getattr(self, language)

    def __str__(self):
        return f'en:{self.en}, sl:{self.sl}'
|
|
|
|
|
|
|
|
|
2021-09-10 13:30:04 +00:00
|
|
|
class SpecificationsParser:
    """Reads the JOS TEI specifications xml and builds a Specifications object."""

    def parse(self, file_name):
        """Parse `file_name` and return the populated Specifications.

        The div with xml id 'msd.msds-sl' supplies the list of known msd
        codes; every div whose id matches 'msd.<char>-sl' describes one word
        category with its features and their values.
        """
        tree_root = lxml.parse(file_name).getroot()
        divs = xpath_find(tree_root, 'tei:div')
        specifications = Specifications()

        for div in divs:
            div_id = get_xml_id(div)

            if div_id == 'msd.msds-sl':
                # Table of all msds: register each code in both languages.
                for msd_row in xpath_find(div, 'tei:table/tei:row[@role="msd"]'):
                    msd_pair = self.get_cell_pair(msd_row, 'msd')
                    for code_language in ('en', 'sl'):
                        # get_cell lower-cases the text, so capitalize() restores
                        # the upper-case first character of the code.
                        specifications.add_code(msd_pair.get(code_language).capitalize(), code_language)

            elif re.match(r'^msd\..-sl', div_id):
                # Table of one category: the "type" row carries its name and code.
                type_row = xpath_find(div, 'tei:table/tei:row[@role="type"]')[0]
                names_of_category = self.get_cell_pair(type_row, 'value')
                codes_of_category = self.get_cell_pair(type_row, 'code')
                category = Category(names_of_category, codes_of_category)
                specifications.add_category(category)

                for attribute_row in xpath_find(div, 'tei:table/tei:row[@role="attribute"]'):
                    names_of_feature = self.get_cell_pair(attribute_row, 'name')
                    position = int(self.get_cell(attribute_row, 'position'))
                    is_lexeme_level = position in LEXEME_FEATURE_MAP[names_of_category.get('en')]
                    feature = Feature(names_of_feature, position, is_lexeme_level)
                    category.add_feature(feature)

                    value_rows = xpath_find(
                        attribute_row,
                        'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
                    for value_row in value_rows:
                        # "name" cells hold the value names, "code" cells the codes.
                        names_of_value = self.get_cell_pair(value_row, 'name')
                        codes_of_value = self.get_cell_pair(value_row, 'code')
                        feature.add_value(Value(names_of_value, codes_of_value))

        return specifications

    def get_cell(self, row, role, language=None):
        """Return the lower-cased text of the first cell of `row` with the given role.

        When `language` is given, the cell must also carry that xml:lang.
        The text 'adposition' is returned as 'preposition'.
        """
        predicate = '@role="' + role + '"'
        if language is not None:
            predicate += ' and @xml:lang="' + language + '"'
        cell_text = xpath_find(row, 'tei:cell[' + predicate + ']')[0].text.lower()
        return 'preposition' if cell_text == 'adposition' else cell_text

    def get_cell_pair(self, row, role):
        """Return a Pair with the English and Slovene text of the cells with `role`."""
        english_text = self.get_cell(row, role, 'en')
        slovene_text = self.get_cell(row, role, 'sl')
        return Pair(english_text, slovene_text)
|
2021-09-10 08:40:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Properties:
    """The information encoded in an msd, spelled out as named properties."""

    def __init__(self, category, lexeme_feature_map, form_feature_map, language):
        # category: category name; the two maps: feature name -> value name,
        # split into lexeme-level and form-level features; language: the
        # language the names are given in.
        self.category = category
        self.lexeme_feature_map = lexeme_feature_map
        self.form_feature_map = form_feature_map
        self.language = language

    def __str__(self):
        return (
            f'language={self.language}, category={self.category}, '
            f'lexeme features={self.lexeme_feature_map!s}, '
            f'form_features={self.form_feature_map!s}'
        )
|
|
|
|
|
|
|
|
|
2021-09-10 11:49:34 +00:00
|
|
|
class Msd:
    """A JOS msd: its code together with the language the code is written in."""

    def __init__(self, code, language):
        self.code = code
        self.language = language

    def __str__(self):
        return f'code={self.code}, language={self.language}'
|
|
|
|
|
|
|
|
|
2021-09-10 13:15:04 +00:00
|
|
|
class Converter:
    """Converter between Msd and Properties objects."""

    def __init__(self, xml_file_name=None):
        """Load the JOS specifications.

        Without `xml_file_name` the specifications are unpickled from the
        JOS_SPECIFICATIONS_PICKLE_RESOURCE resource of the
        'conversion_utils.resources' package; otherwise they are parsed from
        the given TEI xml file. Any failure terminates via SystemExit with an
        explanatory message.
        """
        if xml_file_name is None:
            if not pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE):
                sys.exit('No pickle installed or xml provided.')
            try:
                # NOTE: pickle.load runs arbitrary code from the file; this is
                # only acceptable because the resource ships with the package.
                with pkg_resources.open_binary('conversion_utils.resources',
                                               JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
                    self.specifications = pickle.load(pickle_file)
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                sys.exit('Could not parse specifications pickle file installed.')
        else:
            parser = SpecificationsParser()
            try:
                self.specifications = parser.parse(xml_file_name)
            except Exception:
                sys.exit('Could not parse specifications xml file provided.')

    def msd_to_properties(self, msd, language, lemma=None):
        """Convert Msd to Properties (possibly in the other language).

        msd is the Msd to convert, language is the language of the names in
        the result, and lemma is needed only when the level (lexeme vs form)
        of one of the msd's features depends on the lemma (see
        LEVEL_EXCEPTIONS).

        Exits via SystemExit when the msd code is unknown, or when lemma is
        None although a feature's level depends on it.
        """
        if msd.code not in self.specifications.codes_map[msd.language]:
            sys.exit('[ERROR] msd {} is unknown'.format(msd.code))

        category_char = msd.code[0].lower()
        value_chars = msd.code[1:]
        category = self.specifications.find_category_by_code(category_char, msd.language)
        category_name = category.names.get(language)

        # Positions of this category whose level can be overridden per lemma;
        # computed once instead of on every loop iteration.
        lemma_dependent_positions = {
            position
            for (exception_category, position, _lemma) in LEVEL_EXCEPTIONS
            if exception_category == category_name
        }

        lexeme_feature_map = {}
        form_feature_map = {}
        for (index, value_char) in enumerate(value_chars, start=1):
            if value_char == '-':
                # '-' marks a position with no value
                continue

            feature = category.find_feature_by_position(index)
            value = feature.find_value_by_char(value_char, msd.language)
            feature_name = feature.names.get(language)
            feature_value = value.names.get(language)

            if lemma is None and index in lemma_dependent_positions:
                sys.exit('[ERROR] lemma is None but feature level depends on lemma for category={category}, position={position}'
                         .format(category=category_name, position=index))

            # A listed exception flips the default level of the feature.
            level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
            lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag

            if lexeme_level_flag:
                lexeme_feature_map[feature_name] = feature_value
            else:
                form_feature_map[feature_name] = feature_value

        return Properties(category_name, lexeme_feature_map, form_feature_map, language)

    def properties_to_msd(self, properties, language):
        """Convert Properties to msd (possibly in the other language).

        properties is read in its own language (properties.language); the
        returned Msd carries the code in `language`.
        """
        category = self.specifications.find_category_by_name(properties.category, properties.language)
        category_char = category.codes.get(language).upper()

        # Form-level entries win over lexeme-level ones on a name clash.
        feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}

        position_map = {}
        for (name, value_name) in feature_map.items():
            feature = category.find_feature_by_name(name, properties.language)
            value = feature.find_value_by_name(value_name, properties.language)
            position_map[feature.position] = value.codes.get(language)

        # Emit one character per position up to the last used one, filling
        # positions without a value with '-'.
        last_position = max(position_map, default=0)
        msd_code = category_char + ''.join(
            position_map.get(position, '-') for position in range(1, last_position + 1))

        return Msd(msd_code, language)
|