|
|
|
@ -1,9 +1,9 @@
|
|
|
|
|
import lxml.etree as lxml
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
from conversion_utils.utils import xpath_find, get_xml_id
|
|
|
|
|
|
|
|
|
|
## Positions of lexeme-level features for each category
|
|
|
|
|
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
|
|
|
|
'verb':{1,2},
|
|
|
|
|
'adjective':{1},
|
|
|
|
@ -18,6 +18,7 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
|
|
|
|
|
'residual':{1},
|
|
|
|
|
'punctuation':set()}
|
|
|
|
|
|
|
|
|
|
## Exceptions to feature levels specified in LEXEME_FEATURE_MAP
|
|
|
|
|
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
|
|
|
|
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
|
|
|
|
|
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
|
|
|
|
@ -50,6 +51,7 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Specifications:
|
|
|
|
|
"""JOS specifications with list of all word categories."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, *categories):
|
|
|
|
|
self.categories = list(categories)
|
|
|
|
@ -67,20 +69,8 @@ class Specifications:
|
|
|
|
|
return 'categories:{categories}'.format(categories=self.categories)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Pair:
    """Generic pair of English and Slovene strings."""

    def __init__(self, en, sl):
        """Store the English (en) and Slovene (sl) variants."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored under the given language key.

        Only values stored on the instance (normally 'en' and 'sl') can be
        looked up. A plain getattr would also hand back methods and class
        internals for names such as 'get' or '__class__', hiding a wrong
        language key until much later.

        Raises AttributeError if no value is stored for the language.
        """
        stored = vars(self)
        if language not in stored:
            raise AttributeError(
                "'Pair' object has no language {!r}".format(language))
        return stored[language]

    def __str__(self):
        """Return both variants formatted as 'en:<en>, sl:<sl>'."""
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Category:
|
|
|
|
|
"""JOS word category, including list of supported features."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, names, codes, *features):
|
|
|
|
|
self.names = names
|
|
|
|
@ -102,6 +92,7 @@ class Category:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Feature:
|
|
|
|
|
"""JOS category-dependent features, including list of supported values."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, names, position, lexeme_level_flag, *values):
|
|
|
|
|
self.names = names
|
|
|
|
@ -124,6 +115,7 @@ class Feature:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Value:
|
|
|
|
|
"""JOS feature-dependent values."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, names, codes):
|
|
|
|
|
self.codes = codes
|
|
|
|
@ -134,7 +126,22 @@ class Value:
|
|
|
|
|
format(codes=self.codes, names=self.names)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Pair:
    """Generic pair of English and Slovene strings."""

    def __init__(self, en, sl):
        """Store the English (en) and Slovene (sl) variants."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored under the given language key.

        Lookup is limited to values stored on the instance (normally 'en'
        and 'sl'); an unrestricted getattr would also return methods and
        class internals for names such as 'get' or '__class__'.

        Raises AttributeError if no value is stored for the language.
        """
        stored = vars(self)
        if language not in stored:
            raise AttributeError(
                "'Pair' object has no language {!r}".format(language))
        return stored[language]

    def __str__(self):
        """Return both variants formatted as 'en:<en>, sl:<sl>'."""
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpecificationsParser:
|
|
|
|
|
"""Parser of JOS TEI specifications, yielding Specifications."""
|
|
|
|
|
|
|
|
|
|
def parse(self, file_name):
|
|
|
|
|
root = lxml.parse(file_name).getroot()
|
|
|
|
@ -174,6 +181,7 @@ class SpecificationsParser:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Properties:
|
|
|
|
|
"""Representation of properties encoded in msds."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
|
|
|
|
|
self.category = category
|
|
|
|
@ -187,6 +195,7 @@ class Properties:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Msd:
|
|
|
|
|
"""JOS msd."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, code, language):
|
|
|
|
|
self.code = code
|
|
|
|
@ -197,15 +206,18 @@ class Msd:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Converter:
|
|
|
|
|
"""Converter between Msd and Properties objects."""
|
|
|
|
|
|
|
|
|
|
def __init__(self, specifications):
|
|
|
|
|
self.specifications = specifications
|
|
|
|
|
|
|
|
|
|
def msd_to_properties(self, msd, lemma, properties_language):
|
|
|
|
|
def msd_to_properties(self, msd, language, lemma=None):
|
|
|
|
|
"""Convert Msd to Properties (possibly in the other language)."""
|
|
|
|
|
|
|
|
|
|
category_char = msd.code[0].lower()
|
|
|
|
|
value_chars = msd.code[1:]
|
|
|
|
|
category = self.specifications.find_category_by_code(category_char, msd.language)
|
|
|
|
|
category_name = category.names.get(properties_language)
|
|
|
|
|
category_name = category.names.get(language)
|
|
|
|
|
feature_value_list = []
|
|
|
|
|
lexeme_feature_map = {}
|
|
|
|
|
form_feature_map = {}
|
|
|
|
@ -213,8 +225,10 @@ class Converter:
|
|
|
|
|
if (value_char != '-'):
|
|
|
|
|
feature = category.find_feature_by_position(index)
|
|
|
|
|
value = feature.find_value_by_char(value_char, msd.language)
|
|
|
|
|
feature_name = feature.names.get(properties_language)
|
|
|
|
|
feature_value = value.names.get(properties_language)
|
|
|
|
|
feature_name = feature.names.get(language)
|
|
|
|
|
feature_value = value.names.get(language)
|
|
|
|
|
if (lemma is None and category_name in [level_exception[0] for level_exception in LEVEL_EXCEPTIONS]):
|
|
|
|
|
exit('[ERROR] lemma is None but feature levels depend on lemma for category {}'.format(category_name))
|
|
|
|
|
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
|
|
|
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
|
|
|
|
feature_value_list.append((feature, value))
|
|
|
|
@ -222,18 +236,19 @@ class Converter:
|
|
|
|
|
lexeme_feature_map[feature_name] = feature_value
|
|
|
|
|
else:
|
|
|
|
|
form_feature_map[feature_name] = feature_value
|
|
|
|
|
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
|
|
|
|
|
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
|
|
|
|
|
|
|
|
|
def properties_to_msd(self, properties, msd_language):
|
|
|
|
|
def properties_to_msd(self, properties, language):
|
|
|
|
|
"""Convert Properties to msd (possibly in the other language)."""
|
|
|
|
|
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
|
|
|
|
category_char = category.codes.get(msd_language).upper()
|
|
|
|
|
category_char = category.codes.get(language).upper()
|
|
|
|
|
feature_map = properties.lexeme_feature_map.copy()
|
|
|
|
|
feature_map.update(properties.form_feature_map.copy())
|
|
|
|
|
position_map = {}
|
|
|
|
|
for (name, value) in feature_map.items():
|
|
|
|
|
feature = category.find_feature_by_name(name, properties.language)
|
|
|
|
|
value = feature.find_value_by_name(value, properties.language)
|
|
|
|
|
position_map[feature.position] = value.codes.get(msd_language)
|
|
|
|
|
position_map[feature.position] = value.codes.get(language)
|
|
|
|
|
msd_code = category_char
|
|
|
|
|
i = 0
|
|
|
|
|
for position in sorted(position_map.keys()):
|
|
|
|
@ -242,4 +257,4 @@ class Converter:
|
|
|
|
|
msd_code += '-'
|
|
|
|
|
i += 1
|
|
|
|
|
msd_code += position_map[position]
|
|
|
|
|
return Msd(msd_code, msd_language)
|
|
|
|
|
return Msd(msd_code, language)
|
|
|
|
|