Made non-pronoun lemmas optional, added some docstrings
This commit is contained in:
parent
53c38df80d
commit
30bafe09ac
|
@ -1,9 +1,9 @@
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
import re
|
import re
|
||||||
import sys
|
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id
|
from conversion_utils.utils import xpath_find, get_xml_id
|
||||||
|
|
||||||
|
## Positions of lexeme-level features for each category
|
||||||
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||||
'verb':{1,2},
|
'verb':{1,2},
|
||||||
'adjective':{1},
|
'adjective':{1},
|
||||||
|
@ -18,6 +18,7 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||||
'residual':{1},
|
'residual':{1},
|
||||||
'punctuation':set()}
|
'punctuation':set()}
|
||||||
|
|
||||||
|
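## Exceptions to the feature levels specified in LEXEME_FEATURE_MAP: each entry is a (category name, feature position, lemma) triple for which the feature's usual level (lexeme vs. form) is inverted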
|
||||||
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
||||||
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
|
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
|
||||||
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
|
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
|
||||||
|
@ -50,6 +51,7 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
||||||
|
|
||||||
|
|
||||||
class Specifications:
|
class Specifications:
|
||||||
|
"""JOS specifications with list of all word categories."""
|
||||||
|
|
||||||
def __init__(self, *categories):
|
def __init__(self, *categories):
|
||||||
self.categories = list(categories)
|
self.categories = list(categories)
|
||||||
|
@ -67,20 +69,8 @@ class Specifications:
|
||||||
return 'categories:{categories}'.format(categories=self.categories)
|
return 'categories:{categories}'.format(categories=self.categories)
|
||||||
|
|
||||||
|
|
||||||
class Pair:
|
|
||||||
|
|
||||||
def __init__(self, en, sl):
|
|
||||||
self.en = en
|
|
||||||
self.sl = sl
|
|
||||||
|
|
||||||
def get(self, language):
|
|
||||||
return getattr(self, language)
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
|
||||||
|
|
||||||
|
|
||||||
class Category:
|
class Category:
|
||||||
|
"""JOS word category, including list of supported features."""
|
||||||
|
|
||||||
def __init__(self, names, codes, *features):
|
def __init__(self, names, codes, *features):
|
||||||
self.names = names
|
self.names = names
|
||||||
|
@ -102,6 +92,7 @@ class Category:
|
||||||
|
|
||||||
|
|
||||||
class Feature:
|
class Feature:
|
||||||
|
"""JOS category-dependent features, including list of supported values."""
|
||||||
|
|
||||||
def __init__(self, names, position, lexeme_level_flag, *values):
|
def __init__(self, names, position, lexeme_level_flag, *values):
|
||||||
self.names = names
|
self.names = names
|
||||||
|
@ -124,6 +115,7 @@ class Feature:
|
||||||
|
|
||||||
|
|
||||||
class Value:
|
class Value:
|
||||||
|
"""JOS feature-dependent values."""
|
||||||
|
|
||||||
def __init__(self, names, codes):
|
def __init__(self, names, codes):
|
||||||
self.codes = codes
|
self.codes = codes
|
||||||
|
@ -134,7 +126,22 @@ class Value:
|
||||||
format(codes=self.codes, names=self.names)
|
format(codes=self.codes, names=self.names)
|
||||||
|
|
||||||
|
|
||||||
|
class Pair:
|
||||||
|
"""Generic pair of English and Slovene strings."""
|
||||||
|
|
||||||
|
def __init__(self, en, sl):
|
||||||
|
self.en = en
|
||||||
|
self.sl = sl
|
||||||
|
|
||||||
|
def get(self, language):
|
||||||
|
return getattr(self, language)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
||||||
|
|
||||||
|
|
||||||
class SpecificationsParser:
|
class SpecificationsParser:
|
||||||
|
"""Parser of JOS TEI specifications, yielding Specifications."""
|
||||||
|
|
||||||
def parse(self, file_name):
|
def parse(self, file_name):
|
||||||
root = lxml.parse(file_name).getroot()
|
root = lxml.parse(file_name).getroot()
|
||||||
|
@ -174,6 +181,7 @@ class SpecificationsParser:
|
||||||
|
|
||||||
|
|
||||||
class Properties:
|
class Properties:
|
||||||
|
"""Representation of properties encoded in msds."""
|
||||||
|
|
||||||
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
|
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
|
||||||
self.category = category
|
self.category = category
|
||||||
|
@ -187,6 +195,7 @@ class Properties:
|
||||||
|
|
||||||
|
|
||||||
class Msd:
|
class Msd:
|
||||||
|
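"""JOS morphosyntactic description (MSD): a code string together with the language ('en' or 'sl') in which the code is written."""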
|
||||||
|
|
||||||
def __init__(self, code, language):
|
def __init__(self, code, language):
|
||||||
self.code = code
|
self.code = code
|
||||||
|
@ -197,15 +206,18 @@ class Msd:
|
||||||
|
|
||||||
|
|
||||||
class Converter:
|
class Converter:
|
||||||
|
"""Converter between Msd and Properties objects."""
|
||||||
|
|
||||||
def __init__(self, specifications):
|
def __init__(self, specifications):
|
||||||
self.specifications = specifications
|
self.specifications = specifications
|
||||||
|
|
||||||
def msd_to_properties(self, msd, lemma, properties_language):
|
def msd_to_properties(self, msd, language, lemma=None):
|
||||||
|
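"""Convert Msd to Properties (possibly in the other language); lemma may be omitted except for categories that appear in LEVEL_EXCEPTIONS, whose feature levels depend on the lemma."""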
|
||||||
|
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||||
category_name = category.names.get(properties_language)
|
category_name = category.names.get(language)
|
||||||
feature_value_list = []
|
feature_value_list = []
|
||||||
lexeme_feature_map = {}
|
lexeme_feature_map = {}
|
||||||
form_feature_map = {}
|
form_feature_map = {}
|
||||||
|
@ -213,8 +225,10 @@ class Converter:
|
||||||
if (value_char != '-'):
|
if (value_char != '-'):
|
||||||
feature = category.find_feature_by_position(index)
|
feature = category.find_feature_by_position(index)
|
||||||
value = feature.find_value_by_char(value_char, msd.language)
|
value = feature.find_value_by_char(value_char, msd.language)
|
||||||
feature_name = feature.names.get(properties_language)
|
feature_name = feature.names.get(language)
|
||||||
feature_value = value.names.get(properties_language)
|
feature_value = value.names.get(language)
|
||||||
|
if (lemma is None and category_name in [level_exception[0] for level_exception in LEVEL_EXCEPTIONS]):
|
||||||
|
exit('[ERROR] lemma is None but feature levels depend on lemma for category {}'.format(category_name))
|
||||||
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||||
feature_value_list.append((feature, value))
|
feature_value_list.append((feature, value))
|
||||||
|
@ -222,18 +236,19 @@ class Converter:
|
||||||
lexeme_feature_map[feature_name] = feature_value
|
lexeme_feature_map[feature_name] = feature_value
|
||||||
else:
|
else:
|
||||||
form_feature_map[feature_name] = feature_value
|
form_feature_map[feature_name] = feature_value
|
||||||
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
|
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
||||||
|
|
||||||
def properties_to_msd(self, properties, msd_language):
|
def properties_to_msd(self, properties, language):
|
||||||
|
"""Convert Properties to Msd (possibly in the other language)."""
|
||||||
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
||||||
category_char = category.codes.get(msd_language).upper()
|
category_char = category.codes.get(language).upper()
|
||||||
feature_map = properties.lexeme_feature_map.copy()
|
feature_map = properties.lexeme_feature_map.copy()
|
||||||
feature_map.update(properties.form_feature_map.copy())
|
feature_map.update(properties.form_feature_map.copy())
|
||||||
position_map = {}
|
position_map = {}
|
||||||
for (name, value) in feature_map.items():
|
for (name, value) in feature_map.items():
|
||||||
feature = category.find_feature_by_name(name, properties.language)
|
feature = category.find_feature_by_name(name, properties.language)
|
||||||
value = feature.find_value_by_name(value, properties.language)
|
value = feature.find_value_by_name(value, properties.language)
|
||||||
position_map[feature.position] = value.codes.get(msd_language)
|
position_map[feature.position] = value.codes.get(language)
|
||||||
msd_code = category_char
|
msd_code = category_char
|
||||||
i = 0
|
i = 0
|
||||||
for position in sorted(position_map.keys()):
|
for position in sorted(position_map.keys()):
|
||||||
|
@ -242,4 +257,4 @@ class Converter:
|
||||||
msd_code += '-'
|
msd_code += '-'
|
||||||
i += 1
|
i += 1
|
||||||
msd_code += position_map[position]
|
msd_code += position_map[position]
|
||||||
return Msd(msd_code, msd_language)
|
return Msd(msd_code, language)
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import os.path
|
import os.path
|
||||||
import lxml.etree as lxml
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Msd
|
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Msd
|
||||||
|
@ -13,49 +12,49 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
||||||
self.converter = Converter(specifications)
|
self.converter = Converter(specifications)
|
||||||
|
|
||||||
def test_en_en(self):
|
def test_en_en(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'en')
|
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en')
|
||||||
self.assertEqual(properties.language, 'en')
|
self.assertEqual(properties.language, 'en')
|
||||||
self.assertEqual(properties.category, 'noun')
|
self.assertEqual(properties.category, 'noun')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||||
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||||
|
|
||||||
def test_en_sl(self):
|
def test_en_sl(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'sl')
|
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'sl')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'samostalnik')
|
self.assertEqual(properties.category, 'samostalnik')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||||
|
|
||||||
def test_sl_en(self):
|
def test_sl_en(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'en')
|
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'en')
|
||||||
self.assertEqual(properties.language, 'en')
|
self.assertEqual(properties.language, 'en')
|
||||||
self.assertEqual(properties.category, 'noun')
|
self.assertEqual(properties.category, 'noun')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||||
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||||
|
|
||||||
def test_sl_sl(self):
|
def test_sl_sl(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'sl')
|
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'sl')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'samostalnik')
|
self.assertEqual(properties.category, 'samostalnik')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||||
|
|
||||||
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti')
|
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti')
|
||||||
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'ti', 'sl')
|
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'zaimek')
|
self.assertEqual(properties.category, 'zaimek')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
||||||
|
|
||||||
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test
|
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test
|
||||||
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'vi', 'sl')
|
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi')
|
||||||
self.assertEqual(properties.language, 'sl')
|
self.assertEqual(properties.language, 'sl')
|
||||||
self.assertEqual(properties.category, 'zaimek')
|
self.assertEqual(properties.category, 'zaimek')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
|
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
|
||||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||||
|
|
||||||
def test_featureless(self):
|
def test_featureless(self):
|
||||||
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'in', 'en')
|
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en')
|
||||||
self.assertEqual(properties.language, 'en')
|
self.assertEqual(properties.language, 'en')
|
||||||
self.assertEqual(properties.category, 'conjunction')
|
self.assertEqual(properties.category, 'conjunction')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {})
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import os.path
|
import os.path
|
||||||
import lxml.etree as lxml
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Properties
|
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Properties
|
||||||
|
|
Loading…
Reference in New Issue
Block a user