Made non-pronoun lemmas optional, added some docstrings
This commit is contained in:
parent
53c38df80d
commit
30bafe09ac
|
@ -1,9 +1,9 @@
|
|||
import lxml.etree as lxml
|
||||
import re
|
||||
import sys
|
||||
|
||||
from conversion_utils.utils import xpath_find, get_xml_id
|
||||
|
||||
## Positions of lexeme-level features for each category
|
||||
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||
'verb':{1,2},
|
||||
'adjective':{1},
|
||||
|
@ -18,6 +18,7 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
|
|||
'residual':{1},
|
||||
'punctuation':set()}
|
||||
|
||||
## Exceptions to feature levels specified in LEXEME_FEATURE_MAP
|
||||
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
||||
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
|
||||
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
|
||||
|
@ -50,6 +51,7 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
|||
|
||||
|
||||
class Specifications:
|
||||
"""JOS specifications with list of all word categories."""
|
||||
|
||||
def __init__(self, *categories):
|
||||
self.categories = list(categories)
|
||||
|
@ -67,20 +69,8 @@ class Specifications:
|
|||
return 'categories:{categories}'.format(categories=self.categories)
|
||||
|
||||
|
||||
class Pair:
|
||||
|
||||
def __init__(self, en, sl):
|
||||
self.en = en
|
||||
self.sl = sl
|
||||
|
||||
def get(self, language):
|
||||
return getattr(self, language)
|
||||
|
||||
def __str__(self):
|
||||
return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
||||
|
||||
|
||||
class Category:
|
||||
"""JOS word category, including list of supported features."""
|
||||
|
||||
def __init__(self, names, codes, *features):
|
||||
self.names = names
|
||||
|
@ -102,6 +92,7 @@ class Category:
|
|||
|
||||
|
||||
class Feature:
|
||||
"""JOS category-dependent features, including list of supported values."""
|
||||
|
||||
def __init__(self, names, position, lexeme_level_flag, *values):
|
||||
self.names = names
|
||||
|
@ -124,6 +115,7 @@ class Feature:
|
|||
|
||||
|
||||
class Value:
|
||||
"""JOS feature-dependent values."""
|
||||
|
||||
def __init__(self, names, codes):
|
||||
self.codes = codes
|
||||
|
@ -134,7 +126,22 @@ class Value:
|
|||
format(codes=self.codes, names=self.names)
|
||||
|
||||
|
||||
class Pair:
    """Generic pair of English and Slovene strings.

    The two strings are stored as the attributes ``en`` and ``sl`` and can
    also be looked up by language code through ``get``.
    """

    # Language codes accepted by ``get``; these are exactly the attributes
    # set in ``__init__``.
    LANGUAGES = ('en', 'sl')

    def __init__(self, en, sl):
        """Store the English (``en``) and Slovene (``sl``) strings."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the string stored for ``language``.

        Args:
            language: language code, either ``'en'`` or ``'sl'``.

        Returns:
            The value passed to the constructor for that language.

        Raises:
            AttributeError: if ``language`` is not a supported code.
        """
        # Restrict the lookup to the two data attributes: a bare getattr
        # would also hand back methods and dunders (e.g. 'get', '__class__')
        # instead of reporting an unsupported language.
        if language not in self.LANGUAGES:
            raise AttributeError(
                'unsupported language {language!r}; expected one of {languages}'.
                format(language=language, languages=self.LANGUAGES))
        return getattr(self, language)

    def __str__(self):
        """Return a readable ``en:..., sl:...`` rendering of the pair."""
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
|
||||
|
||||
|
||||
class SpecificationsParser:
|
||||
"""Parser of JOS TEI specifications, yielding Specifications."""
|
||||
|
||||
def parse(self, file_name):
|
||||
root = lxml.parse(file_name).getroot()
|
||||
|
@ -174,6 +181,7 @@ class SpecificationsParser:
|
|||
|
||||
|
||||
class Properties:
|
||||
"""Representation of properties encoded in msds."""
|
||||
|
||||
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
|
||||
self.category = category
|
||||
|
@ -187,6 +195,7 @@ class Properties:
|
|||
|
||||
|
||||
class Msd:
|
||||
"""JOS msd."""
|
||||
|
||||
def __init__(self, code, language):
|
||||
self.code = code
|
||||
|
@ -197,15 +206,18 @@ class Msd:
|
|||
|
||||
|
||||
class Converter:
|
||||
"""Converter between Msd and Properties objects."""
|
||||
|
||||
def __init__(self, specifications):
|
||||
self.specifications = specifications
|
||||
|
||||
def msd_to_properties(self, msd, lemma, properties_language):
|
||||
def msd_to_properties(self, msd, language, lemma=None):
|
||||
"""Convert Msd to Properties (possibly in the other language)."""
|
||||
|
||||
category_char = msd.code[0].lower()
|
||||
value_chars = msd.code[1:]
|
||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||
category_name = category.names.get(properties_language)
|
||||
category_name = category.names.get(language)
|
||||
feature_value_list = []
|
||||
lexeme_feature_map = {}
|
||||
form_feature_map = {}
|
||||
|
@ -213,8 +225,10 @@ class Converter:
|
|||
if (value_char != '-'):
|
||||
feature = category.find_feature_by_position(index)
|
||||
value = feature.find_value_by_char(value_char, msd.language)
|
||||
feature_name = feature.names.get(properties_language)
|
||||
feature_value = value.names.get(properties_language)
|
||||
feature_name = feature.names.get(language)
|
||||
feature_value = value.names.get(language)
|
||||
if (lemma is None and category_name in [level_exception[0] for level_exception in LEVEL_EXCEPTIONS]):
|
||||
exit('[ERROR] lemma is None but feature levels depend on lemma for category {}'.format(category_name))
|
||||
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||
feature_value_list.append((feature, value))
|
||||
|
@ -222,18 +236,19 @@ class Converter:
|
|||
lexeme_feature_map[feature_name] = feature_value
|
||||
else:
|
||||
form_feature_map[feature_name] = feature_value
|
||||
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
|
||||
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
||||
|
||||
def properties_to_msd(self, properties, msd_language):
|
||||
def properties_to_msd(self, properties, language):
|
||||
"""Convert Properties to msd (possibly in the other language)."""
|
||||
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
||||
category_char = category.codes.get(msd_language).upper()
|
||||
category_char = category.codes.get(language).upper()
|
||||
feature_map = properties.lexeme_feature_map.copy()
|
||||
feature_map.update(properties.form_feature_map.copy())
|
||||
position_map = {}
|
||||
for (name, value) in feature_map.items():
|
||||
feature = category.find_feature_by_name(name, properties.language)
|
||||
value = feature.find_value_by_name(value, properties.language)
|
||||
position_map[feature.position] = value.codes.get(msd_language)
|
||||
position_map[feature.position] = value.codes.get(language)
|
||||
msd_code = category_char
|
||||
i = 0
|
||||
for position in sorted(position_map.keys()):
|
||||
|
@ -242,4 +257,4 @@ class Converter:
|
|||
msd_code += '-'
|
||||
i += 1
|
||||
msd_code += position_map[position]
|
||||
return Msd(msd_code, msd_language)
|
||||
return Msd(msd_code, language)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import os.path
|
||||
import lxml.etree as lxml
|
||||
import unittest
|
||||
|
||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Msd
|
||||
|
@ -13,49 +12,49 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
|||
self.converter = Converter(specifications)
|
||||
|
||||
def test_en_en(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'en')
|
||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en')
|
||||
self.assertEqual(properties.language, 'en')
|
||||
self.assertEqual(properties.category, 'noun')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||
|
||||
def test_en_sl(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'miza', 'sl')
|
||||
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'sl')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'samostalnik')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||
|
||||
def test_sl_en(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'en')
|
||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'en')
|
||||
self.assertEqual(properties.language, 'en')
|
||||
self.assertEqual(properties.category, 'noun')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||
|
||||
def test_sl_sl(self):
|
||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'miza', 'sl')
|
||||
properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'sl')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'samostalnik')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||
|
||||
def test_exception_feature_level(self): # testing level exception: ('pronoun', 8, 'ti')
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'ti', 'sl')
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'zaimek')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
|
||||
|
||||
def test_normal_feature_level(self): # invalid msd, but useful for testing contrast with previous test
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'vi', 'sl')
|
||||
properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi')
|
||||
self.assertEqual(properties.language, 'sl')
|
||||
self.assertEqual(properties.category, 'zaimek')
|
||||
self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
|
||||
self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
|
||||
|
||||
def test_featureless(self):
|
||||
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'in', 'en')
|
||||
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en')
|
||||
self.assertEqual(properties.language, 'en')
|
||||
self.assertEqual(properties.category, 'conjunction')
|
||||
self.assertEqual(properties.lexeme_feature_map, {})
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import os.path
|
||||
import lxml.etree as lxml
|
||||
import unittest
|
||||
|
||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Properties
|
||||
|
|
Loading…
Reference in New Issue
Block a user