Made non-pronoun lemmas optional, added some docstrings

multiple_files_conllu_to_tei
Cyprian Laskowski 3 years ago
parent 53c38df80d
commit 30bafe09ac

@ -1,9 +1,9 @@
import lxml.etree as lxml
import re
import sys
from conversion_utils.utils import xpath_find, get_xml_id
## Positions of lexeme-level features for each category
LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2},
'adjective':{1},
@ -18,6 +18,7 @@ LEXEME_FEATURE_MAP = {'noun':{1,2},
'residual':{1},
'punctuation':set()}
## Exceptions to feature levels specified in LEXEME_FEATURE_MAP
LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 2, 'medme'), ('zaimek', 2, 'medme'),
('pronoun', 2, 'nadme'), ('zaimek', 2, 'nadme'),
@ -50,6 +51,7 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
class Specifications:
"""JOS specifications with list of all word categories."""
def __init__(self, *categories):
self.categories = list(categories)
@ -67,20 +69,8 @@ class Specifications:
return 'categories:{categories}'.format(categories=self.categories)
class Pair:
    """Generic pair of English and Slovene strings."""

    def __init__(self, en, sl):
        """Store the English (``en``) and Slovene (``sl``) variants."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored under ``language`` ('en' or 'sl').

        Raises:
            AttributeError: if no variant is stored under ``language``.
        """
        # Look only at the stored instance data: a bare getattr() would also
        # hand back methods or dunder attributes (e.g. 'get', '__class__')
        # when given a bogus language code.
        try:
            return vars(self)[language]
        except KeyError:
            raise AttributeError(
                'no {language!r} variant in pair ({pair})'.format(language=language, pair=self)
            ) from None

    def __str__(self):
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
class Category:
"""JOS word category, including list of supported features."""
def __init__(self, names, codes, *features):
self.names = names
@ -102,6 +92,7 @@ class Category:
class Feature:
"""JOS category-dependent features, including list of supported values."""
def __init__(self, names, position, lexeme_level_flag, *values):
self.names = names
@ -124,6 +115,7 @@ class Feature:
class Value:
"""JOS feature-dependent values."""
def __init__(self, names, codes):
self.codes = codes
@ -134,7 +126,22 @@ class Value:
format(codes=self.codes, names=self.names)
class Pair:
    """Generic pair of English and Slovene strings."""

    def __init__(self, en, sl):
        """Store the English (``en``) and Slovene (``sl``) variants."""
        self.en = en
        self.sl = sl

    def get(self, language):
        """Return the variant stored under ``language`` ('en' or 'sl').

        Raises:
            AttributeError: if no variant is stored under ``language``.
        """
        # Look only at the stored instance data: a bare getattr() would also
        # hand back methods or dunder attributes (e.g. 'get', '__class__')
        # when given a bogus language code.
        try:
            return vars(self)[language]
        except KeyError:
            raise AttributeError(
                'no {language!r} variant in pair ({pair})'.format(language=language, pair=self)
            ) from None

    def __str__(self):
        return 'en:{en}, sl:{sl}'.format(en=self.en, sl=self.sl)
class SpecificationsParser:
"""Parser of JOS TEI specifications, yielding Specifications."""
def parse(self, file_name):
root = lxml.parse(file_name).getroot()
@ -174,6 +181,7 @@ class SpecificationsParser:
class Properties:
"""Representation of properties encoded in msds."""
def __init__(self, category, lexeme_feature_map, form_feature_map, language):
self.category = category
@ -187,6 +195,7 @@ class Properties:
class Msd:
"""JOS msd."""
def __init__(self, code, language):
self.code = code
@ -197,15 +206,18 @@ class Msd:
class Converter:
"""Converter between Msd and Properties objects."""
def __init__(self, specifications):
self.specifications = specifications
def msd_to_properties(self, msd, lemma, properties_language):
def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language)."""
category_char = msd.code[0].lower()
value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language)
category_name = category.names.get(properties_language)
category_name = category.names.get(language)
feature_value_list = []
lexeme_feature_map = {}
form_feature_map = {}
@ -213,8 +225,10 @@ class Converter:
if (value_char != '-'):
feature = category.find_feature_by_position(index)
value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.names.get(properties_language)
feature_value = value.names.get(properties_language)
feature_name = feature.names.get(language)
feature_value = value.names.get(language)
if (lemma is None and category_name in [level_exception[0] for level_exception in LEVEL_EXCEPTIONS]):
exit('[ERROR] lemma is None but feature levels depend on lemma for category {}'.format(category_name))
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
feature_value_list.append((feature, value))
@ -222,18 +236,19 @@ class Converter:
lexeme_feature_map[feature_name] = feature_value
else:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, properties_language)
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, msd_language):
def properties_to_msd(self, properties, language):
"""Convert Properties to msd (possibly in the other language)."""
category = self.specifications.find_category_by_name(properties.category, properties.language)
category_char = category.codes.get(msd_language).upper()
category_char = category.codes.get(language).upper()
feature_map = properties.lexeme_feature_map.copy()
feature_map.update(properties.form_feature_map.copy())
position_map = {}
for (name, value) in feature_map.items():
feature = category.find_feature_by_name(name, properties.language)
value = feature.find_value_by_name(value, properties.language)
position_map[feature.position] = value.codes.get(msd_language)
position_map[feature.position] = value.codes.get(language)
msd_code = category_char
i = 0
for position in sorted(position_map.keys()):
@ -242,4 +257,4 @@ class Converter:
msd_code += '-'
i += 1
msd_code += position_map[position]
return Msd(msd_code, msd_language)
return Msd(msd_code, language)

@ -1,5 +1,4 @@
import os.path
import lxml.etree as lxml
import unittest
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Msd
@ -13,49 +12,49 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.converter = Converter(specifications)
def test_en_en(self):
    # English msd -> English properties; no lemma is passed for this noun.
    properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en')
    self.assertEqual(properties.language, 'en')
    self.assertEqual(properties.category, 'noun')
    # Type and gender are expected at lexeme level, number and case at form level.
    self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
    self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
def test_en_sl(self):
    # English msd -> Slovene properties; no lemma is passed for this noun.
    properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'sl')
    self.assertEqual(properties.language, 'sl')
    self.assertEqual(properties.category, 'samostalnik')
    # Category, feature names and values are all expected in Slovene.
    self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
    self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
def test_sl_en(self):
    # Slovene msd -> English properties; no lemma is passed for this noun.
    properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'en')
    self.assertEqual(properties.language, 'en')
    self.assertEqual(properties.category, 'noun')
    # Same expected result as for the English msd 'Ncfpd' in test_en_en.
    self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
    self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
def test_sl_sl(self):
    # Slovene msd -> Slovene properties; no lemma is passed for this noun.
    properties = self.converter.msd_to_properties(Msd('Sozmd', 'sl'), 'sl')
    self.assertEqual(properties.language, 'sl')
    self.assertEqual(properties.category, 'samostalnik')
    # Same expected result as for the English msd 'Ncfpd' in test_en_sl.
    self.assertEqual(properties.lexeme_feature_map, {'vrsta':'občno_ime', 'spol':'ženski'})
    self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
def test_exception_feature_level(self):
    # Testing level exception: ('pronoun', 8, 'ti').
    # The lemma goes last (after the target language), so the converter can
    # look up lemma-specific feature-level exceptions for this pronoun.
    properties = self.converter.msd_to_properties(Msd('Pp2-sd--y', 'en'), 'sl', 'ti')
    self.assertEqual(properties.language, 'sl')
    self.assertEqual(properties.category, 'zaimek')
    # For lemma 'ti', 'naslonskost' is expected at form level rather than lexeme level.
    self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga'})
    self.assertEqual(properties.form_feature_map, {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})
def test_normal_feature_level(self):
    # Invalid msd, but useful for testing contrast with the level-exception
    # case: lemma 'vi' is expected to keep the default feature levels.
    properties = self.converter.msd_to_properties(Msd('Pp2-pd--y', 'en'), 'sl', 'vi')
    self.assertEqual(properties.language, 'sl')
    self.assertEqual(properties.category, 'zaimek')
    # Here 'naslonskost' is expected to stay at lexeme level.
    self.assertEqual(properties.lexeme_feature_map, {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'})
    self.assertEqual(properties.form_feature_map, {'število':'množina', 'sklon':'dajalnik'})
def test_featureless(self):
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'in', 'en')
properties = self.converter.msd_to_properties(Msd('V', 'sl'), 'en')
self.assertEqual(properties.language, 'en')
self.assertEqual(properties.category, 'conjunction')
self.assertEqual(properties.lexeme_feature_map, {})

@ -1,5 +1,4 @@
import os.path
import lxml.etree as lxml
import unittest
from conversion_utils.jos_msds_and_properties import SpecificationsParser, Converter, Properties

Loading…
Cancel
Save