Implemented specifications parse and msd->properties conversion
This commit is contained in:
parent
0669f68a8d
commit
458b49e7ac
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
||||||
*.pyc
|
*.pyc
|
||||||
|
venv
|
||||||
|
|
192
conversion_utils/jos_msds_and_properties.py
Normal file
192
conversion_utils/jos_msds_and_properties.py
Normal file
|
@ -0,0 +1,192 @@
|
||||||
|
import lxml.etree as lxml
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from conversion_utils.utils import xpath_find, get_xml_id
|
||||||
|
|
||||||
|
# Position of each language inside the (English, Slovene) pairs used below.
LANGUAGE_INDEX_MAP = {'en': 0, 'sl': 1}

# English category name -> feature positions that are lexeme-level by default.
# Positions not listed for a category are treated as form-level.
LEXEME_FEATURE_MAP = {
    'noun': {1, 2},
    'verb': {1, 2},
    'adjective': {1},
    'adverb': {1},
    'pronoun': {1, 2, 6, 7, 8},
    'numeral': {1, 2},
    'preposition': {1},
    'conjunction': {1},
    'particle': set(),
    'interjection': set(),
    'abbreviation': set(),
    'residual': {1},
    'punctuation': set(),
}

# Lemmas that are listed as exceptions at both position 2 and position 5.
_LEMMAS_AT_POSITIONS_2_AND_5 = (
    'čezme', 'medme', 'nadme', 'name', 'obme', 'podme',
    'pome', 'predme', 'skozme', 'vame', 'zame',
)

# (English category name, feature position, lemma) triples for which the
# default level given by LEXEME_FEATURE_MAP is inverted.
LEVEL_EXCEPTIONS = {
    ('pronoun', position, lemma)
    for position in (2, 5)
    for lemma in _LEMMAS_AT_POSITIONS_2_AND_5
}
LEVEL_EXCEPTIONS.update({
    ('pronoun', 3, 'tadva'),
    ('pronoun', 4, 'tadva'),
    ('pronoun', 7, 'njegov'),
    ('pronoun', 8, 'jaz'),
    ('pronoun', 8, 'on'),
    ('pronoun', 8, 'se'),
    ('pronoun', 8, 'ti'),
})
|
||||||
|
|
||||||
|
|
||||||
|
class Specifications:
    """Ordered collection of the categories that make up a specification."""

    def __init__(self, *categories):
        """Start with the given categories (possibly none)."""
        self.categories = list(categories)

    def add_category(self, category):
        """Append a category to the collection."""
        self.categories.append(category)

    def find_category(self, char, language_index):
        """Return the first category whose code in the given language equals char.

        Compares char against ``category.char_pair[language_index]`` and
        returns None when no category matches.
        """
        return next((category for category in self.categories if category.char_pair[language_index] == char), None)

    def __str__(self):
        # Formatting the list directly would render each element with repr(),
        # i.e. the default "<... object at 0x...>", so stringify explicitly.
        rendered = ', '.join(str(category) for category in self.categories)
        return 'categories:[{categories}]'.format(categories=rendered)
|
||||||
|
|
||||||
|
|
||||||
|
class Category:
    """A category together with its features.

    string_pair holds the category names and char_pair the category codes;
    both are indexed by language (see find_category callers, which index
    char_pair with a language index).
    """

    def __init__(self, string_pair, char_pair, *features):
        """Store the name pair, the code pair and any initial features."""
        self.string_pair = string_pair
        self.char_pair = char_pair
        self.features = list(features)

    def add_feature(self, feature):
        """Append a feature to this category."""
        self.features.append(feature)

    def find_feature(self, position):
        """Return the first feature whose ``position`` equals position, or None."""
        return next((feature for feature in self.features if feature.position == position), None)

    def __str__(self):
        # Formatting the list directly would render each feature with repr(),
        # i.e. the default "<... object at 0x...>", so stringify explicitly.
        rendered = ', '.join(str(feature) for feature in self.features)
        return 'strings:{strings}, chars:{chars}, features:[{features}]'.\
            format(strings=self.string_pair, chars=self.char_pair, features=rendered)
|
||||||
|
|
||||||
|
|
||||||
|
class Feature:
    """A feature of a category, identified by its position, with its values."""

    def __init__(self, string_pair, position, lexeme_level_flag, *values):
        """Store the name pair, position, default level flag and initial values.

        lexeme_level_flag is truthy when the feature is lexeme-level by
        default and falsy when it is form-level.
        """
        self.string_pair = string_pair
        self.position = position
        self.lexeme_level_flag = lexeme_level_flag
        self.values = list(values)

    def add_value(self, value):
        """Append a value to this feature."""
        self.values.append(value)

    def find_value(self, char, language_index):
        """Return the first value whose code in the given language equals char.

        Compares char against ``value.char_pair[language_index]`` and returns
        None when no value matches.
        """
        return next((value for value in self.values if value.char_pair[language_index] == char), None)

    def __str__(self):
        # Formatting the list directly would render each value with repr(),
        # i.e. the default "<... object at 0x...>", so stringify explicitly.
        rendered = ', '.join(str(value) for value in self.values)
        # The two levels are "lexeme" and "form"; the label used to read
        # "level", producing the uninformative "level:level".
        level = 'lexeme' if self.lexeme_level_flag else 'form'
        return 'strings:{strings}, position:{position}, level:{level}, values:[{values}]'.\
            format(strings=self.string_pair, position=self.position, level=level, values=rendered)
|
||||||
|
|
||||||
|
|
||||||
|
class Value:
    """A single value of a feature: a pair of names and a pair of codes."""

    def __init__(self, string_pair, char_pair):
        """Store the name pair and the code pair, both indexed by language."""
        self.char_pair = char_pair
        self.string_pair = string_pair

    def __str__(self):
        # The attribute is string_pair; the former "strings_pair" did not
        # exist and made every str(value) raise AttributeError.
        return 'chars:{chars}, strings:{strings}'.\
            format(chars=self.char_pair, strings=self.string_pair)
|
||||||
|
|
||||||
|
|
||||||
|
class SpecificationsBuilder:
    """Builds a Specifications object from a TEI specifications XML file."""

    # xml:id pattern of the div elements that each describe one category.
    _CATEGORY_DIV_ID = re.compile(r'^msd\..-sl')

    def build(self, file_name):
        """Parse file_name and return the Specifications it describes.

        Only ``tei:div`` elements found directly under the root whose xml:id
        matches the category pattern are used; each contributes one category.
        """
        root = lxml.parse(file_name).getroot()
        specifications = Specifications()
        for div_element in xpath_find(root, 'tei:div'):
            xml_id = get_xml_id(div_element)
            # A div without @xml:id yields None, which re would reject with
            # a TypeError; such a div cannot be a category div, so skip it.
            if xml_id is None or not self._CATEGORY_DIV_ID.match(xml_id):
                continue
            specifications.add_category(self._build_category(div_element))
        return specifications

    def _build_category(self, div_element):
        """Create the Category, with all its features, described by one div."""
        category_element = xpath_find(div_element, 'tei:table/tei:row[@role="type"]')[0]
        category_string_pair = self.get_cell_pair(category_element, 'value')
        category_char_pair = self.get_cell_pair(category_element, 'code')
        category = Category(category_string_pair, category_char_pair)
        # The English category name selects the default lexeme-level positions.
        lexeme_positions = LEXEME_FEATURE_MAP[category_string_pair[0]]
        for feature_element in xpath_find(div_element, 'tei:table/tei:row[@role="attribute"]'):
            category.add_feature(self._build_feature(feature_element, lexeme_positions))
        return category

    def _build_feature(self, feature_element, lexeme_positions):
        """Create the Feature, with all its values, described by one attribute row."""
        feature_string_pair = self.get_cell_pair(feature_element, 'name')
        feature_position = int(self.get_cell(feature_element, 'position'))
        feature = Feature(feature_string_pair, feature_position, feature_position in lexeme_positions)
        value_elements = xpath_find(feature_element, 'tei:cell[@role="values"]/tei:table/tei:row[@role="value"]')
        for value_element in value_elements:
            # In value rows the "name" cells carry the value names and the
            # "code" cells carry the single-character codes.
            value_string_pair = self.get_cell_pair(value_element, 'name')
            value_char_pair = self.get_cell_pair(value_element, 'code')
            feature.add_value(Value(value_string_pair, value_char_pair))
        return feature

    def get_cell(self, row, role, language=None):
        """Return the lower-cased text of the first cell of row with the given role.

        When language is given, the cell must also carry that xml:lang.
        The text 'adposition' is returned as 'preposition', which is the key
        used in LEXEME_FEATURE_MAP.
        """
        language_condition = ' and @xml:lang="' + language + '"' if language is not None else ''
        expression = 'tei:cell[@role="' + role + '"' + language_condition + ']'
        text = xpath_find(row, expression)[0].text.lower()
        if text == 'adposition':
            text = 'preposition'
        return text

    def get_cell_pair(self, row, role):
        """Return the (English, Slovene) texts of the cells of row with the given role."""
        return (self.get_cell(row, role, 'en'), self.get_cell(row, role, 'sl'))
|
||||||
|
|
||||||
|
|
||||||
|
class Properties:
    """Category and feature names/values of one msd, split by level.

    Features are sorted into lexeme_feature_map or form_feature_map; the
    default level of a feature is inverted when the (English category name,
    feature position, lemma) triple is listed in LEVEL_EXCEPTIONS.
    """

    def __init__(self, lemma, category, feature_value_list, language_index):
        """Fill the maps from (feature, value) pairs, using names in the given language."""
        self.language = next(code for code, position in LANGUAGE_INDEX_MAP.items() if position == language_index)
        self.category = category.string_pair[language_index]
        self.lexeme_feature_map = {}
        self.form_feature_map = {}
        for feature, value in feature_value_list:
            name = feature.string_pair[language_index]
            text = value.string_pair[language_index]
            exception_key = (category.string_pair[0], feature.position, lemma)
            if exception_key in LEVEL_EXCEPTIONS:
                # Listed exceptions flip the feature's default level.
                at_lexeme_level = not feature.lexeme_level_flag
            else:
                at_lexeme_level = feature.lexeme_level_flag
            target_map = self.lexeme_feature_map if at_lexeme_level else self.form_feature_map
            target_map[name] = text

    def __str__(self):
        template = 'language={language}, category={category}, lexeme features={lexeme_features}, form_features={form_features}'
        return template.format(
            language=self.language,
            category=self.category,
            lexeme_features=str(self.lexeme_feature_map),
            form_features=str(self.form_feature_map),
        )
|
||||||
|
|
||||||
|
|
||||||
|
class MsdToProperties:
    """Converts an msd string into a Properties object."""

    def convert(self, specifications, lemma, msd, msd_language, properties_language):
        """Decode msd against specifications and return the resulting Properties.

        The first character of msd (lower-cased) selects the category; each
        following character is the value code of the feature at that
        1-based position, with '-' marking a position that is not set.

        msd_language is the language the msd codes are written in and
        properties_language the language of the returned names; both must be
        keys of LANGUAGE_INDEX_MAP (KeyError otherwise).

        Raises ValueError when msd is empty or contains a category code,
        feature position or value code that the specifications do not define.
        """
        msd_language_index = LANGUAGE_INDEX_MAP[msd_language]
        properties_language_index = LANGUAGE_INDEX_MAP[properties_language]
        if not msd:
            raise ValueError('Empty msd for lemma {lemma!r}'.format(lemma=lemma))
        category_char = msd[0].lower()
        category = specifications.find_category(category_char, msd_language_index)
        if category is None:
            raise ValueError('Unknown category {char!r} in msd {msd!r} ({language})'.
                             format(char=category_char, msd=msd, language=msd_language))
        feature_value_list = []
        for (position, value_char) in enumerate(msd[1:], start=1):
            if value_char == '-':
                continue
            feature = category.find_feature(position)
            if feature is None:
                raise ValueError('No feature at position {position} in msd {msd!r} ({language})'.
                                 format(position=position, msd=msd, language=msd_language))
            value = feature.find_value(value_char, msd_language_index)
            if value is None:
                raise ValueError('Unknown value {char!r} at position {position} in msd {msd!r} ({language})'.
                                 format(char=value_char, position=position, msd=msd, language=msd_language))
            feature_value_list.append((feature, value))
        return Properties(lemma, category, feature_value_list, properties_language_index)
|
10
conversion_utils/utils.py
Normal file
10
conversion_utils/utils.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
# Namespace URI that xpath_find binds to the "tei" prefix.
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
# Attribute key ("{namespace}local-name" form) that get_xml_id looks up.
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||||
|
|
||||||
|
def xpath_find(element, expression):
    """Evaluate an XPath expression on element with the "tei" prefix bound to the TEI namespace."""
    prefix_map = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=prefix_map)
|
||||||
|
|
||||||
|
def get_xml_id(element):
    """Look up the @xml:id attribute of element and return its value."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id
|
37867
resources/msd-sl.spc.xml
Normal file
37867
resources/msd-sl.spc.xml
Normal file
File diff suppressed because it is too large
Load Diff
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
62
tests/test_jos_msd_to_properties.py
Normal file
62
tests/test_jos_msd_to_properties.py
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
import os.path
|
||||||
|
import lxml.etree as lxml
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from conversion_utils.jos_msds_and_properties import SpecificationsBuilder, MsdToProperties
|
||||||
|
|
||||||
|
class JosMsdToPropertiesTestCase(unittest.TestCase):
    """Checks msd -> properties conversion for each pairing of msd and output language."""

    def setUp(self):
        # The specifications are rebuilt from the bundled resource for every test.
        here = os.path.dirname(__file__)
        specifications_file_name = os.path.join(here, '../resources/msd-sl.spc.xml')
        self.specifications = SpecificationsBuilder().build(specifications_file_name)
        self.converter = MsdToProperties()

    def _check(self, lemma, msd, msd_language, properties_language, category, lexeme_features, form_features):
        """Convert one msd and compare language, category and both feature maps."""
        properties = self.converter.convert(self.specifications, lemma, msd, msd_language, properties_language)
        self.assertEqual(properties.language, properties_language)
        self.assertEqual(properties.category, category)
        self.assertEqual(properties.lexeme_feature_map, lexeme_features)
        self.assertEqual(properties.form_feature_map, form_features)

    def test_en_en(self):
        self._check('miza', 'Ncfpd', 'en', 'en', 'noun',
                    {'type':'common', 'gender':'feminine'},
                    {'number':'plural', 'case':'dative'})

    def test_en_sl(self):
        self._check('miza', 'Ncfpd', 'en', 'sl', 'samostalnik',
                    {'vrsta':'občno_ime', 'spol':'ženski'},
                    {'število':'množina', 'sklon':'dajalnik'})

    def test_sl_en(self):
        self._check('miza', 'Sozmd', 'sl', 'en', 'noun',
                    {'type':'common', 'gender':'feminine'},
                    {'number':'plural', 'case':'dative'})

    def test_sl_sl(self):
        self._check('miza', 'Sozmd', 'sl', 'sl', 'samostalnik',
                    {'vrsta':'občno_ime', 'spol':'ženski'},
                    {'število':'množina', 'sklon':'dajalnik'})

    def test_exception_feature_level(self):
        # Exercises the level exception ('pronoun', 8, 'ti').
        self._check('ti', 'Pp2-sd--y', 'en', 'sl', 'zaimek',
                    {'vrsta':'osebni', 'oseba':'druga'},
                    {'število':'ednina', 'sklon':'dajalnik', 'naslonskost':'klitična'})

    def test_normal_feature_level(self):
        # Invalid msd, but useful as a contrast to test_exception_feature_level.
        self._check('vi', 'Pp2-pd--y', 'en', 'sl', 'zaimek',
                    {'vrsta':'osebni', 'oseba':'druga', 'naslonskost':'klitična'},
                    {'število':'množina', 'sklon':'dajalnik'})

    def test_featureless(self):
        self._check('in', 'V', 'sl', 'en', 'conjunction', {}, {})
|
Loading…
Reference in New Issue
Block a user