Replaced JOS XML specifications with preprocessed pickle

This commit is contained in:
Cyprian Laskowski 2021-09-30 00:22:43 +02:00
parent eca02ebdd3
commit a088025026
9 changed files with 38 additions and 37878 deletions

1
MANIFEST.in Normal file
View File

@ -0,0 +1 @@
include conversion_utils/resources/jos_specifications.pickle

View File

@ -1,8 +1,12 @@
import lxml.etree as lxml import lxml.etree as lxml
import re import re
import pickle
import importlib.resources as pkg_resources
from conversion_utils.utils import xpath_find, get_xml_id from conversion_utils.utils import xpath_find, get_xml_id
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
## Positions of lexeme-level features for each category ## Positions of lexeme-level features for each category
LEXEME_FEATURE_MAP = {'noun':{1,2}, LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2}, 'verb':{1,2},
@ -219,8 +223,22 @@ class Msd:
class Converter: class Converter:
"""Converter between Msd and Properties objects.""" """Converter between Msd and Properties objects."""
def __init__(self, xml_file_name=None):
    """Load the JOS specifications this converter works from.

    When ``xml_file_name`` is None, the specifications are unpickled from
    the ``JOS_SPECIFICATIONS_PICKLE_RESOURCE`` resource of the
    ``conversion_utils.resources`` package. Otherwise they are parsed
    from the given XML file with ``SpecificationsParser``.

    Raises:
        SystemExit: with an explanatory message when the pickle resource
            is missing or cannot be loaded, or when the XML file cannot
            be parsed. The underlying error is chained as the cause.
    """
    if xml_file_name is not None:
        parser = SpecificationsParser()
        try:
            self.specifications = parser.parse(xml_file_name)
        except Exception as error:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit are not relabelled as a parse failure.
            raise SystemExit('Could not parse specifications xml file provided.') from error
        return

    if not pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE):
        raise SystemExit('No pickle installed or xml provided.')

    try:
        # NOTE(review): pickle.load executes whatever the resource contains;
        # this is acceptable only because the file is read from the package's
        # own resources rather than from a caller-supplied path.
        with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
            self.specifications = pickle.load(pickle_file)
    except Exception as error:
        raise SystemExit('Could not parse specifications pickle file installed.') from error
def msd_to_properties(self, msd, language, lemma=None): def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language).""" """Convert Msd to Properties (possibly in the other language)."""

View File

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,13 @@
import pickle
import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser
def build_arg_parser():
    """Return the command-line parser for the XML-to-pickle conversion.

    Both ``-xml`` (input XML file) and ``-pickle`` (output pickle file)
    are required.
    """
    arg_parser = argparse.ArgumentParser(description='Parse source TEI specifications file and save as pickle.')
    arg_parser.add_argument('-xml', type=str, help='input XML file', required=True)
    arg_parser.add_argument('-pickle', type=str, help='output pickle file', required=True)
    return arg_parser


def main(argv=None):
    """Parse the specifications XML file and dump the result as a pickle.

    Args:
        argv: argument list to parse; None means use ``sys.argv[1:]``,
            which is argparse's default behaviour.
    """
    arguments = build_arg_parser().parse_args(argv)
    # Parse before opening the output, so a failed parse does not leave an
    # empty or truncated pickle file behind.
    specifications = SpecificationsParser().parse(arguments.xml)
    with open(arguments.pickle, 'wb') as pickle_file:
        pickle.dump(specifications, pickle_file)


# Guarded so that importing this module does not trigger argument parsing
# or any file I/O.
if __name__ == '__main__':
    main()

View File

@ -6,5 +6,6 @@ setup(name='conversion_utils',
url='https://gitea.cjvt.si/generic/conversion_utils', url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski', author='Cyprian Laskowski',
author_email='cyp@cjvt.si', author_email='cyp@cjvt.si',
packages=['conversion_utils'], packages=['conversion_utils', 'conversion_utils.resources'],
include_package_data=True,
zip_safe=True) zip_safe=True)

View File

@ -6,10 +6,7 @@ from conversion_utils.jos_msds_and_properties import SpecificationsParser, Conve
class JosMsdToPropertiesTestCase(unittest.TestCase): class JosMsdToPropertiesTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
specifications_file_name = os.path.join(os.path.dirname(__file__), '../resources/msd-sl.spc.xml') self.converter = Converter()
parser = SpecificationsParser()
specifications = parser.parse(specifications_file_name)
self.converter = Converter(specifications)
def test_en_en(self): def test_en_en(self):
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en') properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en')

View File

@ -6,10 +6,7 @@ from conversion_utils.jos_msds_and_properties import SpecificationsParser, Conve
class JosPropertiesToMsdTestCase(unittest.TestCase): class JosPropertiesToMsdTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
specifications_file_name = os.path.join(os.path.dirname(__file__), '../resources/msd-sl.spc.xml') self.converter = Converter()
parser = SpecificationsParser()
specifications = parser.parse(specifications_file_name)
self.converter = Converter(specifications)
def test_en_en(self): def test_en_en(self):
msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en') msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en')