Compare commits

...

8 Commits

5
.gitignore vendored

@ -1,2 +1,7 @@
*.pyc
venv
data
.idea
build
dist
*.egg-info

@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023 CLARIN.SI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -1,7 +1,8 @@
## Conversion utilities
## CJVT conversion utilities
This repository is currently intended for common conversions needed by CJVT developers. For the
moment, this is limited to JOS msds and properties.
This repository is intended for common conversions needed by CJVT developers. It can of course also
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
### JOS msds and properties

@ -1,23 +1,36 @@
"""Convert a series of CoNNL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import re
import sys
from glob import glob
from lxml import etree
class Sentence:
def __init__(self, _id, no_ud=False, system='jos'):
self._id = _id
self.items = []
self.links = []
self.srl_links = []
self.no_ud = no_ud
self.system = system
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
ner = misc['NER'] if 'NER' in misc else 'O'
self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
def add_link(self, link_ref, link_type):
self.links.append([link_ref, link_type])
def add_srl_link(self, link_ref, link_type):
self.srl_links.append([link_ref, link_type])
def as_xml(self, id_prefix=None):
if id_prefix:
xml_id = id_prefix + '.' + self._id
@ -27,8 +40,24 @@ class Sentence:
set_xml_attr(base, 'id', xml_id)
id_counter = 1
in_seg = False
sentence_base = base
for item in self.items:
token, lemma, upos, upos_other, xpos, no_space_after = item
token, lemma, upos, upos_other, xpos, no_space_after, ner = item
if ner[0] == 'B':
if in_seg:
sentence_base.append(base)
in_seg = True
base = etree.Element('seg')
base.set('type', 'name')
base.set('subtype', f'{ner[2:].lower()}')
elif ner[0] == 'O':
if in_seg:
sentence_base.append(base)
base = sentence_base
in_seg = False
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
to_add = etree.Element('pc')
@ -53,6 +82,11 @@ class Sentence:
base.append(to_add)
if in_seg:
sentence_base.append(base)
base = sentence_base
# depparsing linkGrp
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#'+xml_id)
link_grp.set('targFunc', 'head argument')
@ -67,6 +101,23 @@ class Sentence:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
# srl linkGrp
if self.srl_links:
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#' + xml_id)
link_grp.set('targFunc', 'head argument')
link_grp.set('type', 'SRL')
for link_id, item in enumerate(self.srl_links):
link_ref, link_type = item
link = etree.Element('link')
link.set('ana', 'srl:' + link_type.replace(':', '_'))
if link_ref == u'0':
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
else:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
return base
@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens[5]
depparse_link = tokens[6]
depparse_link_name = tokens[7]
misc = tokens[9]
# Parse the MISC column ("key=value|key=value", or "_" when empty) into a dict.
# partition() splits on the first '=' only, so a value containing '=' is kept whole, and a bare
# key without '=' maps to '' instead of raising IndexError.
misc = {key: value for key, _, value in (el.partition('=') for el in tokens[9].split('|'))} if tokens[9] != '_' else {}
sentence.add_item(
token,
@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
sentence.add_link(
depparse_link,
depparse_link_name)
if 'SRL' in misc:
sentence.add_srl_link(
depparse_link,
misc['SRL'])
return sentence
@ -267,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None,
help='Write output to file instead of stdout.')
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args()

@ -1,7 +1,7 @@
import lxml.etree as lxml
import re
import pickle
import importlib_resources as pkg_resources
from importlib_resources import files
from conversion_utils.utils import xpath_find, get_xml_id
@ -230,7 +230,10 @@ class Msd:
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
class ConverterException(Exception):
class CustomException(Exception):
pass
class MsdException(CustomException):
pass
class Converter:
@ -238,9 +241,10 @@ class Converter:
def __init__(self, xml_file_name=None):
if (xml_file_name is None):
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
if (resource.is_file()):
try:
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file)
except:
exit('Could not parse specifications pickle file installed.')
@ -253,17 +257,46 @@ class Converter:
except:
exit('Could not parse specifications xml file provided.')
def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language).
def is_valid_msd(self, msd):
"""Verify if the Msd code is in the standard JOS set."""
return msd.code in self.specifications.codes_map[msd.language]
The level (lexeme vs form) of certain reflexive msd features
depends on the lemma, so set the lemma if you need accurate
level information.
"""
def check_valid_msd(self, msd, require_valid_flag):
    """Report an Msd whose code fails is_valid_msd().

    Nothing happens for a valid Msd. For an invalid one, a MsdException is raised when
    require_valid_flag is true; otherwise a '[WARN]' line is printed.
    """
    if self.is_valid_msd(msd):
        return
    message = 'The msd {} is unknown'.format(msd.code)
    if not require_valid_flag:
        print('[WARN] ' + message)
        return
    raise MsdException(message)
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
"""Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language.
If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
JOS set. Otherwise only a warning is given.
If you care about accurate level information (i.e., which properties are lexeme-level and
which are form-level), note that some features depend on the particular lemma. For such
features, if lemma is not provided and warn_level_flag is True, a warning will be given.
if (msd.code not in self.specifications.codes_map[msd.language]):
raise ConverterException('The msd {} is unknown'.format(msd.code))
If a MSD has dashes in place of letters for certain features, they are skipped, so that
these features are not included in the generated Properties object.
Parameters:
msd(Msd): the JOS MSD to convert
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
lemma(str): the lemma of the word form with the MSD
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
warn_level_flag(boolean): whether to warn when the level of a property cannot be determined reliably because no lemma was given
Returns:
Properties: the result of the conversion of the Msd in the language requested
"""
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower()
value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language)
@ -277,8 +310,8 @@ class Converter:
value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.names.get(language)
feature_value = value.names.get(language)
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
.format(category=category_name, position=index))
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@ -289,8 +322,21 @@ class Converter:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language):
"""Convert Properties to msd (possibly in the other language)."""
def properties_to_msd(self, properties, language, require_valid_flag=False):
"""Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language.
If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
the standard JOS set. Otherwise only a warning is given.
Any skipped positions among the Properties are represented as dashes in the MSD.
Parameters:
properties(Properties): the properties to convert
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
"""
category = self.specifications.find_category_by_name(properties.category, properties.language)
category_char = category.codes.get(language).upper()
feature_map = properties.lexeme_feature_map.copy()
@ -308,7 +354,9 @@ class Converter:
msd_code += '-'
i += 1
msd_code += position_map[position]
return Msd(msd_code, language)
msd = Msd(msd_code, language)
self.check_valid_msd(msd, require_valid_flag)
return msd
def translate_msd(self, msd, language):
    """Translate an Msd into the given language by converting it to Properties and back."""
    properties = self.msd_to_properties(msd, language)
    return self.properties_to_msd(properties, language)

@ -1,12 +1,19 @@
"""Convert a TEI file to a XML file of the CJVT standard schema.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import lxml.etree as lxml
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
def get_parsed_unit_string(parsed_unit):
    """Return the surface text of a parsed unit.

    The texts of the tei:w and tei:pc elements found under parsed_unit are concatenated in the
    order returned by the XPath query. A space follows every token except those marked
    join="right"; leading and trailing whitespace is stripped from the result.

    Parameters:
        parsed_unit: the element whose tei:w / tei:pc elements are read

    Returns:
        str: the reconstructed text
    """
    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
    pieces = []
    for element in elements:
        # lxml reports .text as None for an element without text content; treat that as an
        # empty token instead of failing on None + ' '.
        text = element.text or ''
        pieces.append(text if element.get('join') == 'right' else text + ' ')
    return ''.join(pieces).strip()
def convert(input_file_name, output_file_name):
output_root = lxml.Element('dictionary')
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile
output_file_name = arguments.outfile
convert(input_file_name, output_file_name)

@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Msd
from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
class JosMsdToPropertiesTestCase(unittest.TestCase):
@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.assertEqual(properties.category, 'punctuation')
self.assertEqual(properties.lexeme_feature_map, {})
self.assertEqual(properties.form_feature_map, {})
def test_good_msd_with_require_valid(self):
    """The MSD 'Ncfpd' converts to the expected properties when require_valid_flag is set."""
    msd = Msd('Ncfpd', 'en')
    properties = self.converter.msd_to_properties(msd, 'en', require_valid_flag=True)
    self.assertEqual(properties.language, 'en')
    self.assertEqual(properties.category, 'noun')
    expected_lexeme_features = {'type': 'common', 'gender': 'feminine'}
    self.assertEqual(properties.lexeme_feature_map, expected_lexeme_features)
    expected_form_features = {'number': 'plural', 'case': 'dative'}
    self.assertEqual(properties.form_feature_map, expected_form_features)
def test_bad_msd(self):
    """Without require_valid_flag, the MSD 'N---d' is still converted; only 'case' is expected."""
    msd = Msd('N---d', 'en')
    properties = self.converter.msd_to_properties(msd, 'en')
    self.assertEqual(properties.language, 'en')
    self.assertEqual(properties.category, 'noun')
    self.assertEqual(properties.lexeme_feature_map, {})
    expected_form_features = {'case': 'dative'}
    self.assertEqual(properties.form_feature_map, expected_form_features)
def test_bad_msd_with_require_valid(self):
    """The MSD 'N---d' must raise MsdException when require_valid_flag is set."""
    # assertRaises fails with a descriptive message when nothing is raised, which replaces the
    # manual try/except flag bookkeeping.
    with self.assertRaises(MsdException):
        self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)

@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
class JosPropertiesToMsdTestCase(unittest.TestCase):
@ -41,3 +41,21 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
self.assertEqual(msd.language, 'sl')
self.assertEqual(msd.code, 'U')
def test_good_msd_with_require_valid(self):
    """Complete noun properties yield the MSD 'Ncfdn' when require_valid_flag is set."""
    lexeme_features = {'type': 'common', 'gender': 'feminine'}
    form_features = {'number': 'dual', 'case': 'nominative'}
    properties = Properties('noun', lexeme_features, form_features, 'en')
    msd = self.converter.properties_to_msd(properties, 'en', require_valid_flag=True)
    self.assertEqual(msd.language, 'en')
    self.assertEqual(msd.code, 'Ncfdn')
def test_bad_msd(self):
    """Without require_valid_flag, incomplete properties still yield an MSD, with a dash for the gap."""
    properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
    msd = self.converter.properties_to_msd(properties, 'en')
    self.assertEqual(msd.language, 'en')
    self.assertEqual(msd.code, 'Nc-d')
def test_bad_msd_with_require_valid(self):
    """Incomplete noun properties must raise MsdException when require_valid_flag is set."""
    properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
    # assertRaises fails with a descriptive message when nothing is raised, which replaces the
    # manual try/except flag bookkeeping.
    with self.assertRaises(MsdException):
        self.converter.properties_to_msd(properties, 'en', require_valid_flag=True)

@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import codecs
@ -8,6 +10,7 @@ from importlib_resources import files
from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map():
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
dict_file = codecs.open(dict_file_name, 'r')
@ -15,7 +18,8 @@ def get_syn_map():
dict_file.close()
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
def translate(input_file_name, output_file_name):
def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map()
@ -29,8 +33,10 @@ def translate(input_file_name, output_file_name):
if (len(columns) != 10):
output_file.write(line)
else:
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
columns[7] = syn_map[columns[7]]
if (scope in {'msd', 'both'}):
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
if (scope in {'dep', 'both'}):
columns[7] = syn_map[columns[7]]
output_file.write('\t'.join(columns) + '\n')
input_file.close()
@ -41,6 +47,7 @@ if (__name__ == '__main__'):
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
arg_parser.add_argument('-infile', type=str, help='Input conllu')
# argparse restricts an argument's values with the `choices` keyword; `options` is not an accepted
# keyword and makes add_argument() raise TypeError before any parsing happens.
arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both', help='What to translate: msds, dependency labels, or both')
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile

@ -1,11 +1,16 @@
"""A few convenience TEI/XML constants and functions."""
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
def xpath_find(element, expression):
    """Evaluate an XPath expression on element, with the 'tei' prefix bound to the TEI namespace."""
    tei_namespaces = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_namespaces)
def get_xml_id(element):
    """Look up the element's xml:id attribute and return whatever element.get() yields."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id

@ -1,3 +1,13 @@
"""Parse source TEI specifications and save as a pickle.
You can use this script to create a new pickle file to replace the one stored at
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
are not expected to change, and if they do, the package pickle there should be updated upstream, so
you probably should not have to use this script.
"""
import pickle
import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser

@ -1,12 +1,20 @@
from setuptools import setup
import os
setup(name='conversion_utils',
version='0.1',
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(name='cjvt_conversion_utils',
version='0.3',
description='CJVT conversion utilities',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski',
author_email='cyp@cjvt.si',
packages=['conversion_utils', 'conversion_utils.resources'],
install_requires=['importlib_resources'],
author='CJVT',
author_email='pypi@cjvt.si',
license='MIT',
packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
install_requires=['lxml', 'importlib_resources'],
include_package_data=True,
zip_safe=True)

Loading…
Cancel
Save