Compare commits

..

8 Commits

16 changed files with 227 additions and 139 deletions

3
.gitignore vendored
View File

@ -2,3 +2,6 @@
venv
data
.idea
build
dist
*.egg-info

22
LICENSE.txt Normal file
View File

@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023 CLARIN.SI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,7 +1,8 @@
## Conversion utilities
## CJVT conversion utilities
This repository is currently intended for common conversions needed by CJVT developers. For the
moment, this is limited to JOS msds and properties.
This repository is intended for common conversions needed by CJVT developers. It can of course also
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
### JOS msds and properties

View File

@ -1,16 +1,15 @@
"""Convert a series of CoNNL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import re
import sys
from glob import glob
from lxml import etree
from conversion_utils.jos_msds_and_properties import Converter, Msd
converter = Converter()
def translate_msd(msd_text, lang, lemma=None):
    """Translate an English MSD code into the corresponding Slovene code.

    The text is wrapped in an English Msd, converted to Slovene properties
    (passing ``lemma`` along) and converted back to a Slovene Msd, whose
    code is returned. The ``lang`` argument is accepted but not consulted
    by this function.
    """
    english_msd = Msd(msd_text, 'en')
    slovene_properties = converter.msd_to_properties(english_msd, 'sl', lemma)
    slovene_msd = converter.properties_to_msd(slovene_properties, 'sl')
    return slovene_msd.code
class Sentence:
def __init__(self, _id, no_ud=False, system='jos'):
@ -66,7 +65,6 @@ class Sentence:
to_add = etree.Element('w')
to_add.set('lemma', lemma)
xpos = translate_msd(xpos,'sl',lemma)
to_add.set('ana', 'mte:' + xpos)
if not self.no_ud:
if upos_other != '_':
@ -146,28 +144,35 @@ class Paragraph:
class TeiDocument:
def __init__(self, _id, paragraphs=list(), metadata=None):
def __init__(self, _id, paragraphs=list()):
self._id = _id
self.metadata = metadata
self.paragraphs = paragraphs
def as_xml(self):
root = etree.Element('div')
root = etree.Element('TEI')
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
set_xml_attr(root, 'lang', 'sl')
xml_id = self._id
if xml_id is not None:
set_xml_attr(root, 'id', xml_id)
bibl = etree.Element('bibl')
bibl.set('corresp', f'#{xml_id}')
bibl.set('n', f'#{xml_id}')
for k, v in self.metadata.items():
bibl_el = etree.Element(k)
bibl_el.text = v
bibl.append(bibl_el)
root.append(bibl)
tei_header = etree.SubElement(root, 'teiHeader')
text = etree.SubElement(root, 'text')
body = etree.SubElement(text, 'body')
for para in self.paragraphs:
root.append(para.as_xml(id_prefix=xml_id))
body.append(para.as_xml(id_prefix=xml_id))
encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
namespace = etree.SubElement(tags_decl, 'namespace')
namespace.set('name', 'http://www.tei-c.org/ns/1.0')
for tag in ['p', 's', 'pc', 'w']:
count = int(text.xpath('count(.//{})'.format(tag)))
tag_usage = etree.SubElement(namespace, 'tagUsage')
tag_usage.set('gi', tag)
tag_usage.set('occurs', str(count))
return root
def add_paragraph(self, paragraph):
@ -175,13 +180,10 @@ class TeiDocument:
def build_tei_etrees(documents):
root = etree.Element('body')
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
set_xml_attr(root, 'base', 'korpus.xml')
set_xml_attr(root, 'lang', 'sl')
elements = []
for document in documents:
root.append(document.as_xml())
return root
elements.append(document.as_xml())
return elements
def set_xml_attr(node, attribute, value):
@ -204,12 +206,11 @@ def is_metaline(line):
return False
def construct_tei_documents(conllu_lines, metadata):
def construct_tei_documents(conllu_lines):
documents = []
doc_id = None
doc_id_num = 0
document_paragraphs = []
document_paragraphs = []
para_id = None
para_buffer = []
@ -221,12 +222,9 @@ def construct_tei_documents(conllu_lines, metadata):
if len(para_buffer) > 0:
document_paragraphs.append(construct_paragraph(para_id, para_buffer))
if len(document_paragraphs) > 0:
print(metadata)
print(doc_id_num)
documents.append(
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
TeiDocument(doc_id, document_paragraphs))
document_paragraphs = []
doc_id_num += 1
doc_id = val
elif key == 'newpar id':
if len(para_buffer) > 0:
@ -244,8 +242,7 @@ def construct_tei_documents(conllu_lines, metadata):
if len(document_paragraphs) > 0:
documents.append(
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
doc_id_num += 1
TeiDocument(doc_id, document_paragraphs))
return documents
@ -288,7 +285,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens[5]
depparse_link = tokens[6]
depparse_link_name = tokens[7]
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')}
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
sentence.add_item(
token,
@ -309,14 +306,14 @@ def construct_sentence(sent_id, lines):
return sentence
def construct_tei_etrees(conllu_lines, metadata):
documents = construct_tei_documents(conllu_lines, metadata)
def construct_tei_etrees(conllu_lines):
documents = construct_tei_documents(conllu_lines)
return build_tei_etrees(documents)
def convert_file(input_file_name, output_file_name, metadata):
def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r')
root = construct_tei_etrees(input_file, metadata)
root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
input_file.close()
@ -326,13 +323,10 @@ def convert_file(input_file_name, output_file_name, metadata):
if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None,
help='Write output to file instead of stdout.')
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args()

View File

@ -1,7 +1,7 @@
import lxml.etree as lxml
import re
import pickle
import importlib_resources as pkg_resources
from importlib_resources import files
from conversion_utils.utils import xpath_find, get_xml_id
@ -230,7 +230,10 @@ class Msd:
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
class ConverterException(Exception):
class CustomException(Exception):
pass
class MsdException(CustomException):
pass
class Converter:
@ -238,9 +241,10 @@ class Converter:
def __init__(self, xml_file_name=None):
if (xml_file_name is None):
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
if (resource.is_file()):
try:
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file)
except:
exit('Could not parse specifications pickle file installed.')
@ -253,17 +257,46 @@ class Converter:
except:
exit('Could not parse specifications xml file provided.')
def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language).
def is_valid_msd(self, msd):
    """Tell whether the code of ``msd`` is listed in the specifications for its language."""
    code = msd.code
    known_codes = self.specifications.codes_map[msd.language]
    return code in known_codes
def check_valid_msd(self, msd, require_valid_flag):
    """Report an Msd whose code is not valid.

    Nothing happens for a valid Msd. For an invalid one, a MsdException is
    raised when ``require_valid_flag`` is truthy; otherwise a '[WARN]' line
    is printed.
    """
    if self.is_valid_msd(msd):
        return
    message = 'The msd {} is unknown'.format(msd.code)
    if require_valid_flag:
        raise MsdException(message)
    print('[WARN] ' + message)
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
"""Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language.
If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
JOS set. Otherwise only a warning is given.
If you care about accurate level information (i.e., which properties are lexeme-level and
which are form-level), note that some features depend on the particular lemma. For such
features, if lemma is not provided and warn_level_flag is True, a warning will be given.
If a MSD has dashes in place of letters for certain features, they are skipped, so that
these features are not included in the generated Properties object.
Parameters:
msd(Msd): the JOS MSD to convert
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
lemma(str): the lemma of the word form with the MSD
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
warn_level_flag(boolean): whether to warn when the level of a property cannot be determined because no lemma was given
Returns:
Properties: the result of the conversion of the Msd in the language requested
The level (lexeme vs form) of certain reflexive msd features
depends on the lemma, so set the lemma if you need accurate
level information.
"""
# if (msd.code not in self.specifications.codes_map[msd.language]):
# raise ConverterException('The msd {} is unknown'.format(msd.code))
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower()
value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language)
@ -277,8 +310,8 @@ class Converter:
value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.names.get(language)
feature_value = value.names.get(language)
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
.format(category=category_name, position=index))
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@ -289,8 +322,21 @@ class Converter:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language):
"""Convert Properties to msd (possibly in the other language)."""
def properties_to_msd(self, properties, language, require_valid_flag=False):
"""Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language.
If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
the standard JOS set. Otherwise only a warning is given.
Any skipped positions among the Properties are represented as dashes in the MSD.
Parameters:
properties(Properties): the properties to convert
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
"""
category = self.specifications.find_category_by_name(properties.category, properties.language)
category_char = category.codes.get(language).upper()
feature_map = properties.lexeme_feature_map.copy()
@ -308,7 +354,9 @@ class Converter:
msd_code += '-'
i += 1
msd_code += position_map[position]
return Msd(msd_code, language)
msd = Msd(msd_code, language)
self.check_valid_msd(msd, require_valid_flag)
return msd
def translate_msd(self, msd, language):
    """Re-express ``msd`` in ``language`` by converting it to Properties and back to a Msd."""
    properties = self.msd_to_properties(msd, language)
    return self.properties_to_msd(properties, language)

View File

@ -1,12 +1,19 @@
"""Convert a TEI file to a XML file of the CJVT standard schema.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import lxml.etree as lxml
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
def get_parsed_unit_string(parsed_unit):
    """Return the text of the unit's tei:w and tei:pc children as one string.

    Each token is followed by a space unless its ``join`` attribute equals
    ``right``; leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    for token in xpath_find(parsed_unit, 'tei:w|tei:pc'):
        glued_to_next = token.get('join') == 'right'
        pieces.append(token.text if glued_to_next else token.text + ' ')
    return ''.join(pieces).strip()
def convert(input_file_name, output_file_name):
output_root = lxml.Element('dictionary')
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile
output_file_name = arguments.outfile
convert(input_file_name, output_file_name)

View File

@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Msd
from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
class JosMsdToPropertiesTestCase(unittest.TestCase):
@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.assertEqual(properties.category, 'punctuation')
self.assertEqual(properties.lexeme_feature_map, {})
self.assertEqual(properties.form_feature_map, {})
def test_good_msd_with_require_valid(self):
    """The MSD 'Ncfpd' converts to the expected properties when validity is required."""
    msd = Msd('Ncfpd', 'en')
    result = self.converter.msd_to_properties(msd, 'en', require_valid_flag=True)
    expected_lexeme_features = {'type':'common', 'gender':'feminine'}
    expected_form_features = {'number':'plural', 'case':'dative'}
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    self.assertEqual(result.lexeme_feature_map, expected_lexeme_features)
    self.assertEqual(result.form_feature_map, expected_form_features)
def test_bad_msd(self):
    """Without require_valid_flag, 'N---d' is still converted; only the case feature is expected."""
    msd = Msd('N---d', 'en')
    result = self.converter.msd_to_properties(msd, 'en')
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    self.assertEqual(result.lexeme_feature_map, {})
    self.assertEqual(result.form_feature_map, {'case':'dative'})
def test_bad_msd_with_require_valid(self):
    """Converting the MSD 'N---d' with require_valid_flag=True must raise MsdException."""
    # assertRaises fails the test with a descriptive message when nothing is
    # raised, replacing the manual try/except flag bookkeeping.
    with self.assertRaises(MsdException):
        self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)

View File

@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
class JosPropertiesToMsdTestCase(unittest.TestCase):
@ -41,3 +41,21 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
self.assertEqual(msd.language, 'sl')
self.assertEqual(msd.code, 'U')
def test_good_msd_with_require_valid(self):
    """Complete noun properties yield the MSD 'Ncfdn' when validity is required."""
    lexeme_features = {'type':'common', 'gender':'feminine'}
    form_features = {'number':'dual', 'case':'nominative'}
    properties = Properties('noun', lexeme_features, form_features, 'en')
    result = self.converter.properties_to_msd(properties, 'en', require_valid_flag=True)
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.code, 'Ncfdn')
def test_bad_msd(self):
    """Without require_valid_flag, incomplete properties still yield a code ('Nc-d')."""
    properties = Properties('noun', {'type':'common'}, {'number':'dual'}, 'en')
    result = self.converter.properties_to_msd(properties, 'en')
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.code, 'Nc-d')
def test_bad_msd_with_require_valid(self):
    """Incomplete noun properties with require_valid_flag=True must raise MsdException."""
    properties = Properties('noun', {'type':'common'}, {'number':'dual'}, 'en')
    # assertRaises fails the test with a descriptive message when nothing is
    # raised, replacing the manual try/except flag bookkeeping.
    with self.assertRaises(MsdException):
        self.converter.properties_to_msd(properties, 'en', require_valid_flag=True)

View File

@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import codecs
@ -8,6 +10,7 @@ from importlib_resources import files
from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map():
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
dict_file = codecs.open(dict_file_name, 'r')
@ -15,7 +18,8 @@ def get_syn_map():
dict_file.close()
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
def translate(input_file_name, output_file_name):
def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map()
@ -29,8 +33,10 @@ def translate(input_file_name, output_file_name):
if (len(columns) != 10):
output_file.write(line)
else:
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
columns[7] = syn_map[columns[7]]
if (scope in {'msd', 'both'}):
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
if (scope in {'dep', 'both'}):
columns[7] = syn_map[columns[7]]
output_file.write('\t'.join(columns) + '\n')
input_file.close()
@ -41,6 +47,7 @@ if (__name__ == '__main__'):
# Command-line interface: input file, translation scope and output file.
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
arg_parser.add_argument('-infile', type=str, help='Input conllu')
# argparse restricts the accepted values through `choices`; `options` is not a
# valid add_argument keyword and makes the script fail with a TypeError.
arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both',
                        help='What to translate: msds, dependency labels, or both')
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile

View File

@ -1,11 +1,16 @@
"""A few convenience TEI/XML constants and functions."""
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'


def xpath_find(element, expression):
    """Evaluate ``expression`` on ``element`` with the ``tei`` prefix bound to the TEI namespace."""
    tei_namespaces = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_namespaces)


def get_xml_id(element):
    """Return the value of the element's ``xml:id`` attribute, as given by ``element.get``."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id

59
run.py
View File

@ -1,59 +0,0 @@
import os
from conversion_utils.conllu_to_tei import convert_file
import csv
# dir_path = 'data/conllu'
# out_dir_path = 'data/tei'
# for filename in os.listdir(dir_path):
# in_name = os.path.join(dir_path, filename)
# out_filename = filename.split('.')[:-1]
# out_filename = '.'.join(out_filename) + '.xml'
# out_name = os.path.join(out_dir_path, out_filename)
# convert_file(in_name, out_name)
# Load the metadata table from CSV. Judging by the indexing below, each CSV row
# holds one metadata field and each column after the first describes one document.
metadata_list = []
with open('data/metadata.csv', newline='') as csvfile:
for line in csv.reader(csvfile):
metadata_list.append(line)
# One dict per column of the first row, skipping the first column.
metadata = [{} for i in range(len(metadata_list[0]) - 1)]
for i in range(1, len(metadata_list[0])):
metadata[i - 1]['title'] = metadata_list[0][i]
metadata[i - 1]['subtitle'] = metadata_list[1][i]
metadata[i - 1]['authors'] = metadata_list[2][i]
metadata[i - 1]['first_edition'] = metadata_list[3][i]
metadata[i - 1]['edition_in_corpus'] = metadata_list[4][i]
metadata[i - 1]['layer_according_to_SEJO'] = metadata_list[5][i]
metadata[i - 1]['audience'] = metadata_list[6][i]
metadata[i - 1]['hours_of_classes'] = metadata_list[7][i]
metadata[i - 1]['publisher'] = metadata_list[8][i]
metadata[i - 1]['file_name'] = metadata_list[9][i]
# Input directory, final TEI output path, and the intermediate CoNLL-U path.
# NOTE: despite its name, out_dir is a file path; it is opened for writing below.
dir_path = 'data/conllu'
out_path = 'data/tei/tei.xml'
out_dir = 'data/conllu.conllu'
# out_dir = dir_path + '/conllu_small.conllu'
# NOTE(review): this hard-coded ordering is replaced by an empty list two
# statements below, so it has no effect.
metadata_indices = [6, 16, 13, 7, 2, 1, 3, 14, 15, 0, 8, 4, 11, 9, 12, 5, 10]
out_file = open(out_dir, 'w')
metadata_indices = []
# Presumably (indentation was lost in this view; confirm) each input file is
# prefixed with a '# newdoc id = docN' line and appended to out_file inside this loop.
for fn_i, filename in enumerate(os.listdir(dir_path)):
in_name = os.path.join(dir_path, filename)
# File name without its last extension; compared with the 'file_name' metadata field.
out_filename = filename.split('.')[:-1]
out_filename = '.'.join(out_filename)
for m_i, el in enumerate(metadata):
if el['file_name'] == out_filename:
metadata_indices.append(m_i)
# NOTE(review): out_name is computed but never used in the visible code.
out_filename = out_filename + '.xml'
out_name = os.path.join(out_dir, out_filename)
in_file = open(in_name, 'r')
data = f'# newdoc id = doc{str(fn_i+1)}\n'
data += in_file.read()
in_file.close()
out_file.write(data)
out_file.close()
# Metadata dicts reordered by the indices collected above.
shuffled_metadata = [metadata[el] for el in metadata_indices]
# Convert the intermediate CoNLL-U file to TEI, passing the reordered metadata along.
convert_file(out_dir, out_path, shuffled_metadata)

View File

@ -1,3 +1,13 @@
"""Parse source TEI specifications and save as a pickle.
You can use this script to create a new pickle file to replace the one stored at
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
are not expected to change, and if they do, the package pickle there should be updated upstream, so
you probably should not have to use this script.
"""
import pickle
import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser

View File

@ -1,12 +1,20 @@
from setuptools import setup
import os
setup(name='conversion_utils',
version='0.1',
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(name='cjvt_conversion_utils',
version='0.3',
description='CJVT conversion utilities',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski',
author_email='cyp@cjvt.si',
packages=['conversion_utils', 'conversion_utils.resources'],
install_requires=['importlib_resources'],
author='CJVT',
author_email='pypi@cjvt.si',
license='MIT',
packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
install_requires=['lxml', 'importlib_resources'],
include_package_data=True,
zip_safe=True)