From 03ce9f8ac719c1dc31ba9f22b3f3e28cb8e264c2 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Thu, 26 Oct 2023 17:13:54 +0200 Subject: [PATCH] Added rudimentary module documentation and made a couple of basic fixes --- README.md | 7 ++++--- conversion_utils/conllu_to_tei.py | 14 +++++++++----- conversion_utils/tei_to_dictionary.py | 11 ++++++++++- conversion_utils/translate_conllu_jos.py | 6 ++++-- conversion_utils/utils.py | 5 +++++ scripts/install_jos_specifications.py | 10 ++++++++++ setup.py | 2 +- 7 files changed, 43 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e3ceebd..6cf72b9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -## Conversion utilities +## CJVT conversion utilities -This repository is currently intended for common conversions needed by CJVT developers. For the -moment, this is limited to JOS msds and properties. +This repository is intended for common conversions needed by CJVT developers. It can of course also +be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`) +were written with specific tasks in mind, and may not generalise as expected. Use at your own risk. ### JOS msds and properties diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py index db6c911..3542280 100644 --- a/conversion_utils/conllu_to_tei.py +++ b/conversion_utils/conllu_to_tei.py @@ -1,9 +1,16 @@ +"""Convert a series of CoNLL-U files to a TEI file. + +This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk. 
+""" + + import argparse import re import sys - +from glob import glob from lxml import etree + class Sentence: def __init__(self, _id, no_ud=False, system='jos'): self._id = _id @@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name): if __name__ == '__main__': - import argparse - from glob import glob parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.') parser.add_argument('files', nargs='+', help='CoNNL-U file') - parser.add_argument('-o', '--out-file', dest='out', default=None, - help='Write output to file instead of stdout.') + parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.') parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud']) args = parser.parse_args() diff --git a/conversion_utils/tei_to_dictionary.py b/conversion_utils/tei_to_dictionary.py index 1b58447..75b5c8c 100644 --- a/conversion_utils/tei_to_dictionary.py +++ b/conversion_utils/tei_to_dictionary.py @@ -1,12 +1,19 @@ +"""Convert a TEI file to an XML file of the CJVT standard schema. + +This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk. 
+""" + import argparse import lxml.etree as lxml -from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER +from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER + def get_parsed_unit_string(parsed_unit): elements = xpath_find(parsed_unit, 'tei:w|tei:pc') return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip() + def convert(input_file_name, output_file_name): output_root = lxml.Element('dictionary') @@ -55,4 +62,6 @@ if (__name__ == '__main__'): arg_parser.add_argument('-infile', type=str, help='Input TEI xml') arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema') arguments = arg_parser.parse_args() + input_file_name = arguments.infile + output_file_name = arguments.outfile convert(input_file_name, output_file_name) diff --git a/conversion_utils/translate_conllu_jos.py b/conversion_utils/translate_conllu_jos.py index dee614b..b92ac0c 100644 --- a/conversion_utils/translate_conllu_jos.py +++ b/conversion_utils/translate_conllu_jos.py @@ -1,5 +1,7 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- +"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags. + +This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk. 
+""" import argparse import codecs diff --git a/conversion_utils/utils.py b/conversion_utils/utils.py index dfd750d..04906c5 100644 --- a/conversion_utils/utils.py +++ b/conversion_utils/utils.py @@ -1,11 +1,16 @@ +"""A few convenience TEI/XML constants and functions.""" + + TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0' TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}' XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id' + def xpath_find(element,expression): """Executes XPath expression, with TEI namespace.""" return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE}) + def get_xml_id(element): """Returns the element's @xml:id attribute.""" return element.get(XML_ID_ATTRIBUTE_NAME) diff --git a/scripts/install_jos_specifications.py b/scripts/install_jos_specifications.py index 32fa10c..122db9a 100644 --- a/scripts/install_jos_specifications.py +++ b/scripts/install_jos_specifications.py @@ -1,3 +1,13 @@ +"""Parse source TEI specifications and save as a pickle. + +You can use this script to create a new pickle file to replace the one stored at +../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version +of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications +are not expected to change, and if they do, the package pickle there should be updated upstream, so +you probably should not have to use this script. +""" + + import pickle import argparse from conversion_utils.jos_msds_and_properties import SpecificationsParser diff --git a/setup.py b/setup.py index d94c296..f8f0a24 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='conversion_utils', - version='0.1', + version='0.2', description='CJVT conversion utilities', url='https://gitea.cjvt.si/generic/conversion_utils', author='Cyprian Laskowski',