Added rudimentary module documentation and made a couple of basic fixes

master
Cyprian Laskowski 6 months ago
parent f28b5a3a01
commit 03ce9f8ac7

@ -1,7 +1,8 @@
## Conversion utilities
## CJVT conversion utilities
This repository is currently intended for common conversions needed by CJVT developers. For the
moment, this is limited to JOS msds and properties.
This repository is intended for common conversions needed by CJVT developers. It can of course also
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
### JOS msds and properties

@ -1,9 +1,16 @@
"""Convert a series of CoNLL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import re
import sys
from glob import glob
from lxml import etree
class Sentence:
def __init__(self, _id, no_ud=False, system='jos'):
self._id = _id
@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None,
help='Write output to file instead of stdout.')
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args()

@ -1,12 +1,19 @@
"""Convert a TEI file to an XML file of the CJVT standard schema.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import lxml.etree as lxml
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
def get_parsed_unit_string(parsed_unit):
    """Return the text of a parsed unit's tokens joined into a single string.

    The tokens are the elements matched by the XPath ``tei:w|tei:pc`` relative
    to ``parsed_unit``. Each token's text is followed by one space, except for
    tokens whose ``join`` attribute equals ``'right'``, which are appended
    without a separator. Surrounding whitespace is stripped from the result.
    """
    pieces = []
    for token in xpath_find(parsed_unit, 'tei:w|tei:pc'):
        if token.get('join') == 'right':
            # The token is glued to whatever follows it, so no space is added.
            pieces.append(token.text)
        else:
            pieces.append(token.text + ' ')
    return ''.join(pieces).strip()
def convert(input_file_name, output_file_name):
output_root = lxml.Element('dictionary')
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile
output_file_name = arguments.outfile
convert(input_file_name, output_file_name)

@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import codecs

@ -1,11 +1,16 @@
"""A few convenience TEI/XML constants and functions."""
# URI of the TEI namespace.
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
# The TEI namespace URI wrapped in curly braces.
TEI_NAMESPACE_QUALIFIER = '{%s}' % TEI_NAMESPACE
# Brace-qualified name of the ``id`` attribute in the XML namespace (``xml:id``).
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
def xpath_find(element, expression):
    """Evaluate an XPath expression on ``element`` with the ``tei`` prefix bound.

    The prefix ``tei`` is mapped to ``TEI_NAMESPACE``, so expressions can refer
    to TEI elements as ``tei:name``. The result of ``element.xpath`` is
    returned unchanged.
    """
    namespace_map = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=namespace_map)
def get_xml_id(element):
    """Return the value of the element's ``xml:id`` attribute.

    The attribute is looked up by its namespace-qualified name via
    ``element.get``, and whatever that call yields is returned as is.
    """
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id

@ -1,3 +1,13 @@
"""Parse source TEI specifications and save as a pickle.
You can use this script to create a new pickle file to replace the one stored at
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
are not expected to change, and if they do, the package pickle there should be updated upstream, so
you probably should not have to use this script.
"""
import pickle
import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser

@ -1,7 +1,7 @@
from setuptools import setup
setup(name='conversion_utils',
version='0.1',
version='0.2',
description='CJVT conversion utilities',
url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski',

Loading…
Cancel
Save