Added rudimentary module documentation and made a couple of basic fixes

This commit is contained in:
Cyprian Laskowski 2023-10-26 17:13:54 +02:00
parent f28b5a3a01
commit 03ce9f8ac7
7 changed files with 43 additions and 12 deletions

View File

@ -1,7 +1,8 @@
## Conversion utilities ## CJVT conversion utilities
This repository is currently intended for common conversions needed by CJVT developers. For the This repository is intended for common conversions needed by CJVT developers. It can of course also
moment, this is limited to JOS msds and properties. be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
### JOS msds and properties ### JOS msds and properties

View File

@ -1,9 +1,16 @@
"""Convert a series of CoNLL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import re import re
import sys import sys
from glob import glob
from lxml import etree from lxml import etree
class Sentence: class Sentence:
def __init__(self, _id, no_ud=False, system='jos'): def __init__(self, _id, no_ud=False, system='jos'):
self._id = _id self._id = _id
@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
if __name__ == '__main__': if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.') parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file') parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None, parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud']) parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args() args = parser.parse_args()

View File

@ -1,12 +1,19 @@
"""Convert a TEI file to an XML file in the standard CJVT schema.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import lxml.etree as lxml import lxml.etree as lxml
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
def get_parsed_unit_string(parsed_unit): def get_parsed_unit_string(parsed_unit):
elements = xpath_find(parsed_unit, 'tei:w|tei:pc') elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip() return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
def convert(input_file_name, output_file_name): def convert(input_file_name, output_file_name):
output_root = lxml.Element('dictionary') output_root = lxml.Element('dictionary')
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
arg_parser.add_argument('-infile', type=str, help='Input TEI xml') arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema') arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
arguments = arg_parser.parse_args() arguments = arg_parser.parse_args()
input_file_name = arguments.infile
output_file_name = arguments.outfile
convert(input_file_name, output_file_name) convert(input_file_name, output_file_name)

View File

@ -1,5 +1,7 @@
#!/usr/bin/python3 """Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
# -*- coding: utf-8 -*-
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import codecs import codecs

View File

@ -1,11 +1,16 @@
"""A few convenience TEI/XML constants and functions."""
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0' TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}' TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id' XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
def xpath_find(element,expression): def xpath_find(element,expression):
"""Executes XPath expression, with TEI namespace.""" """Executes XPath expression, with TEI namespace."""
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE}) return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
def get_xml_id(element): def get_xml_id(element):
"""Returns the element's @xml:id attribute.""" """Returns the element's @xml:id attribute."""
return element.get(XML_ID_ATTRIBUTE_NAME) return element.get(XML_ID_ATTRIBUTE_NAME)

View File

@ -1,3 +1,13 @@
"""Parse source TEI specifications and save as a pickle.
You can use this script to create a new pickle file to replace the one stored at
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
are not expected to change, and if they do, the package pickle there should be updated upstream, so
you probably should not have to use this script.
"""
import pickle import pickle
import argparse import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser from conversion_utils.jos_msds_and_properties import SpecificationsParser

View File

@ -1,7 +1,7 @@
from setuptools import setup from setuptools import setup
setup(name='conversion_utils', setup(name='conversion_utils',
version='0.1', version='0.2',
description='CJVT conversion utilities', description='CJVT conversion utilities',
url='https://gitea.cjvt.si/generic/conversion_utils', url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski', author='Cyprian Laskowski',