Added rudimentary module documentation and made a couple of basic fixes
This commit is contained in:
parent
f28b5a3a01
commit
03ce9f8ac7
|
@ -1,7 +1,8 @@
|
||||||
## Conversion utilities
|
## CJVT conversion utilities
|
||||||
|
|
||||||
This repository is currently intended for common conversions needed by CJVT developers. For the
|
This repository is intended for common conversions needed by CJVT developers. It can of course also
|
||||||
moment, this is limited to JOS msds and properties.
|
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
|
||||||
|
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
|
||||||
|
|
||||||
### JOS msds and properties
|
### JOS msds and properties
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,16 @@
|
||||||
|
"""Convert a series of CoNLL-U files to a TEI file.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from glob import glob
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
class Sentence:
|
class Sentence:
|
||||||
def __init__(self, _id, no_ud=False, system='jos'):
|
def __init__(self, _id, no_ud=False, system='jos'):
|
||||||
self._id = _id
|
self._id = _id
|
||||||
|
@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import argparse
|
|
||||||
from glob import glob
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
||||||
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
||||||
parser.add_argument('-o', '--out-file', dest='out', default=None,
|
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
|
||||||
help='Write output to file instead of stdout.')
|
|
||||||
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
|
@ -1,12 +1,19 @@
|
||||||
|
"""Convert a TEI file to an XML file of the CJVT standard schema.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
|
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
|
||||||
|
|
||||||
|
|
||||||
def get_parsed_unit_string(parsed_unit):
|
def get_parsed_unit_string(parsed_unit):
|
||||||
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
||||||
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
||||||
|
|
||||||
|
|
||||||
def convert(input_file_name, output_file_name):
|
def convert(input_file_name, output_file_name):
|
||||||
|
|
||||||
output_root = lxml.Element('dictionary')
|
output_root = lxml.Element('dictionary')
|
||||||
|
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
|
||||||
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
||||||
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
|
input_file_name = arguments.infile
|
||||||
|
output_file_name = arguments.outfile
|
||||||
convert(input_file_name, output_file_name)
|
convert(input_file_name, output_file_name)
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
#!/usr/bin/python3
|
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import codecs
|
import codecs
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
|
"""A few convenience TEI/XML constants and functions."""
|
||||||
|
|
||||||
|
|
||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
||||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
||||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||||
|
|
||||||
|
|
||||||
def xpath_find(element,expression):
|
def xpath_find(element,expression):
|
||||||
"""Executes XPath expression, with TEI namespace."""
|
"""Executes XPath expression, with TEI namespace."""
|
||||||
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
||||||
|
|
||||||
|
|
||||||
def get_xml_id(element):
|
def get_xml_id(element):
|
||||||
"""Returns the element's @xml:id attribute."""
|
"""Returns the element's @xml:id attribute."""
|
||||||
return element.get(XML_ID_ATTRIBUTE_NAME)
|
return element.get(XML_ID_ATTRIBUTE_NAME)
|
||||||
|
|
|
@ -1,3 +1,13 @@
|
||||||
|
"""Parse source TEI specifications and save as a pickle.
|
||||||
|
|
||||||
|
You can use this script to create a new pickle file to replace the one stored at
|
||||||
|
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
|
||||||
|
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
|
||||||
|
are not expected to change, and if they do, the package pickle there should be updated upstream, so
|
||||||
|
you probably should not have to use this script.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
import argparse
|
import argparse
|
||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -1,7 +1,7 @@
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
setup(name='conversion_utils',
|
setup(name='conversion_utils',
|
||||||
version='0.1',
|
version='0.2',
|
||||||
description='CJVT conversion utilities',
|
description='CJVT conversion utilities',
|
||||||
url='https://gitea.cjvt.si/generic/conversion_utils',
|
url='https://gitea.cjvt.si/generic/conversion_utils',
|
||||||
author='Cyprian Laskowski',
|
author='Cyprian Laskowski',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user