Added rudimentary module documentation and made a couple of basic fixes
This commit is contained in:
parent
f28b5a3a01
commit
03ce9f8ac7
|
@ -1,7 +1,8 @@
|
|||
## Conversion utilities
|
||||
## CJVT conversion utilities
|
||||
|
||||
This repository is currently intended for common conversions needed by CJVT developers. For the
|
||||
moment, this is limited to JOS msds and properties.
|
||||
This repository is intended for common conversions needed by CJVT developers. It can of course also
|
||||
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
|
||||
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
|
||||
|
||||
### JOS msds and properties
|
||||
|
||||
|
|
|
@ -1,9 +1,16 @@
|
|||
"""Convert a series of CoNLL-U files to a TEI file.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
|
||||
from glob import glob
|
||||
from lxml import etree
|
||||
|
||||
|
||||
class Sentence:
|
||||
def __init__(self, _id, no_ud=False, system='jos'):
|
||||
self._id = _id
|
||||
|
@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
from glob import glob
|
||||
|
||||
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
||||
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
||||
parser.add_argument('-o', '--out-file', dest='out', default=None,
|
||||
help='Write output to file instead of stdout.')
|
||||
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
|
||||
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -1,12 +1,19 @@
|
|||
"""Convert a TEI file to an XML file of the CJVT standard schema.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import lxml.etree as lxml
|
||||
|
||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
|
||||
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
|
||||
|
||||
|
||||
def get_parsed_unit_string(parsed_unit):
    """Return the surface text of a parsed unit as a single string.

    Selects the ``tei:w`` and ``tei:pc`` elements directly under
    ``parsed_unit`` and concatenates their text in the order the XPath query
    returns them. A token whose ``join`` attribute equals ``'right'`` is
    glued to the token that follows it; every other token is followed by a
    single space. Leading and trailing whitespace is stripped from the
    result.

    An element without any text (``.text`` is ``None``) contributes an empty
    string instead of raising ``TypeError``.
    """
    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
    pieces = []
    for element in elements:
        # lxml reports missing text as None, which cannot be joined or concatenated.
        text = element.text or ''
        pieces.append(text if element.get('join') == 'right' else text + ' ')
    return ''.join(pieces).strip()
|
||||
|
||||
|
||||
def convert(input_file_name, output_file_name):
|
||||
|
||||
output_root = lxml.Element('dictionary')
|
||||
|
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
|
|||
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
||||
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
||||
arguments = arg_parser.parse_args()
|
||||
input_file_name = arguments.infile
|
||||
output_file_name = arguments.outfile
|
||||
convert(input_file_name, output_file_name)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
"""A few convenience TEI/XML constants and functions."""
|
||||
|
||||
|
||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||
|
||||
|
||||
def xpath_find(element, expression):
    """Evaluate an XPath expression on ``element`` with the TEI namespace bound.

    The prefix ``tei`` is mapped to ``TEI_NAMESPACE``, so expressions can
    address TEI elements as ``tei:name``. Returns whatever the element's
    ``xpath`` method returns for the expression.
    """
    tei_prefix_map = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_prefix_map)
|
||||
|
||||
|
||||
def get_xml_id(element):
    """Return the value of the element's ``@xml:id`` attribute.

    The lookup uses the fully qualified attribute name stored in
    ``XML_ID_ATTRIBUTE_NAME`` and passes through whatever ``element.get``
    returns for it.
    """
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id
|
||||
|
|
|
@ -1,3 +1,13 @@
|
|||
"""Parse source TEI specifications and save as a pickle.
|
||||
|
||||
You can use this script to create a new pickle file to replace the one stored at
|
||||
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
|
||||
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
|
||||
are not expected to change, and if they do, the package pickle there should be updated upstream, so
|
||||
you probably should not have to use this script.
|
||||
"""
|
||||
|
||||
|
||||
import pickle
|
||||
import argparse
|
||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
||||
|
|
Loading…
Reference in New Issue
Block a user