Added rudimentary module documentation and made a couple of basic fixes
This commit is contained in:
		
							parent
							
								
									f28b5a3a01
								
							
						
					
					
						commit
						03ce9f8ac7
					
				@ -1,7 +1,8 @@
 | 
				
			|||||||
## Conversion utilities
 | 
					## CJVT conversion utilities
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This repository is currently intended for common conversions needed by CJVT developers. For the
 | 
					This repository is intended for common conversions needed by CJVT developers. It can of course also
 | 
				
			||||||
moment, this is limited to JOS msds and properties.
 | 
					be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
 | 
				
			||||||
 | 
					were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### JOS msds and properties
 | 
					### JOS msds and properties
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1,9 +1,16 @@
 | 
				
			|||||||
 | 
					"""Convert a series of CoNLL-U files to a TEI file.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import argparse
 | 
					import argparse
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
 | 
					from glob import glob
 | 
				
			||||||
from lxml import etree
 | 
					from lxml import etree
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Sentence:
 | 
					class Sentence:
 | 
				
			||||||
    def __init__(self, _id, no_ud=False, system='jos'):
 | 
					    def __init__(self, _id, no_ud=False, system='jos'):
 | 
				
			||||||
        self._id = _id
 | 
					        self._id = _id
 | 
				
			||||||
@ -316,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
    import argparse
 | 
					 | 
				
			||||||
    from glob import glob
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
 | 
					    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
 | 
				
			||||||
    parser.add_argument('files', nargs='+', help='CoNNL-U file')
 | 
					    parser.add_argument('files', nargs='+', help='CoNNL-U file')
 | 
				
			||||||
    parser.add_argument('-o', '--out-file', dest='out', default=None,
 | 
					    parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
 | 
				
			||||||
                help='Write output to file instead of stdout.')
 | 
					 | 
				
			||||||
    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
 | 
					    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    args = parser.parse_args()
 | 
					    args = parser.parse_args()
 | 
				
			||||||
 | 
				
			|||||||
@ -1,12 +1,19 @@
 | 
				
			|||||||
 | 
					"""Convert a TEI file to an XML file of the CJVT standard schema.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import argparse
 | 
					import argparse
 | 
				
			||||||
import lxml.etree as lxml
 | 
					import lxml.etree as lxml
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
 | 
					from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_parsed_unit_string(parsed_unit):
 | 
					def get_parsed_unit_string(parsed_unit):
 | 
				
			||||||
    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
 | 
					    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
 | 
				
			||||||
    return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
 | 
					    return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def convert(input_file_name, output_file_name):
 | 
					def convert(input_file_name, output_file_name):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    output_root = lxml.Element('dictionary')
 | 
					    output_root = lxml.Element('dictionary')
 | 
				
			||||||
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
 | 
				
			|||||||
    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
 | 
					    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
 | 
				
			||||||
    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
 | 
					    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
 | 
				
			||||||
    arguments = arg_parser.parse_args()
 | 
					    arguments = arg_parser.parse_args()
 | 
				
			||||||
 | 
					    input_file_name = arguments.infile
 | 
				
			||||||
 | 
					    output_file_name = arguments.outfile
 | 
				
			||||||
    convert(input_file_name, output_file_name)
 | 
					    convert(input_file_name, output_file_name)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,5 +1,7 @@
 | 
				
			|||||||
#!/usr/bin/python3
 | 
					"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
 | 
				
			||||||
# -*- coding: utf-8 -*-
 | 
					
 | 
				
			||||||
 | 
					This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import argparse
 | 
					import argparse
 | 
				
			||||||
import codecs
 | 
					import codecs
 | 
				
			||||||
 | 
				
			|||||||
@ -1,11 +1,16 @@
 | 
				
			|||||||
 | 
					"""A few convenience TEI/XML constants and functions."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
 | 
					TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
 | 
				
			||||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
 | 
					TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
 | 
				
			||||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
 | 
					XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def xpath_find(element,expression):
 | 
					def xpath_find(element,expression):
 | 
				
			||||||
    """Executes XPath expression, with TEI namespace."""
 | 
					    """Executes XPath expression, with TEI namespace."""
 | 
				
			||||||
    return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
 | 
					    return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_xml_id(element):
 | 
					def get_xml_id(element):
 | 
				
			||||||
    """Returns the element's @xml:id attribute."""
 | 
					    """Returns the element's @xml:id attribute."""
 | 
				
			||||||
    return element.get(XML_ID_ATTRIBUTE_NAME)
 | 
					    return element.get(XML_ID_ATTRIBUTE_NAME)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,3 +1,13 @@
 | 
				
			|||||||
 | 
					"""Parse source TEI specifications and save as a pickle.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You can use this script to create a new pickle file to replace the one stored at
 | 
				
			||||||
 | 
					../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
 | 
				
			||||||
 | 
					of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml.  However, the specifications
 | 
				
			||||||
 | 
					are not expected to change, and if they do, the package pickle there should be updated upstream, so
 | 
				
			||||||
 | 
					you probably should not have to use this script.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pickle
 | 
					import pickle
 | 
				
			||||||
import argparse
 | 
					import argparse
 | 
				
			||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
 | 
					from conversion_utils.jos_msds_and_properties import SpecificationsParser
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							@ -1,7 +1,7 @@
 | 
				
			|||||||
from setuptools import setup
 | 
					from setuptools import setup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
setup(name='conversion_utils',
 | 
					setup(name='conversion_utils',
 | 
				
			||||||
      version='0.1',
 | 
					      version='0.2',
 | 
				
			||||||
      description='CJVT conversion utilities',
 | 
					      description='CJVT conversion utilities',
 | 
				
			||||||
      url='https://gitea.cjvt.si/generic/conversion_utils',
 | 
					      url='https://gitea.cjvt.si/generic/conversion_utils',
 | 
				
			||||||
      author='Cyprian Laskowski',
 | 
					      author='Cyprian Laskowski',
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user