Compare commits
6 Commits
d7be39d894
...
v0.3
| Author | SHA1 | Date | |
|---|---|---|---|
| f43ea39f1b | |||
| 03ce9f8ac7 | |||
| f28b5a3a01 | |||
| 89be603103 | |||
| 99ac426e4b | |||
| 89bcde58aa |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,2 +1,7 @@
|
||||
*.pyc
|
||||
venv
|
||||
data
|
||||
.idea
|
||||
build
|
||||
dist
|
||||
*.egg-info
|
||||
|
||||
22
LICENSE.txt
Normal file
22
LICENSE.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 CLARIN.SI
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
## Conversion utilities
|
||||
## CJVT conversion utilities
|
||||
|
||||
This repository is currently intended for common conversions needed by CJVT developers. For the
|
||||
moment, this is limited to JOS msds and properties.
|
||||
This repository is intended for common conversions needed by CJVT developers. It can of course also
|
||||
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
|
||||
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
|
||||
|
||||
### JOS msds and properties
|
||||
|
||||
|
||||
@@ -1,23 +1,36 @@
|
||||
"""Convert a series of CoNNL-U files to a TEI file.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
|
||||
from glob import glob
|
||||
from lxml import etree
|
||||
|
||||
|
||||
class Sentence:
|
||||
def __init__(self, _id, no_ud=False, system='jos'):
|
||||
self._id = _id
|
||||
self.items = []
|
||||
self.links = []
|
||||
self.srl_links = []
|
||||
self.no_ud = no_ud
|
||||
self.system = system
|
||||
|
||||
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
    """Append one token record to ``self.items``.

    ``misc`` is expected to be the token's MISC column already parsed into a
    dict of ``key -> value`` (an empty dict when the column was ``_``).

    The stored record is
    ``[token, lemma, upos, upos_other, xpos, no_space_after, ner]`` where
    ``no_space_after`` is True only for ``SpaceAfter=No`` and ``ner``
    defaults to ``'O'`` when no ``NER`` key is present.
    """
    # Exactly one 7-field record is appended per token.  The earlier 6-field
    # append relied on ``misc.split('|')``, which does not exist on a dict and
    # would have left a second, shorter record for the same token.
    no_space_after = misc.get('SpaceAfter') == 'No'
    ner = misc.get('NER', 'O')
    self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
|
||||
|
||||
def add_link(self, link_ref, link_type):
    """Record a ``[link_ref, link_type]`` pair at the end of ``self.links``."""
    pair = [link_ref, link_type]
    self.links.append(pair)
|
||||
|
||||
def add_srl_link(self, link_ref, link_type):
    """Record a ``[link_ref, link_type]`` pair at the end of ``self.srl_links``."""
    pair = [link_ref, link_type]
    self.srl_links.append(pair)
|
||||
|
||||
def as_xml(self, id_prefix=None):
|
||||
if id_prefix:
|
||||
xml_id = id_prefix + '.' + self._id
|
||||
@@ -27,8 +40,24 @@ class Sentence:
|
||||
set_xml_attr(base, 'id', xml_id)
|
||||
id_counter = 1
|
||||
|
||||
in_seg = False
|
||||
sentence_base = base
|
||||
|
||||
for item in self.items:
|
||||
token, lemma, upos, upos_other, xpos, no_space_after = item
|
||||
token, lemma, upos, upos_other, xpos, no_space_after, ner = item
|
||||
|
||||
if ner[0] == 'B':
|
||||
if in_seg:
|
||||
sentence_base.append(base)
|
||||
in_seg = True
|
||||
base = etree.Element('seg')
|
||||
base.set('type', 'name')
|
||||
base.set('subtype', f'{ner[2:].lower()}')
|
||||
elif ner[0] == 'O':
|
||||
if in_seg:
|
||||
sentence_base.append(base)
|
||||
base = sentence_base
|
||||
in_seg = False
|
||||
|
||||
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
|
||||
to_add = etree.Element('pc')
|
||||
@@ -53,6 +82,11 @@ class Sentence:
|
||||
|
||||
base.append(to_add)
|
||||
|
||||
if in_seg:
|
||||
sentence_base.append(base)
|
||||
base = sentence_base
|
||||
|
||||
# depparsing linkGrp
|
||||
link_grp = etree.Element('linkGrp')
|
||||
link_grp.set('corresp', '#'+xml_id)
|
||||
link_grp.set('targFunc', 'head argument')
|
||||
@@ -67,6 +101,23 @@ class Sentence:
|
||||
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
|
||||
link_grp.append(link)
|
||||
base.append(link_grp)
|
||||
|
||||
# srl linkGrp
|
||||
if self.srl_links:
|
||||
link_grp = etree.Element('linkGrp')
|
||||
link_grp.set('corresp', '#' + xml_id)
|
||||
link_grp.set('targFunc', 'head argument')
|
||||
link_grp.set('type', 'SRL')
|
||||
for link_id, item in enumerate(self.srl_links):
|
||||
link_ref, link_type = item
|
||||
link = etree.Element('link')
|
||||
link.set('ana', 'srl:' + link_type.replace(':', '_'))
|
||||
if link_ref == u'0':
|
||||
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
|
||||
else:
|
||||
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
|
||||
link_grp.append(link)
|
||||
base.append(link_grp)
|
||||
return base
|
||||
|
||||
|
||||
@@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
|
||||
upos_other = tokens[5]
|
||||
depparse_link = tokens[6]
|
||||
depparse_link_name = tokens[7]
|
||||
misc = tokens[9]
|
||||
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
|
||||
|
||||
sentence.add_item(
|
||||
token,
|
||||
@@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
|
||||
sentence.add_link(
|
||||
depparse_link,
|
||||
depparse_link_name)
|
||||
|
||||
if 'SRL' in misc:
|
||||
sentence.add_srl_link(
|
||||
depparse_link,
|
||||
misc['SRL'])
|
||||
return sentence
|
||||
|
||||
|
||||
@@ -267,13 +323,10 @@ def convert_file(input_file_name, output_file_name):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
from glob import glob
|
||||
|
||||
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
||||
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
||||
parser.add_argument('-o', '--out-file', dest='out', default=None,
|
||||
help='Write output to file instead of stdout.')
|
||||
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
|
||||
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import lxml.etree as lxml
|
||||
import re
|
||||
import pickle
|
||||
import importlib_resources as pkg_resources
|
||||
from importlib_resources import files
|
||||
|
||||
from conversion_utils.utils import xpath_find, get_xml_id
|
||||
|
||||
@@ -241,9 +241,10 @@ class Converter:
|
||||
|
||||
def __init__(self, xml_file_name=None):
|
||||
if (xml_file_name is None):
|
||||
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
|
||||
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
|
||||
if (resource.is_file()):
|
||||
try:
|
||||
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
|
||||
with resource.open('rb') as pickle_file:
|
||||
self.specifications = pickle.load(pickle_file)
|
||||
except:
|
||||
exit('Could not parse specifications pickle file installed.')
|
||||
|
||||
@@ -1,12 +1,19 @@
|
||||
"""Convert a TEI file to a XML file of the CJVT standard schema.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import lxml.etree as lxml
|
||||
|
||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
|
||||
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
|
||||
|
||||
|
||||
def get_parsed_unit_string(parsed_unit):
    """Return the text of the unit's ``tei:w`` / ``tei:pc`` elements as one string.

    Each element's text is followed by a single space unless the element has
    ``join="right"``; leading and trailing whitespace of the result is stripped.
    """
    pieces = []
    for element in xpath_find(parsed_unit, 'tei:w|tei:pc'):
        if element.get('join') == 'right':
            pieces.append(element.text)
        else:
            pieces.append(element.text + ' ')
    return ''.join(pieces).strip()
|
||||
|
||||
|
||||
def convert(input_file_name, output_file_name):
|
||||
|
||||
output_root = lxml.Element('dictionary')
|
||||
@@ -55,4 +62,6 @@ if (__name__ == '__main__'):
|
||||
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
||||
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
||||
arguments = arg_parser.parse_args()
|
||||
input_file_name = arguments.infile
|
||||
output_file_name = arguments.outfile
|
||||
convert(input_file_name, output_file_name)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
|
||||
|
||||
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
@@ -8,6 +10,7 @@ from importlib_resources import files
|
||||
|
||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
||||
|
||||
|
||||
def get_syn_map():
|
||||
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
||||
dict_file = codecs.open(dict_file_name, 'r')
|
||||
@@ -15,7 +18,8 @@ def get_syn_map():
|
||||
dict_file.close()
|
||||
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
||||
|
||||
def translate(input_file_name, output_file_name):
|
||||
|
||||
def translate(input_file_name, scope, output_file_name):
|
||||
|
||||
syn_map = get_syn_map()
|
||||
|
||||
@@ -29,7 +33,9 @@ def translate(input_file_name, output_file_name):
|
||||
if (len(columns) != 10):
|
||||
output_file.write(line)
|
||||
else:
|
||||
if (scope in {'msd', 'both'}):
|
||||
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
|
||||
if (scope in {'dep', 'both'}):
|
||||
columns[7] = syn_map[columns[7]]
|
||||
output_file.write('\t'.join(columns) + '\n')
|
||||
|
||||
@@ -41,6 +47,7 @@ if (__name__ == '__main__'):
|
||||
|
||||
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
|
||||
arg_parser.add_argument('-infile', type=str, help='Input conllu')
|
||||
# argparse restricts accepted values with ``choices``; ``options`` is not a valid keyword and raises TypeError.
arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both', help='Which tags to translate: msd, dep or both')
|
||||
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
|
||||
arguments = arg_parser.parse_args()
|
||||
input_file_name = arguments.infile
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
"""A few convenience TEI/XML constants and functions."""
|
||||
|
||||
|
||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||
|
||||
|
||||
def xpath_find(element, expression):
    """Evaluate ``expression`` as XPath on ``element`` with the ``tei`` prefix bound to the TEI namespace."""
    tei_namespaces = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_namespaces)
|
||||
|
||||
|
||||
def get_xml_id(element):
    """Look up the element's ``@xml:id`` attribute via ``element.get`` and return the result."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""Parse source TEI specifications and save as a pickle.
|
||||
|
||||
You can use this script to create a new pickle file to replace the one stored at
|
||||
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
|
||||
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
|
||||
are not expected to change, and if they do, the package pickle there should be updated upstream, so
|
||||
you probably should not have to use this script.
|
||||
"""
|
||||
|
||||
|
||||
import pickle
|
||||
import argparse
|
||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
||||
|
||||
18
setup.py
18
setup.py
@@ -1,12 +1,20 @@
|
||||
"""Packaging script for the CJVT conversion utilities."""
from setuptools import setup
import os

# The README next to this script is used as the long description.
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

# Single setup() call with each keyword given once; repeated keywords
# (name/version/author/author_email/install_requires) are a syntax error.
setup(name='cjvt_conversion_utils',
      version='0.3',
      description='CJVT conversion utilities',
      long_description=long_description,
      long_description_content_type="text/markdown",
      url='https://gitea.cjvt.si/generic/conversion_utils',
      author='CJVT',
      author_email='pypi@cjvt.si',
      license='MIT',
      packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
      install_requires=['lxml', 'importlib_resources'],
      include_package_data=True,
      zip_safe=True)
|
||||
|
||||
Reference in New Issue
Block a user