Compare commits
8 Commits
multiple_f
...
master
Author | SHA1 | Date | |
---|---|---|---|
f43ea39f1b | |||
03ce9f8ac7 | |||
f28b5a3a01 | |||
89be603103 | |||
99ac426e4b | |||
89bcde58aa | |||
d7be39d894 | |||
4ca67ec8cc |
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,3 +2,6 @@
|
||||||
venv
|
venv
|
||||||
data
|
data
|
||||||
.idea
|
.idea
|
||||||
|
build
|
||||||
|
dist
|
||||||
|
*.egg-info
|
||||||
|
|
22
LICENSE.txt
Normal file
22
LICENSE.txt
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 CLARIN.SI
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
## Conversion utilities
|
## CJVT conversion utilities
|
||||||
|
|
||||||
This repository is currently intended for common conversions needed by CJVT developers. For the
|
This repository is intended for common conversions needed by CJVT developers. It can of course also
|
||||||
moment, this is limited to JOS msds and properties.
|
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
|
||||||
|
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
|
||||||
|
|
||||||
### JOS msds and properties
|
### JOS msds and properties
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,15 @@
|
||||||
|
"""Convert a series of CoNLL-U files to a TEI file.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from glob import glob
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
|
||||||
|
|
||||||
converter = Converter()
|
|
||||||
def translate_msd(msd_text, lang, lemma=None):
|
|
||||||
""" Translates msd using conversion_utils library. """
|
|
||||||
return converter.properties_to_msd(converter.msd_to_properties(Msd(msd_text, 'en'), 'sl', lemma),
|
|
||||||
'sl').code
|
|
||||||
|
|
||||||
class Sentence:
|
class Sentence:
|
||||||
def __init__(self, _id, no_ud=False, system='jos'):
|
def __init__(self, _id, no_ud=False, system='jos'):
|
||||||
|
@ -66,7 +65,6 @@ class Sentence:
|
||||||
to_add = etree.Element('w')
|
to_add = etree.Element('w')
|
||||||
to_add.set('lemma', lemma)
|
to_add.set('lemma', lemma)
|
||||||
|
|
||||||
xpos = translate_msd(xpos,'sl',lemma)
|
|
||||||
to_add.set('ana', 'mte:' + xpos)
|
to_add.set('ana', 'mte:' + xpos)
|
||||||
if not self.no_ud:
|
if not self.no_ud:
|
||||||
if upos_other != '_':
|
if upos_other != '_':
|
||||||
|
@ -146,28 +144,35 @@ class Paragraph:
|
||||||
|
|
||||||
|
|
||||||
class TeiDocument:
|
class TeiDocument:
|
||||||
def __init__(self, _id, paragraphs=list(), metadata=None):
|
def __init__(self, _id, paragraphs=list()):
|
||||||
self._id = _id
|
self._id = _id
|
||||||
self.metadata = metadata
|
|
||||||
self.paragraphs = paragraphs
|
self.paragraphs = paragraphs
|
||||||
|
|
||||||
def as_xml(self):
|
def as_xml(self):
|
||||||
root = etree.Element('div')
|
root = etree.Element('TEI')
|
||||||
|
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
|
||||||
|
set_xml_attr(root, 'lang', 'sl')
|
||||||
|
|
||||||
xml_id = self._id
|
xml_id = self._id
|
||||||
if xml_id is not None:
|
if xml_id is not None:
|
||||||
set_xml_attr(root, 'id', xml_id)
|
set_xml_attr(root, 'id', xml_id)
|
||||||
|
|
||||||
bibl = etree.Element('bibl')
|
tei_header = etree.SubElement(root, 'teiHeader')
|
||||||
bibl.set('corresp', f'#{xml_id}')
|
|
||||||
bibl.set('n', f'#{xml_id}')
|
text = etree.SubElement(root, 'text')
|
||||||
for k, v in self.metadata.items():
|
body = etree.SubElement(text, 'body')
|
||||||
bibl_el = etree.Element(k)
|
|
||||||
bibl_el.text = v
|
|
||||||
bibl.append(bibl_el)
|
|
||||||
root.append(bibl)
|
|
||||||
for para in self.paragraphs:
|
for para in self.paragraphs:
|
||||||
root.append(para.as_xml(id_prefix=xml_id))
|
body.append(para.as_xml(id_prefix=xml_id))
|
||||||
|
|
||||||
|
encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
|
||||||
|
tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
|
||||||
|
namespace = etree.SubElement(tags_decl, 'namespace')
|
||||||
|
namespace.set('name', 'http://www.tei-c.org/ns/1.0')
|
||||||
|
for tag in ['p', 's', 'pc', 'w']:
|
||||||
|
count = int(text.xpath('count(.//{})'.format(tag)))
|
||||||
|
tag_usage = etree.SubElement(namespace, 'tagUsage')
|
||||||
|
tag_usage.set('gi', tag)
|
||||||
|
tag_usage.set('occurs', str(count))
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def add_paragraph(self, paragraph):
|
def add_paragraph(self, paragraph):
|
||||||
|
@ -175,13 +180,10 @@ class TeiDocument:
|
||||||
|
|
||||||
|
|
||||||
def build_tei_etrees(documents):
|
def build_tei_etrees(documents):
|
||||||
root = etree.Element('body')
|
elements = []
|
||||||
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
|
|
||||||
set_xml_attr(root, 'base', 'korpus.xml')
|
|
||||||
set_xml_attr(root, 'lang', 'sl')
|
|
||||||
for document in documents:
|
for document in documents:
|
||||||
root.append(document.as_xml())
|
elements.append(document.as_xml())
|
||||||
return root
|
return elements
|
||||||
|
|
||||||
|
|
||||||
def set_xml_attr(node, attribute, value):
|
def set_xml_attr(node, attribute, value):
|
||||||
|
@ -204,12 +206,11 @@ def is_metaline(line):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def construct_tei_documents(conllu_lines, metadata):
|
def construct_tei_documents(conllu_lines):
|
||||||
documents = []
|
documents = []
|
||||||
|
|
||||||
doc_id = None
|
doc_id = None
|
||||||
doc_id_num = 0
|
document_paragraphs = []
|
||||||
document_paragraphs = []
|
|
||||||
|
|
||||||
para_id = None
|
para_id = None
|
||||||
para_buffer = []
|
para_buffer = []
|
||||||
|
@ -221,12 +222,9 @@ def construct_tei_documents(conllu_lines, metadata):
|
||||||
if len(para_buffer) > 0:
|
if len(para_buffer) > 0:
|
||||||
document_paragraphs.append(construct_paragraph(para_id, para_buffer))
|
document_paragraphs.append(construct_paragraph(para_id, para_buffer))
|
||||||
if len(document_paragraphs) > 0:
|
if len(document_paragraphs) > 0:
|
||||||
print(metadata)
|
|
||||||
print(doc_id_num)
|
|
||||||
documents.append(
|
documents.append(
|
||||||
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
|
TeiDocument(doc_id, document_paragraphs))
|
||||||
document_paragraphs = []
|
document_paragraphs = []
|
||||||
doc_id_num += 1
|
|
||||||
doc_id = val
|
doc_id = val
|
||||||
elif key == 'newpar id':
|
elif key == 'newpar id':
|
||||||
if len(para_buffer) > 0:
|
if len(para_buffer) > 0:
|
||||||
|
@ -244,8 +242,7 @@ def construct_tei_documents(conllu_lines, metadata):
|
||||||
|
|
||||||
if len(document_paragraphs) > 0:
|
if len(document_paragraphs) > 0:
|
||||||
documents.append(
|
documents.append(
|
||||||
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
|
TeiDocument(doc_id, document_paragraphs))
|
||||||
doc_id_num += 1
|
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
@ -288,7 +285,7 @@ def construct_sentence(sent_id, lines):
|
||||||
upos_other = tokens[5]
|
upos_other = tokens[5]
|
||||||
depparse_link = tokens[6]
|
depparse_link = tokens[6]
|
||||||
depparse_link_name = tokens[7]
|
depparse_link_name = tokens[7]
|
||||||
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')}
|
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
|
||||||
|
|
||||||
sentence.add_item(
|
sentence.add_item(
|
||||||
token,
|
token,
|
||||||
|
@ -309,14 +306,14 @@ def construct_sentence(sent_id, lines):
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
def construct_tei_etrees(conllu_lines, metadata):
|
def construct_tei_etrees(conllu_lines):
|
||||||
documents = construct_tei_documents(conllu_lines, metadata)
|
documents = construct_tei_documents(conllu_lines)
|
||||||
return build_tei_etrees(documents)
|
return build_tei_etrees(documents)
|
||||||
|
|
||||||
|
|
||||||
def convert_file(input_file_name, output_file_name, metadata):
|
def convert_file(input_file_name, output_file_name):
|
||||||
input_file = open(input_file_name, 'r')
|
input_file = open(input_file_name, 'r')
|
||||||
root = construct_tei_etrees(input_file, metadata)
|
root = construct_tei_etrees(input_file)[0]
|
||||||
tree = etree.ElementTree(root)
|
tree = etree.ElementTree(root)
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
input_file.close()
|
input_file.close()
|
||||||
|
@ -326,13 +323,10 @@ def convert_file(input_file_name, output_file_name, metadata):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import argparse
|
|
||||||
from glob import glob
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
||||||
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
||||||
parser.add_argument('-o', '--out-file', dest='out', default=None,
|
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
|
||||||
help='Write output to file instead of stdout.')
|
|
||||||
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
import re
|
import re
|
||||||
import pickle
|
import pickle
|
||||||
import importlib_resources as pkg_resources
|
from importlib_resources import files
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id
|
from conversion_utils.utils import xpath_find, get_xml_id
|
||||||
|
|
||||||
|
@ -230,7 +230,10 @@ class Msd:
|
||||||
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
|
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
|
||||||
|
|
||||||
|
|
||||||
class ConverterException(Exception):
|
class CustomException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class MsdException(CustomException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Converter:
|
class Converter:
|
||||||
|
@ -238,9 +241,10 @@ class Converter:
|
||||||
|
|
||||||
def __init__(self, xml_file_name=None):
|
def __init__(self, xml_file_name=None):
|
||||||
if (xml_file_name is None):
|
if (xml_file_name is None):
|
||||||
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
|
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
|
||||||
|
if (resource.is_file()):
|
||||||
try:
|
try:
|
||||||
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
|
with resource.open('rb') as pickle_file:
|
||||||
self.specifications = pickle.load(pickle_file)
|
self.specifications = pickle.load(pickle_file)
|
||||||
except:
|
except:
|
||||||
exit('Could not parse specifications pickle file installed.')
|
exit('Could not parse specifications pickle file installed.')
|
||||||
|
@ -253,17 +257,46 @@ class Converter:
|
||||||
except:
|
except:
|
||||||
exit('Could not parse specifications xml file provided.')
|
exit('Could not parse specifications xml file provided.')
|
||||||
|
|
||||||
def msd_to_properties(self, msd, language, lemma=None):
|
def is_valid_msd(self, msd):
|
||||||
"""Convert Msd to Properties (possibly in the other language).
|
"""Verify if the Msd code is in the standard JOS set."""
|
||||||
|
return msd.code in self.specifications.codes_map[msd.language]
|
||||||
|
|
||||||
|
def check_valid_msd(self, msd, require_valid_flag):
|
||||||
|
"""If the Msd code is not valid, raise an exception or give a warning."""
|
||||||
|
if (not self.is_valid_msd(msd)):
|
||||||
|
message = 'The msd {} is unknown'.format(msd.code)
|
||||||
|
if (require_valid_flag):
|
||||||
|
raise MsdException(message)
|
||||||
|
else:
|
||||||
|
print('[WARN] ' + message)
|
||||||
|
|
||||||
|
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
|
||||||
|
"""Convert Msd to Properties.
|
||||||
|
|
||||||
|
The language of the generated Properties is specified and can differ from the Msd language.
|
||||||
|
|
||||||
|
If require_valid_flag is True, an MsdException is raised if the MSD is not in the standard
|
||||||
|
JOS set. Otherwise only a warning is given.
|
||||||
|
|
||||||
|
If you care about accurate level information (i.e., which properties are lexeme-level and
|
||||||
|
which are form-level), note that some features depend on the particular lemma. For such
|
||||||
|
features, if lemma is not provided and warn_level_flag is True, a warning will be given.
|
||||||
|
|
||||||
|
If an MSD has dashes in place of letters for certain features, they are skipped, so that
|
||||||
|
these features are not included in the generated Properties object.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
msd(Msd): the JOS MSD to convert
|
||||||
|
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
|
||||||
|
lemma(str): the lemma of the word form with the MSD
|
||||||
|
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
|
||||||
|
warn_level_flag(boolean): whether to warn when the level of a property cannot be determined with certainty
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Properties: the result of the conversion of the Msd in the language requested
|
||||||
|
|
||||||
The level (lexeme vs form) of certain reflexive msd features
|
|
||||||
depends on the lemma, so set the lemma if you need accurate
|
|
||||||
level information.
|
|
||||||
"""
|
"""
|
||||||
|
self.check_valid_msd(msd, require_valid_flag)
|
||||||
# if (msd.code not in self.specifications.codes_map[msd.language]):
|
|
||||||
# raise ConverterException('The msd {} is unknown'.format(msd.code))
|
|
||||||
|
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||||
|
@ -277,8 +310,8 @@ class Converter:
|
||||||
value = feature.find_value_by_char(value_char, msd.language)
|
value = feature.find_value_by_char(value_char, msd.language)
|
||||||
feature_name = feature.names.get(language)
|
feature_name = feature.names.get(language)
|
||||||
feature_value = value.names.get(language)
|
feature_value = value.names.get(language)
|
||||||
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
|
if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
|
||||||
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
|
print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
|
||||||
.format(category=category_name, position=index))
|
.format(category=category_name, position=index))
|
||||||
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||||
|
@ -289,8 +322,21 @@ class Converter:
|
||||||
form_feature_map[feature_name] = feature_value
|
form_feature_map[feature_name] = feature_value
|
||||||
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
||||||
|
|
||||||
def properties_to_msd(self, properties, language):
|
def properties_to_msd(self, properties, language, require_valid_flag=False):
|
||||||
"""Convert Properties to msd (possibly in the other language)."""
|
"""Convert Properties to Msd.
|
||||||
|
|
||||||
|
The language of the generated Msd is specified and can differ from the Properties language.
|
||||||
|
|
||||||
|
If require_valid_flag is True, an MsdException is raised if the generated MSD is not in
|
||||||
|
the standard JOS set. Otherwise only a warning is given.
|
||||||
|
|
||||||
|
Any skipped positions among the Properties are represented as dashes in the MSD.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
properties(Properties): the properties to convert
|
||||||
|
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
|
||||||
|
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
|
||||||
|
"""
|
||||||
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
||||||
category_char = category.codes.get(language).upper()
|
category_char = category.codes.get(language).upper()
|
||||||
feature_map = properties.lexeme_feature_map.copy()
|
feature_map = properties.lexeme_feature_map.copy()
|
||||||
|
@ -308,7 +354,9 @@ class Converter:
|
||||||
msd_code += '-'
|
msd_code += '-'
|
||||||
i += 1
|
i += 1
|
||||||
msd_code += position_map[position]
|
msd_code += position_map[position]
|
||||||
return Msd(msd_code, language)
|
msd = Msd(msd_code, language)
|
||||||
|
self.check_valid_msd(msd, require_valid_flag)
|
||||||
|
return msd
|
||||||
|
|
||||||
def translate_msd(self, msd, language):
|
def translate_msd(self, msd, language):
|
||||||
return self.properties_to_msd(self.msd_to_properties(msd, language), language)
|
return self.properties_to_msd(self.msd_to_properties(msd, language), language)
|
||||||
|
|
|
@ -1,12 +1,19 @@
|
||||||
|
"""Convert a TEI file to an XML file of the CJVT standard schema.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
|
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
|
||||||
|
|
||||||
|
|
||||||
def get_parsed_unit_string(parsed_unit):
|
def get_parsed_unit_string(parsed_unit):
|
||||||
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
||||||
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
||||||
|
|
||||||
|
|
||||||
def convert(input_file_name, output_file_name):
|
def convert(input_file_name, output_file_name):
|
||||||
|
|
||||||
output_root = lxml.Element('dictionary')
|
output_root = lxml.Element('dictionary')
|
||||||
|
@ -55,4 +62,6 @@ if (__name__ == '__main__'):
|
||||||
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
||||||
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
|
input_file_name = arguments.infile
|
||||||
|
output_file_name = arguments.outfile
|
||||||
convert(input_file_name, output_file_name)
|
convert(input_file_name, output_file_name)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
|
||||||
|
|
||||||
class JosMsdToPropertiesTestCase(unittest.TestCase):
|
class JosMsdToPropertiesTestCase(unittest.TestCase):
|
||||||
|
|
||||||
|
@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
||||||
self.assertEqual(properties.category, 'punctuation')
|
self.assertEqual(properties.category, 'punctuation')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {})
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
self.assertEqual(properties.form_feature_map, {})
|
self.assertEqual(properties.form_feature_map, {})
|
||||||
|
|
||||||
|
def test_good_msd_with_require_valid(self):
|
||||||
|
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en', require_valid_flag=True)
|
||||||
|
self.assertEqual(properties.language, 'en')
|
||||||
|
self.assertEqual(properties.category, 'noun')
|
||||||
|
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||||
|
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||||
|
|
||||||
|
def test_bad_msd(self):
|
||||||
|
properties = self.converter.msd_to_properties(Msd('N---d', 'en'), 'en')
|
||||||
|
self.assertEqual(properties.language, 'en')
|
||||||
|
self.assertEqual(properties.category, 'noun')
|
||||||
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
|
self.assertEqual(properties.form_feature_map, {'case':'dative'})
|
||||||
|
|
||||||
|
def test_bad_msd_with_require_valid(self):
|
||||||
|
try:
|
||||||
|
self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)
|
||||||
|
fails = False
|
||||||
|
except MsdException:
|
||||||
|
fails = True
|
||||||
|
self.assertEqual(fails, True)
|
|
@ -1,6 +1,6 @@
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Properties
|
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
|
||||||
|
|
||||||
class JosPropertiesToMsdTestCase(unittest.TestCase):
|
class JosPropertiesToMsdTestCase(unittest.TestCase):
|
||||||
|
|
||||||
|
@ -41,3 +41,21 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
|
||||||
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
||||||
self.assertEqual(msd.language, 'sl')
|
self.assertEqual(msd.language, 'sl')
|
||||||
self.assertEqual(msd.code, 'U')
|
self.assertEqual(msd.code, 'U')
|
||||||
|
|
||||||
|
def test_good_msd_with_require_valid(self):
|
||||||
|
msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en', require_valid_flag=True)
|
||||||
|
self.assertEqual(msd.language, 'en')
|
||||||
|
self.assertEqual(msd.code, 'Ncfdn')
|
||||||
|
|
||||||
|
def test_bad_msd(self):
|
||||||
|
msd = self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en')
|
||||||
|
self.assertEqual(msd.language, 'en')
|
||||||
|
self.assertEqual(msd.code, 'Nc-d')
|
||||||
|
|
||||||
|
def test_bad_msd_with_require_valid(self):
|
||||||
|
try:
|
||||||
|
self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
|
||||||
|
fails = False
|
||||||
|
except MsdException:
|
||||||
|
fails = True
|
||||||
|
self.assertEqual(fails, True)
|
|
@ -1,5 +1,7 @@
|
||||||
#!/usr/bin/python3
|
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import codecs
|
import codecs
|
||||||
|
@ -8,6 +10,7 @@ from importlib_resources import files
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
||||||
|
|
||||||
|
|
||||||
def get_syn_map():
|
def get_syn_map():
|
||||||
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
||||||
dict_file = codecs.open(dict_file_name, 'r')
|
dict_file = codecs.open(dict_file_name, 'r')
|
||||||
|
@ -15,7 +18,8 @@ def get_syn_map():
|
||||||
dict_file.close()
|
dict_file.close()
|
||||||
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
||||||
|
|
||||||
def translate(input_file_name, output_file_name):
|
|
||||||
|
def translate(input_file_name, scope, output_file_name):
|
||||||
|
|
||||||
syn_map = get_syn_map()
|
syn_map = get_syn_map()
|
||||||
|
|
||||||
|
@ -29,8 +33,10 @@ def translate(input_file_name, output_file_name):
|
||||||
if (len(columns) != 10):
|
if (len(columns) != 10):
|
||||||
output_file.write(line)
|
output_file.write(line)
|
||||||
else:
|
else:
|
||||||
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
|
if (scope in {'msd', 'both'}):
|
||||||
columns[7] = syn_map[columns[7]]
|
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
|
||||||
|
if (scope in {'dep', 'both'}):
|
||||||
|
columns[7] = syn_map[columns[7]]
|
||||||
output_file.write('\t'.join(columns) + '\n')
|
output_file.write('\t'.join(columns) + '\n')
|
||||||
|
|
||||||
input_file.close()
|
input_file.close()
|
||||||
|
@ -41,6 +47,7 @@ if (__name__ == '__main__'):
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
|
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
|
||||||
arg_parser.add_argument('-infile', type=str, help='Input conllu')
|
arg_parser.add_argument('-infile', type=str, help='Input conllu')
|
||||||
|
arg_parser.add_argument('-scope', type=str, options=['msd', 'dep', 'both'], default='both', help='Input conllu')
|
||||||
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
|
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
input_file_name = arguments.infile
|
input_file_name = arguments.infile
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
|
"""A few convenience TEI/XML constants and functions."""
|
||||||
|
|
||||||
|
|
||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
||||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
||||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||||
|
|
||||||
|
|
||||||
def xpath_find(element,expression):
|
def xpath_find(element,expression):
|
||||||
"""Executes XPath expression, with TEI namespace."""
|
"""Executes XPath expression, with TEI namespace."""
|
||||||
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
||||||
|
|
||||||
|
|
||||||
def get_xml_id(element):
|
def get_xml_id(element):
|
||||||
"""Returns the element's @xml:id attribute."""
|
"""Returns the element's @xml:id attribute."""
|
||||||
return element.get(XML_ID_ATTRIBUTE_NAME)
|
return element.get(XML_ID_ATTRIBUTE_NAME)
|
||||||
|
|
59
run.py
59
run.py
|
@ -1,59 +0,0 @@
|
||||||
import os
|
|
||||||
|
|
||||||
from conversion_utils.conllu_to_tei import convert_file
|
|
||||||
import csv
|
|
||||||
# dir_path = 'data/conllu'
|
|
||||||
# out_dir_path = 'data/tei'
|
|
||||||
# for filename in os.listdir(dir_path):
|
|
||||||
# in_name = os.path.join(dir_path, filename)
|
|
||||||
# out_filename = filename.split('.')[:-1]
|
|
||||||
# out_filename = '.'.join(out_filename) + '.xml'
|
|
||||||
# out_name = os.path.join(out_dir_path, out_filename)
|
|
||||||
# convert_file(in_name, out_name)
|
|
||||||
|
|
||||||
metadata_list = []
|
|
||||||
with open('data/metadata.csv', newline='') as csvfile:
|
|
||||||
for line in csv.reader(csvfile):
|
|
||||||
metadata_list.append(line)
|
|
||||||
|
|
||||||
metadata = [{} for i in range(len(metadata_list[0]) - 1)]
|
|
||||||
for i in range(1, len(metadata_list[0])):
|
|
||||||
metadata[i - 1]['title'] = metadata_list[0][i]
|
|
||||||
metadata[i - 1]['subtitle'] = metadata_list[1][i]
|
|
||||||
metadata[i - 1]['authors'] = metadata_list[2][i]
|
|
||||||
metadata[i - 1]['first_edition'] = metadata_list[3][i]
|
|
||||||
metadata[i - 1]['edition_in_corpus'] = metadata_list[4][i]
|
|
||||||
metadata[i - 1]['layer_according_to_SEJO'] = metadata_list[5][i]
|
|
||||||
metadata[i - 1]['audience'] = metadata_list[6][i]
|
|
||||||
metadata[i - 1]['hours_of_classes'] = metadata_list[7][i]
|
|
||||||
metadata[i - 1]['publisher'] = metadata_list[8][i]
|
|
||||||
metadata[i - 1]['file_name'] = metadata_list[9][i]
|
|
||||||
|
|
||||||
|
|
||||||
dir_path = 'data/conllu'
|
|
||||||
out_path = 'data/tei/tei.xml'
|
|
||||||
out_dir = 'data/conllu.conllu'
|
|
||||||
# out_dir = dir_path + '/conllu_small.conllu'
|
|
||||||
metadata_indices = [6, 16, 13, 7, 2, 1, 3, 14, 15, 0, 8, 4, 11, 9, 12, 5, 10]
|
|
||||||
out_file = open(out_dir, 'w')
|
|
||||||
metadata_indices = []
|
|
||||||
for fn_i, filename in enumerate(os.listdir(dir_path)):
|
|
||||||
in_name = os.path.join(dir_path, filename)
|
|
||||||
out_filename = filename.split('.')[:-1]
|
|
||||||
out_filename = '.'.join(out_filename)
|
|
||||||
for m_i, el in enumerate(metadata):
|
|
||||||
if el['file_name'] == out_filename:
|
|
||||||
metadata_indices.append(m_i)
|
|
||||||
out_filename = out_filename + '.xml'
|
|
||||||
out_name = os.path.join(out_dir, out_filename)
|
|
||||||
in_file = open(in_name, 'r')
|
|
||||||
data = f'# newdoc id = doc{str(fn_i+1)}\n'
|
|
||||||
data += in_file.read()
|
|
||||||
in_file.close()
|
|
||||||
out_file.write(data)
|
|
||||||
out_file.close()
|
|
||||||
|
|
||||||
shuffled_metadata = [metadata[el] for el in metadata_indices]
|
|
||||||
|
|
||||||
|
|
||||||
convert_file(out_dir, out_path, shuffled_metadata)
|
|
|
@ -1,3 +1,13 @@
|
||||||
|
"""Parse source TEI specifications and save as a pickle.
|
||||||
|
|
||||||
|
You can use this script to create a new pickle file to replace the one stored at
|
||||||
|
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
|
||||||
|
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
|
||||||
|
are not expected to change, and if they do, the package pickle there should be updated upstream, so
|
||||||
|
you probably should not have to use this script.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
import argparse
|
import argparse
|
||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
||||||
|
|
20
setup.py
20
setup.py
|
@ -1,12 +1,20 @@
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
import os
|
||||||
|
|
||||||
setup(name='conversion_utils',
|
here = os.path.abspath(os.path.dirname(__file__))
|
||||||
version='0.1',
|
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
|
||||||
|
long_description = f.read()
|
||||||
|
|
||||||
|
setup(name='cjvt_conversion_utils',
|
||||||
|
version='0.3',
|
||||||
description='CJVT conversion utilities',
|
description='CJVT conversion utilities',
|
||||||
|
long_description=long_description,
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
url='https://gitea.cjvt.si/generic/conversion_utils',
|
url='https://gitea.cjvt.si/generic/conversion_utils',
|
||||||
author='Cyprian Laskowski',
|
author='CJVT',
|
||||||
author_email='cyp@cjvt.si',
|
author_email='pypi@cjvt.si',
|
||||||
packages=['conversion_utils', 'conversion_utils.resources'],
|
license='MIT',
|
||||||
install_requires=['importlib_resources'],
|
packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
|
||||||
|
install_requires=['lxml', 'importlib_resources'],
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=True)
|
zip_safe=True)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user