Compare commits

2 Commits

10 changed files with 138 additions and 171 deletions

2
.gitignore vendored
View File

@@ -1,4 +1,2 @@
*.pyc
venv
data
.idea

View File

@@ -4,34 +4,20 @@ import sys
from lxml import etree
from conversion_utils.jos_msds_and_properties import Converter, Msd
converter = Converter()
def translate_msd(msd_text, lang, lemma=None):
    """Convert an MSD code with the module-level conversion_utils ``converter``.

    ``msd_text`` is wrapped as an ``Msd`` tagged ``'en'``, converted to
    properties in ``'sl'`` (``lemma`` is passed along), and converted back to
    an ``'sl'`` MSD whose ``code`` is returned. ``lang`` is accepted but not
    used: both language tags are fixed in the body.
    """
    source_msd = Msd(msd_text, 'en')
    properties = converter.msd_to_properties(source_msd, 'sl', lemma)
    target_msd = converter.properties_to_msd(properties, 'sl')
    return target_msd.code
class Sentence:
def __init__(self, _id, no_ud=False, system='jos'):
self._id = _id
self.items = []
self.links = []
self.srl_links = []
self.no_ud = no_ud
self.system = system
# Append one token record to ``self.items``.
# NOTE(review): the surrounding text is a rendered diff without +/- markers, and
# this span appears to hold lines from two revisions of the method. The first
# three body lines read ``misc`` as a mapping ('SpaceAfter' and 'NER' keys) and
# append a 7-element record ending in the NER label; the last line reads
# ``misc`` as a '|'-separated string and appends a 6-element record ending in
# the "SpaceAfter=No" flag. Confirm which revision is current before editing.
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
# A missing 'NER' key falls back to the label 'O'.
ner = misc['NER'] if 'NER' in misc else 'O'
self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
def add_link(self, link_ref, link_type):
    """Record one dependency link as a ``[link_ref, link_type]`` pair."""
    pair = [link_ref, link_type]
    self.links.append(pair)
def add_srl_link(self, link_ref, link_type):
    """Record one SRL link as a ``[link_ref, link_type]`` pair."""
    pair = [link_ref, link_type]
    self.srl_links.append(pair)
def as_xml(self, id_prefix=None):
if id_prefix:
xml_id = id_prefix + '.' + self._id
@@ -41,24 +27,8 @@ class Sentence:
set_xml_attr(base, 'id', xml_id)
id_counter = 1
in_seg = False
sentence_base = base
for item in self.items:
token, lemma, upos, upos_other, xpos, no_space_after, ner = item
if ner[0] == 'B':
if in_seg:
sentence_base.append(base)
in_seg = True
base = etree.Element('seg')
base.set('type', 'name')
base.set('subtype', f'{ner[2:].lower()}')
elif ner[0] == 'O':
if in_seg:
sentence_base.append(base)
base = sentence_base
in_seg = False
token, lemma, upos, upos_other, xpos, no_space_after = item
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
to_add = etree.Element('pc')
@@ -66,7 +36,6 @@ class Sentence:
to_add = etree.Element('w')
to_add.set('lemma', lemma)
xpos = translate_msd(xpos,'sl',lemma)
to_add.set('ana', 'mte:' + xpos)
if not self.no_ud:
if upos_other != '_':
@@ -84,11 +53,6 @@ class Sentence:
base.append(to_add)
if in_seg:
sentence_base.append(base)
base = sentence_base
# depparsing linkGrp
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#'+xml_id)
link_grp.set('targFunc', 'head argument')
@@ -103,23 +67,6 @@ class Sentence:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
# srl linkGrp
if self.srl_links:
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#' + xml_id)
link_grp.set('targFunc', 'head argument')
link_grp.set('type', 'SRL')
for link_id, item in enumerate(self.srl_links):
link_ref, link_type = item
link = etree.Element('link')
link.set('ana', 'srl:' + link_type.replace(':', '_'))
if link_ref == u'0':
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
else:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
return base
@@ -146,28 +93,35 @@ class Paragraph:
class TeiDocument:
def __init__(self, _id, paragraphs=list(), metadata=None):
def __init__(self, _id, paragraphs=list()):
self._id = _id
self.metadata = metadata
self.paragraphs = paragraphs
# Build and return the lxml element for this document.
# NOTE(review): the surrounding text is a rendered diff without +/- markers, and
# this span appears to hold lines from two revisions of the method. One creates
# a 'div' root, fills a 'bibl' element from ``self.metadata`` and appends each
# paragraph to the root. The other creates a 'TEI' root with 'teiHeader' and
# 'text'/'body' children, appends each paragraph to ``body`` and records tag
# counts in the header. Confirm which revision is current before editing.
def as_xml(self):
root = etree.Element('div')
root = etree.Element('TEI')
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
set_xml_attr(root, 'lang', 'sl')
xml_id = self._id
# The id attribute is only set when the document has an id.
if xml_id is not None:
set_xml_attr(root, 'id', xml_id)
tei_header = etree.SubElement(root, 'teiHeader')
bibl = etree.Element('bibl')
bibl.set('corresp', f'#{xml_id}')
bibl.set('n', f'#{xml_id}')
# One child element per metadata entry: the key is the tag, the value the text.
for k, v in self.metadata.items():
bibl_el = etree.Element(k)
bibl_el.text = v
bibl.append(bibl_el)
root.append(bibl)
text = etree.SubElement(root, 'text')
body = etree.SubElement(text, 'body')
# Paragraph ids are prefixed with the document id.
for para in self.paragraphs:
root.append(para.as_xml(id_prefix=xml_id))
body.append(para.as_xml(id_prefix=xml_id))
encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
namespace = etree.SubElement(tags_decl, 'namespace')
namespace.set('name', 'http://www.tei-c.org/ns/1.0')
# Count each listed tag below <text> and record it as a tagUsage entry.
for tag in ['p', 's', 'pc', 'w']:
count = int(text.xpath('count(.//{})'.format(tag)))
tag_usage = etree.SubElement(namespace, 'tagUsage')
tag_usage.set('gi', tag)
tag_usage.set('occurs', str(count))
return root
def add_paragraph(self, paragraph):
@@ -175,13 +129,10 @@ class TeiDocument:
# Convert every document in ``documents`` with its ``as_xml()`` method.
# NOTE(review): the surrounding text is a rendered diff without +/- markers, and
# this span appears to hold lines from two revisions of the function. One
# appends each converted document to a single 'body' root element and returns
# that root; the other collects the converted documents in the list
# ``elements`` and returns the list. Confirm which revision is current.
def build_tei_etrees(documents):
root = etree.Element('body')
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
set_xml_attr(root, 'base', 'korpus.xml')
set_xml_attr(root, 'lang', 'sl')
elements = []
for document in documents:
root.append(document.as_xml())
return root
elements.append(document.as_xml())
return elements
def set_xml_attr(node, attribute, value):
@@ -204,12 +155,11 @@ def is_metaline(line):
return False
def construct_tei_documents(conllu_lines, metadata):
def construct_tei_documents(conllu_lines):
documents = []
doc_id = None
doc_id_num = 0
document_paragraphs = []
document_paragraphs = []
para_id = None
para_buffer = []
@@ -221,12 +171,9 @@ def construct_tei_documents(conllu_lines, metadata):
if len(para_buffer) > 0:
document_paragraphs.append(construct_paragraph(para_id, para_buffer))
if len(document_paragraphs) > 0:
print(metadata)
print(doc_id_num)
documents.append(
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
TeiDocument(doc_id, document_paragraphs))
document_paragraphs = []
doc_id_num += 1
doc_id = val
elif key == 'newpar id':
if len(para_buffer) > 0:
@@ -244,8 +191,7 @@ def construct_tei_documents(conllu_lines, metadata):
if len(document_paragraphs) > 0:
documents.append(
TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num]))
doc_id_num += 1
TeiDocument(doc_id, document_paragraphs))
return documents
@@ -288,7 +234,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens[5]
depparse_link = tokens[6]
depparse_link_name = tokens[7]
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')}
misc = tokens[9]
sentence.add_item(
token,
@@ -301,22 +247,17 @@ def construct_sentence(sent_id, lines):
sentence.add_link(
depparse_link,
depparse_link_name)
if 'SRL' in misc:
sentence.add_srl_link(
depparse_link,
misc['SRL'])
return sentence
# Build TEI documents from CoNLL-U lines and pass them to build_tei_etrees.
# NOTE(review): two ``def`` lines and two ``documents = ...`` lines are present;
# they look like the old (with ``metadata``) and new (without ``metadata``)
# revision of the same function from the rendered diff. Confirm which applies.
def construct_tei_etrees(conllu_lines, metadata):
documents = construct_tei_documents(conllu_lines, metadata)
def construct_tei_etrees(conllu_lines):
documents = construct_tei_documents(conllu_lines)
return build_tei_etrees(documents)
# Read ``input_file_name``, convert its lines with construct_tei_etrees, and
# write the result to ``output_file_name`` as pretty-printed UTF-8 XML.
# NOTE(review): two ``def`` lines and two ``root = ...`` lines are present; they
# look like the old (with ``metadata``) and new (without ``metadata``) revision
# of the same function from the rendered diff. Confirm which applies.
def convert_file(input_file_name, output_file_name, metadata):
def convert_file(input_file_name, output_file_name):
# NOTE(review): the handle is closed only by the explicit close() on the last
# line, so an exception raised before it leaves the file open; a ``with``
# block would close it on every path.
input_file = open(input_file_name, 'r')
root = construct_tei_etrees(input_file, metadata)
# This variant keeps only the first element of the returned sequence.
root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
input_file.close()

View File

@@ -230,7 +230,10 @@ class Msd:
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
# Exception types of this module.
# NOTE(review): two class header lines precede a single ``pass``; they look like
# the old (ConverterException) and new (CustomException) name of the same base
# class from the rendered diff. Confirm which name is current.
class ConverterException(Exception):
class CustomException(Exception):
pass
# Raised by Converter.check_valid_msd for an unknown MSD when validity is required.
class MsdException(CustomException):
pass
class Converter:
@@ -253,17 +256,46 @@ class Converter:
except:
exit('Could not parse specifications xml file provided.')
def msd_to_properties(self, msd, language, lemma=None):
"""Convert Msd to Properties (possibly in the other language).
def is_valid_msd(self, msd):
    """Report whether ``msd.code`` is listed in the code map for ``msd.language``."""
    known_codes = self.specifications.codes_map[msd.language]
    return msd.code in known_codes
def check_valid_msd(self, msd, require_valid_flag):
    """Raise or warn when ``msd`` does not pass ``is_valid_msd``.

    A valid Msd passes silently. For an invalid one, a MsdException is
    raised when ``require_valid_flag`` is true; otherwise a '[WARN]' line
    is printed.
    """
    if self.is_valid_msd(msd):
        return
    message = 'The msd {} is unknown'.format(msd.code)
    if not require_valid_flag:
        print('[WARN] ' + message)
        return
    raise MsdException(message)
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
"""Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language.
If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
JOS set. Otherwise only a warning is given.
If you care about accurate level information (i.e., which properties are lexeme-level and
which are form-level), note that the level of some features depends on the particular lemma.
For such features, if lemma is not provided and warn_level_flag is True, a warning will be given.
If a MSD has dashes in place of letters for certain features, they are skipped, so that
these features are not included in the generated Properties object.
Parameters:
msd(Msd): the JOS MSD to convert
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
lemma(str): the lemma of the word form with the MSD
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
warn_level_flag(boolean): whether to warn when the level of a property cannot be determined with certainty
Returns:
Properties: the result of the conversion of the Msd in the language requested
The level (lexeme vs form) of certain reflexive msd features
depends on the lemma, so set the lemma if you need accurate
level information.
"""
# if (msd.code not in self.specifications.codes_map[msd.language]):
# raise ConverterException('The msd {} is unknown'.format(msd.code))
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower()
value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -277,8 +309,8 @@ class Converter:
value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.names.get(language)
feature_value = value.names.get(language)
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
.format(category=category_name, position=index))
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@@ -289,8 +321,21 @@ class Converter:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language):
"""Convert Properties to msd (possibly in the other language)."""
def properties_to_msd(self, properties, language, require_valid_flag=False):
"""Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language.
If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
the standard JOS set. Otherwise only a warning is given.
Any skipped positions among the Properties are represented as dashes in the MSD.
Parameters:
properties(Properties): the properties to convert
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
"""
category = self.specifications.find_category_by_name(properties.category, properties.language)
category_char = category.codes.get(language).upper()
feature_map = properties.lexeme_feature_map.copy()
@@ -308,7 +353,9 @@ class Converter:
msd_code += '-'
i += 1
msd_code += position_map[position]
return Msd(msd_code, language)
msd = Msd(msd_code, language)
self.check_valid_msd(msd, require_valid_flag)
return msd
def translate_msd(self, msd, language):
    """Return ``msd`` re-expressed as an Msd in ``language``.

    The Msd is converted to Properties in ``language``, and those
    Properties are converted back to an Msd in the same language.
    """
    properties = self.msd_to_properties(msd, language)
    return self.properties_to_msd(properties, language)

View File

@@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Msd
from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
class JosMsdToPropertiesTestCase(unittest.TestCase):
@@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.assertEqual(properties.category, 'punctuation')
self.assertEqual(properties.lexeme_feature_map, {})
self.assertEqual(properties.form_feature_map, {})
def test_good_msd_with_require_valid(self):
    """Convert 'Ncfpd' with require_valid_flag=True and check the resulting properties."""
    source = Msd('Ncfpd', 'en')
    result = self.converter.msd_to_properties(source, 'en', require_valid_flag=True)
    expected_lexeme = {'type':'common', 'gender':'feminine'}
    expected_form = {'number':'plural', 'case':'dative'}
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    self.assertEqual(result.lexeme_feature_map, expected_lexeme)
    self.assertEqual(result.form_feature_map, expected_form)
def test_bad_msd(self):
    """Convert 'N---d' without requiring validity; only the case feature is expected."""
    source = Msd('N---d', 'en')
    result = self.converter.msd_to_properties(source, 'en')
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    # The dashed positions yield no lexeme-level features at all.
    self.assertEqual(result.lexeme_feature_map, {})
    self.assertEqual(result.form_feature_map, {'case':'dative'})
def test_bad_msd_with_require_valid(self):
    """Converting 'N---d' with require_valid_flag=True must raise MsdException."""
    # assertRaises replaces the manual try/except/flag and reports a clear
    # failure message when no exception is raised.
    with self.assertRaises(MsdException):
        self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)

View File

@@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -41,3 +41,21 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
self.assertEqual(msd.language, 'sl')
self.assertEqual(msd.code, 'U')
def test_good_msd_with_require_valid(self):
    """Build an English MSD from noun properties with require_valid_flag=True."""
    lexeme_features = {'type':'common', 'gender':'feminine'}
    form_features = {'number':'dual', 'case':'nominative'}
    source = Properties('noun', lexeme_features, form_features, 'en')
    result = self.converter.properties_to_msd(source, 'en', require_valid_flag=True)
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.code, 'Ncfdn')
def test_bad_msd(self):
    """Build an MSD from partial noun properties without requiring validity."""
    source = Properties('noun', {'type':'common'}, {'number':'dual'}, 'en')
    result = self.converter.properties_to_msd(source, 'en')
    self.assertEqual(result.language, 'en')
    # The expected code carries a dash at the position with no property.
    self.assertEqual(result.code, 'Nc-d')
def test_bad_msd_with_require_valid(self):
    """Building an MSD from partial noun properties with require_valid_flag=True must raise MsdException."""
    # assertRaises replaces the manual try/except/flag and reports a clear
    # failure message when no exception is raised.
    with self.assertRaises(MsdException):
        self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)

59
run.py
View File

@@ -1,59 +0,0 @@
import os
from conversion_utils.conllu_to_tei import convert_file
import csv
# dir_path = 'data/conllu'
# out_dir_path = 'data/tei'
# for filename in os.listdir(dir_path):
# in_name = os.path.join(dir_path, filename)
# out_filename = filename.split('.')[:-1]
# out_filename = '.'.join(out_filename) + '.xml'
# out_name = os.path.join(out_dir_path, out_filename)
# convert_file(in_name, out_name)
# Field names in the order of the rows of data/metadata.csv: every CSV row
# holds one field, and every column after the first describes one document.
METADATA_FIELDS = (
    'title',
    'subtitle',
    'authors',
    'first_edition',
    'edition_in_corpus',
    'layer_according_to_SEJO',
    'audience',
    'hours_of_classes',
    'publisher',
    'file_name',
)

with open('data/metadata.csv', newline='') as csvfile:
    metadata_list = list(csv.reader(csvfile))

# One dict per document column (column 0 is skipped).
metadata = [
    {field: metadata_list[row_i][col_i] for row_i, field in enumerate(METADATA_FIELDS)}
    for col_i in range(1, len(metadata_list[0]))
]

dir_path = 'data/conllu'
out_path = 'data/tei/tei.xml'
# Despite its name, this is the path of the single merged CoNLL-U file.
out_dir = 'data/conllu.conllu'

# Positions in `metadata`, collected in the order the input files are merged.
metadata_indices = []
with open(out_dir, 'w') as out_file:
    for fn_i, filename in enumerate(os.listdir(dir_path)):
        in_name = os.path.join(dir_path, filename)
        # File name without its last extension; matched against 'file_name'.
        out_filename = '.'.join(filename.split('.')[:-1])
        metadata_indices.extend(
            m_i for m_i, el in enumerate(metadata) if el['file_name'] == out_filename)
        with open(in_name, 'r') as in_file:
            # Every input file is preceded by its own '# newdoc id' line.
            out_file.write(f'# newdoc id = doc{fn_i + 1}\n' + in_file.read())

# The merged file is closed at this point, so the converter reads complete data.
shuffled_metadata = [metadata[el] for el in metadata_indices]
convert_file(out_dir, out_path, shuffled_metadata)

View File

@@ -6,7 +6,7 @@ setup(name='conversion_utils',
url='https://gitea.cjvt.si/generic/conversion_utils',
author='Cyprian Laskowski',
author_email='cyp@cjvt.si',
packages=['conversion_utils', 'conversion_utils.resources'],
packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
install_requires=['importlib_resources'],
include_package_data=True,
zip_safe=True)