@ -1,15 +1,16 @@
""" Convert a series of CoNNL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse
import re
import sys
from glob import glob
from lxml import etree
from conversion_utils . jos_msds_and_properties import Converter , Msd
converter = Converter ( )
def translate_msd(msd_text, lang, lemma=None):
    """Translate an English MSD tag into language `lang` using conversion_utils.

    The tag is parsed as an English ('en') MSD, converted to its properties,
    and the properties are converted back to an MSD whose `code` is returned.

    Args:
        msd_text: MSD tag text, interpreted as English ('en').
        lang: target language passed to the converter (the caller visible in
            this file passes 'sl').
        lemma: optional lemma forwarded to `msd_to_properties`.

    Returns:
        The `code` attribute of the translated MSD.
    """
    # NOTE(review): `lang` used to be ignored in favour of a hard-coded 'sl';
    # honouring it is identical for the existing 'sl' caller - confirm that no
    # other caller relied on the hard-coded value.
    properties = converter.msd_to_properties(Msd(msd_text, 'en'), lang, lemma)
    return converter.properties_to_msd(properties, lang).code
class Sentence :
def __init__ ( self , _id , no_ud = False , system = ' jos ' ) :
@ -65,6 +66,7 @@ class Sentence:
to_add = etree . Element ( ' w ' )
to_add . set ( ' lemma ' , lemma )
xpos = translate_msd ( xpos , ' sl ' , lemma )
to_add . set ( ' ana ' , ' mte: ' + xpos )
if not self . no_ud :
if upos_other != ' _ ' :
@ -144,35 +146,28 @@ class Paragraph:
class TeiDocument:
    """One document of the corpus: an id, optional metadata and paragraphs."""

    def __init__(self, _id, paragraphs=None, metadata=None):
        """Store the document id, its paragraphs and its metadata.

        Args:
            _id: document identifier; may be None.
            paragraphs: list of paragraph objects; the given list is stored
                as-is (not copied). Defaults to a new empty list.
            metadata: optional mapping of element name -> text, written into
                the `bibl` element by `as_xml`.
        """
        self._id = _id
        self.metadata = metadata
        # A fresh list per instance: a `list()` default would be created once
        # and shared by every document constructed without paragraphs.
        self.paragraphs = [] if paragraphs is None else paragraphs

    def as_xml(self):
        """Serialise the document as a `div` element.

        The `div` carries the document id (when not None), followed by a
        `bibl` element holding the metadata and then one child per paragraph.

        Returns:
            The `div` element.
        """
        root = etree.Element('div')

        xml_id = self._id
        if xml_id is not None:
            set_xml_attr(root, 'id', xml_id)

        bibl = etree.Element('bibl')
        bibl.set('corresp', f'#{xml_id}')
        bibl.set('n', f'#{xml_id}')
        # metadata defaults to None; treat that as "no metadata entries"
        # instead of failing on None.items().
        for key, value in (self.metadata or {}).items():
            bibl_el = etree.Element(key)
            bibl_el.text = value
            bibl.append(bibl_el)
        root.append(bibl)

        for para in self.paragraphs:
            root.append(para.as_xml(id_prefix=xml_id))

        return root
def add_paragraph ( self , paragraph ) :
@ -180,10 +175,13 @@ class TeiDocument:
def build_tei_etrees(documents):
    """Wrap all documents in a single `body` element.

    Args:
        documents: iterable of objects exposing `as_xml()`.

    Returns:
        The `body` root element, with the TEI namespace, `base` and `lang`
        attributes set and one child per document.
    """
    root = etree.Element('body')
    root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
    set_xml_attr(root, 'base', 'korpus.xml')
    set_xml_attr(root, 'lang', 'sl')
    # A single root is returned (not a list of per-document trees), because
    # convert_file hands the result directly to etree.ElementTree.
    for document in documents:
        root.append(document.as_xml())
    return root
def set_xml_attr ( node , attribute , value ) :
@ -206,11 +204,12 @@ def is_metaline(line):
return False
def construct_tei_documents ( conllu_lines ):
def construct_tei_documents ( conllu_lines , metadata ):
documents = [ ]
doc_id = None
document_paragraphs = [ ]
doc_id_num = 0
document_paragraphs = [ ]
para_id = None
para_buffer = [ ]
@ -222,9 +221,12 @@ def construct_tei_documents(conllu_lines):
if len ( para_buffer ) > 0 :
document_paragraphs . append ( construct_paragraph ( para_id , para_buffer ) )
if len ( document_paragraphs ) > 0 :
print ( metadata )
print ( doc_id_num )
documents . append (
TeiDocument ( doc_id , document_paragraphs ) )
TeiDocument ( doc_id , document_paragraphs , metadata [ doc_id_num ] ))
document_paragraphs = [ ]
doc_id_num + = 1
doc_id = val
elif key == ' newpar id ' :
if len ( para_buffer ) > 0 :
@ -242,7 +244,8 @@ def construct_tei_documents(conllu_lines):
if len ( document_paragraphs ) > 0 :
documents . append (
TeiDocument ( doc_id , document_paragraphs ) )
TeiDocument ( doc_id , document_paragraphs , metadata [ doc_id_num ] ) )
doc_id_num + = 1
return documents
@ -285,7 +288,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens [ 5 ]
depparse_link = tokens [ 6 ]
depparse_link_name = tokens [ 7 ]
misc = { el . split ( ' = ' ) [ 0 ] : el . split ( ' = ' ) [ 1 ] for el in tokens [ 9 ] . split ( ' | ' ) } if tokens [ 9 ] != ' _ ' else { }
misc = { el . split ( ' = ' ) [ 0 ] : el . split ( ' = ' ) [ 1 ] for el in tokens [ 9 ] . split ( ' | ' ) }
sentence . add_item (
token ,
@ -306,14 +309,14 @@ def construct_sentence(sent_id, lines):
return sentence
def construct_tei_etrees(conllu_lines, metadata):
    """Build the TEI element tree for a stream of CoNLL-U lines.

    Args:
        conllu_lines: iterable of CoNLL-U lines.
        metadata: per-document metadata, forwarded to
            `construct_tei_documents`.

    Returns:
        Whatever `build_tei_etrees` returns for the constructed documents.
    """
    documents = construct_tei_documents(conllu_lines, metadata)
    return build_tei_etrees(documents)
def convert_file(input_file_name, output_file_name, metadata):
    """Convert one CoNLL-U file into a TEI XML file.

    Args:
        input_file_name: path of the CoNLL-U file to read.
        output_file_name: path of the XML file to write (UTF-8,
            pretty-printed).
        metadata: per-document metadata, forwarded to `construct_tei_etrees`.
    """
    # CoNLL-U files are UTF-8, so do not depend on the platform default
    # encoding; the context manager closes the file even if conversion raises.
    with open(input_file_name, 'r', encoding='utf-8') as input_file:
        root = construct_tei_etrees(input_file, metadata)
    tree = etree.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@ -323,10 +326,13 @@ def convert_file(input_file_name, output_file_name):
if __name__ == ' __main__ ' :
import argparse
from glob import glob
parser = argparse . ArgumentParser ( description = ' Convert CoNNL-U to TEI. ' )
parser . add_argument ( ' files ' , nargs = ' + ' , help = ' CoNNL-U file ' )
parser . add_argument ( ' -o ' , ' --out-file ' , dest = ' out ' , default = None , help = ' Write output to file instead of stdout. ' )
parser . add_argument ( ' -o ' , ' --out-file ' , dest = ' out ' , default = None ,
help = ' Write output to file instead of stdout. ' )
parser . add_argument ( ' -s ' , ' --system ' , dest = ' system ' , default = ' jos ' , choices = [ ' jos ' , ' ud ' ] )
args = parser . parse_args ( )