tei_conllu_conversions/switch_tei_tags.py

42 lines
1.3 KiB
Python
Raw Normal View History

#!/usr/bin/python
# -*- coding: utf-8 -*-
# python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger
import sys
import os
import shutil
import codecs
import lxml.etree as lxml
# Positional command-line arguments: the directory the XML files are read
# from, and the directory the converted files are written to.
input_directory = sys.argv[1]
output_directory = sys.argv[2]

# Start from an empty output directory: remove any existing tree (errors,
# such as the directory not existing yet, are ignored) and create it afresh.
shutil.rmtree(output_directory, ignore_errors=True)
os.makedirs(output_directory)

# TEI namespace URI, and its '{uri}' form used to build qualified tag names.
tei_namespace = 'http://www.tei-c.org/ns/1.0'
tei_namespace_qualifier = '{%s}' % tei_namespace
def do_file(input_file_name):
    """Rewrite the TEI tags of one XML file and save it to the output directory.

    Files whose name does not end in ``.xml`` are skipped.  For an XML file:

    * its name is appended as one line to the log file,
    * every TEI ``<c>`` element is renamed to ``<S>`` and its text removed,
    * every TEI ``<pc>`` element is renamed to ``<c>``,
    * the tree is written, pretty-printed and UTF-8 encoded, under the same
      file name inside ``output_directory``.

    Relies on the module-level names ``input_directory``,
    ``output_directory``, ``tei_namespace``, ``tei_namespace_qualifier`` and
    ``log``.
    NOTE(review): ``log`` is not defined in the visible part of this file --
    confirm it is assigned before this function is called.

    :param input_file_name: bare file name (no directory part) of a file
        located in ``input_directory``.
    :returns: ``None``.
    """
    if not input_file_name.endswith('.xml'):
        return

    with open(log, 'a') as fp:
        fp.write(input_file_name + "\n")

    # The path is passed to lxml as a plain string: the Python 2-only
    # ``unicode()`` builtin does not exist on Python 3, and on Python 2 it
    # fails for byte-string paths containing non-ASCII characters.
    tree = lxml.parse(os.path.join(input_directory, input_file_name))
    root = tree.getroot()
    namespaces = {'tei': tei_namespace}

    # Order matters: the existing <c> elements must be renamed to <S> before
    # <pc> is renamed to <c>, otherwise the freshly renamed elements would be
    # picked up by the './/tei:c' query as well.
    for space in root.xpath('.//tei:c', namespaces=namespaces):
        space.tag = tei_namespace_qualifier + 'S'
        space.text = None

    for punctuation in root.xpath('.//tei:pc', namespaces=namespaces):
        punctuation.tag = tei_namespace_qualifier + 'c'

    output_file_name = os.path.join(output_directory, input_file_name)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
if __name__ == "__main__":
    # When run as a script, convert every listed input file in turn.
    # NOTE(review): input_file_names is not defined in the visible source --
    # confirm it is assigned at module level before this loop runs.
    for current_name in input_file_names:
        do_file(current_name)