#!/usr/bin/python
# -*- coding: utf-8 -*-
# python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger
"""Rewrite TEI whitespace and punctuation tags for every XML file in a directory.

Usage:
    python switch_tei_tags.py INPUT_DIR OUTPUT_DIR [LOG_FILE]

For each ``*.xml`` file in INPUT_DIR the script:

* renames every descendant ``tei:c`` element to ``tei:S`` and clears its text;
* renames every descendant ``tei:pc`` element to ``tei:c``;
* writes the result, pretty-printed as UTF-8, to OUTPUT_DIR under the same
  file name.

OUTPUT_DIR is deleted and recreated on every run.  The name of each processed
file is appended to LOG_FILE when it is given, and written to stderr otherwise.
"""

import sys
import os
import shutil
import codecs

import lxml.etree as lxml

input_directory = sys.argv[1]
output_directory = sys.argv[2]
# Optional third argument: path of a progress log that is appended to.
# When it is absent, progress lines go to stderr instead.
log = sys.argv[3] if len(sys.argv) > 3 else None

# NOTE: the output directory is wiped before processing (errors ignored), so
# it must never be the same directory as the input.
shutil.rmtree(output_directory, True)
os.makedirs(output_directory)

tei_namespace = 'http://www.tei-c.org/ns/1.0'
tei_namespace_qualifier = '{' + tei_namespace + '}'
# Prefix map shared by all XPath queries below.
tei_namespaces = {'tei': tei_namespace}


def _log_progress(input_file_name):
    """Record that *input_file_name* is about to be processed."""
    line = input_file_name + "\n"
    if log is None:
        sys.stderr.write(line)
        return
    with open(log, 'a') as fp:
        fp.write(line)


def do_file(input_file_name):
    """Convert one file from ``input_directory`` into ``output_directory``.

    Args:
        input_file_name: Bare file name (no directory part).  Names that do
            not end in ``.xml`` are skipped silently.

    Raises:
        lxml.XMLSyntaxError: If the input file is not well-formed XML.
        IOError / OSError: If the input cannot be read or the output or log
            cannot be written.
    """
    if not input_file_name.endswith('.xml'):
        return

    _log_progress(input_file_name)

    # The path is handed to lxml as-is; no unicode() coercion, so this runs
    # on both Python 2 and Python 3.
    tree = lxml.parse(os.path.join(input_directory, input_file_name))
    root = tree.getroot()

    # Order matters: the existing <c> elements must be renamed to <S> BEFORE
    # <pc> is renamed to <c>, otherwise the freshly renamed punctuation
    # elements would be caught by the first query as well.
    for space in root.xpath('.//tei:c', namespaces=tei_namespaces):
        space.tag = tei_namespace_qualifier + 'S'
        space.text = None

    for punctuation in root.xpath('.//tei:pc', namespaces=tei_namespaces):
        punctuation.tag = tei_namespace_qualifier + 'c'

    output_file_name = os.path.join(output_directory, input_file_name)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


if __name__ == "__main__":
    # Sorted so that processing (and log) order is deterministic.
    input_file_names = sorted(os.listdir(input_directory))
    for fname in input_file_names:
        do_file(fname)