#!/usr/bin/python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
# Usage: python switch_tei_tags.py <input_directory> <output_directory>
# Example: python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger
|
||
|
|
||
|
import sys
|
||
|
import os
|
||
|
import shutil
|
||
|
import codecs
|
||
|
|
||
|
import lxml.etree as lxml
|
||
|
|
||
|
# Command-line arguments: the directory to read from and the directory to
# write to.
input_directory = sys.argv[1]
output_directory = sys.argv[2]

# Start from a fresh output directory: remove any existing tree (errors
# while removing are ignored), then create it.
shutil.rmtree(output_directory, ignore_errors=True)
os.makedirs(output_directory)

# TEI namespace URI and the '{uri}' prefix used to build qualified tag names.
tei_namespace = 'http://www.tei-c.org/ns/1.0'
tei_namespace_qualifier = '{%s}' % tei_namespace
|
||
|
|
||
|
def do_file(input_file_name, log_path=None):
    """Rewrite the TEI tags of one file from the input directory.

    Names that do not end in '.xml' are skipped. For an XML file, every TEI
    ``c`` element is renamed to ``S`` and its text is removed, then every TEI
    ``pc`` element is renamed to ``c``. The modified tree is written under the
    same file name into ``output_directory``.

    Arguments:
    input_file_name -- file name relative to ``input_directory``.
    log_path -- optional path of a file to which the processed file name is
        appended. When omitted, the name is written to stderr instead.

    Returns None.
    """
    if not input_file_name.endswith('.xml'):
        return

    # Record which file is being processed.
    progress_line = input_file_name + "\n"
    if log_path is None:
        sys.stderr.write(progress_line)
    else:
        with open(log_path, 'a') as fp:
            fp.write(progress_line)

    # The path is passed as a plain string: no unicode() conversion, which
    # does not exist on Python 3.
    tree = lxml.parse(os.path.join(input_directory, input_file_name))
    root = tree.getroot()
    ns = {'tei': tei_namespace}

    # Order matters: the original <c> elements are collected and renamed
    # first, so the <pc> elements renamed to <c> below are not turned into <S>.
    for space in root.xpath('.//tei:c', namespaces=ns):
        space.tag = tei_namespace_qualifier + 'S'
        space.text = None

    for punctuation in root.xpath('.//tei:pc', namespaces=ns):
        punctuation.tag = tei_namespace_qualifier + 'c'

    output_file_name = os.path.join(output_directory, input_file_name)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||
|
|
||
|
if __name__ == "__main__":
    # Walk every entry of the input directory; do_file skips any name that
    # does not end in '.xml'. Sorting makes the processing order deterministic.
    for fname in sorted(os.listdir(input_directory)):
        do_file(fname)