35 lines
1.4 KiB
Python
35 lines
1.4 KiB
Python
import argparse
|
|
import lxml.etree as lxml
|
|
|
|
# Command-line interface: one required input path and two required output paths.
arg_parser = argparse.ArgumentParser(
    description='Split input TEI into single and multiple token units.'
)
for option, help_text in (
    ('-infile', 'Input TEI file'),
    ('-single', 'Output single token TEI file'),
    ('-multiple', 'Output multiple token TEI file'),
):
    arg_parser.add_argument(option, type=str, required=True, help=help_text)

arguments = arg_parser.parse_args()
input_file_name, single_file_name, multiple_file_name = (
    arguments.infile,
    arguments.single,
    arguments.multiple,
)

# Namespace URI that the XPath helper binds to the 'tei' prefix.
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
|
def xpath_find(element, expression):
    """Evaluate an XPath expression on ``element`` with the ``tei`` prefix mapped.

    Args:
        element: Object exposing an ``xpath`` method (an lxml element or
            tree root in this script).
        expression: XPath expression; it may use the ``tei:`` prefix, which
            is resolved to ``TEI_NAMESPACE``.

    Returns:
        Whatever ``element.xpath`` returns for the expression.
    """
    prefix_map = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=prefix_map)
|
|
|
|
def count_tokens(paragraph):
    """Return how many ``tei:w`` or ``tei:pc`` descendants ``paragraph`` has.

    Args:
        paragraph: Element whose subtree is searched.

    Returns:
        The number of matching descendant elements, at any depth.
    """
    token_nodes = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(token_nodes)
|
|
|
|
# Pass 1: re-read the input and write a copy from which every paragraph
# holding more than one token has been removed.
single_tree = lxml.parse(input_file_name)
for candidate in xpath_find(single_tree.getroot(), './/tei:p'):
    if count_tokens(candidate) <= 1:
        # Paragraphs with zero or one token stay in this output.
        continue
    candidate.getparent().remove(candidate)
single_tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
|
|
|
|
# Pass 2: re-read the input from scratch and write a copy from which every
# paragraph holding exactly one token has been removed.
multiple_tree = lxml.parse(input_file_name)
for candidate in xpath_find(multiple_tree.getroot(), './/tei:p'):
    if count_tokens(candidate) != 1:
        # Anything that is not a single-token paragraph stays in this output.
        continue
    candidate.getparent().remove(candidate)
multiple_tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True)
|