"""Split a TEI file into single-token and multiple-token paragraph files.

The input document is read twice.  The first pass drops every ``tei:p``
that holds more than one token and writes the remainder to the "single"
output; the second pass drops every ``tei:p`` that holds exactly one token
and writes the remainder to the "multiple" output.  A token is a ``tei:w``
or ``tei:pc`` descendant of the paragraph.

Paragraphs with zero tokens match neither removal condition, so they are
kept in both output files.
"""
import argparse

import lxml.etree as lxml

TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'


def xpath_find(element, expression):
    """Evaluate an XPath expression on ``element`` with the TEI namespace.

    The prefix ``tei`` is bound to ``TEI_NAMESPACE`` for the evaluation.
    Returns whatever ``element.xpath`` returns for the expression.
    """
    return element.xpath(expression, namespaces={'tei': TEI_NAMESPACE})


def count_tokens(paragraph):
    """Return how many ``tei:w`` and ``tei:pc`` descendants ``paragraph`` has."""
    return len(xpath_find(paragraph, './/tei:w|.//tei:pc'))


def _write_filtered(input_file_name, output_file_name, should_remove):
    """Parse the input, drop selected paragraphs, and write the result.

    Args:
        input_file_name: Path of the TEI file to parse.
        output_file_name: Path the filtered tree is written to (UTF-8,
            pretty-printed).
        should_remove: Callable taking a paragraph's token count and
            returning a true value when that paragraph must be removed.
    """
    # A fresh parse per output keeps the two filtered trees independent.
    tree = lxml.parse(input_file_name)
    root = tree.getroot()

    # The XPath result is collected before the loop starts, so detaching
    # elements while looping does not disturb the iteration.
    for paragraph in xpath_find(root, './/tei:p'):
        if should_remove(count_tokens(paragraph)):
            # NOTE(review): lxml's remove() is documented to drop the
            # element's tail text as well -- confirm that is acceptable
            # for inputs with meaningful text between paragraphs.
            paragraph.getparent().remove(paragraph)

    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


def main():
    """Parse the command line and write the single and multiple outputs."""
    arg_parser = argparse.ArgumentParser(
        description='Split input TEI into single and multiple token units.')
    arg_parser.add_argument('-infile', type=str, required=True,
                            help='Input TEI file')
    arg_parser.add_argument('-single', type=str, required=True,
                            help='Output single token TEI file')
    arg_parser.add_argument('-multiple', type=str, required=True,
                            help='Output multiple token TEI file')
    arguments = arg_parser.parse_args()

    # Single-token output: discard paragraphs with more than one token.
    _write_filtered(arguments.infile, arguments.single,
                    lambda token_count: token_count > 1)
    # Multiple-token output: discard paragraphs with exactly one token.
    _write_filtered(arguments.infile, arguments.multiple,
                    lambda token_count: token_count == 1)


if __name__ == '__main__':
    main()