"""Merge a single-token and a multiple-token dictionary into one XML file.

The direct children of both input roots are collected, ordered by the number
that follows the ``s`` prefix of their ``sid`` attribute, stripped of that
attribute, and written under a fresh ``dictionary`` root element.
"""

import argparse
import re

import lxml.etree as lxml

# Raw string on purpose: in a plain literal '\d' and '\.' are invalid escape
# sequences, which newer Python versions warn about at compile time.
SID_PATTERN = re.compile(r'^s(\d+)\.\d+$')


def get_entries(input_file_name):
    """Return the direct children of the root element of an XML file.

    Args:
        input_file_name: Path of the XML file to parse.

    Returns:
        A list with every direct child of the document root, in document
        order.
    """
    return list(lxml.parse(input_file_name).getroot())


def _sid_number(entry):
    """Return the integer between the ``s`` prefix and the dot of ``sid``.

    Raises:
        ValueError: If the entry has no ``sid`` attribute or its value does
            not look like ``s<digits>.<digits>``.
    """
    sid = entry.get('sid')
    match = SID_PATTERN.search(sid) if sid is not None else None
    if match is None:
        raise ValueError(
            "Entry has a missing or malformed 'sid' attribute: {!r}".format(sid)
        )
    return int(match.group(1))


def main():
    """Parse the command line, merge both dictionaries and write the result."""
    arg_parser = argparse.ArgumentParser(
        description='Merge single and multiple token dictionaries into one dictionary.'
    )
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()

    single_file_name = arguments.single
    multiple_file_name = arguments.multiple
    output_file_name = arguments.outfile

    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # list.sort is stable, so entries sharing the same number keep their
    # concatenation order: single-token entries before multiple-token ones.
    entries.sort(key=_sid_number)

    root = lxml.Element('dictionary')
    for entry in entries:
        # 'sid' is only needed for ordering; it is not carried into the output.
        del entry.attrib['sid']
        root.append(entry)

    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


if __name__ == '__main__':
    main()