You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

26 lines
1.0 KiB

import argparse
import re
import lxml.etree as lxml
# Command-line interface. This script merges a single-token dictionary and a
# multiple-token dictionary into one output file, so the description shown by
# --help states the merge rather than the inverse split operation.
arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries into one dictionary.')
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
# All three options are required; argparse exits with a usage message if any
# of them is missing.
arguments = arg_parser.parse_args()
# Module-level names consumed by the merge step later in the script.
single_file_name = arguments.single
multiple_file_name = arguments.multiple
output_file_name = arguments.outfile
def get_entries(input_file_name):
    """Parse the XML file *input_file_name* and return its top-level entries.

    The result is a list holding the direct children of the document's root
    element, in document order.
    """
    document = lxml.parse(input_file_name)
    root_element = document.getroot()
    return list(root_element)
def _sentence_number(entry):
    """Return the leading integer of *entry*'s ``sid`` attribute.

    The attribute must look like ``s<digits>.<digits>``; only the first run
    of digits is used as the sort key.

    Raises:
        ValueError: if the attribute is absent or does not match that shape.
    """
    sid = entry.get('sid')
    # Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
    match = re.search(r'^s(\d+)\.\d+$', sid) if sid is not None else None
    if match is None:
        raise ValueError(
            "entry <{}> has a missing or malformed 'sid' attribute: {!r}".format(entry.tag, sid)
        )
    return int(match.group(1))


# Single-token entries come first in the concatenation; list.sort() is stable,
# so entries sharing a sentence number keep that relative order.
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
entries.sort(key=_sentence_number)

# Collect the sorted entries under a fresh <dictionary> root. The 'sid'
# attribute is only needed for ordering, so it is removed from the output.
root = lxml.Element('dictionary')
for entry in entries:
    del entry.attrib['sid']
    root.append(entry)

tree = lxml.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)