You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

28 lines
1.1 KiB

import argparse
import re
import lxml.etree as lxml
def get_entries(input_file_name):
    """Parse an XML file and return the root element's children as a list.

    Args:
        input_file_name: Path (or other source accepted by ``lxml.parse``)
            of the XML document to read.

    Returns:
        A new list holding each direct child of the document's root
        element, in document order.
    """
    document = lxml.parse(input_file_name)
    root_element = document.getroot()
    # Iterating an element yields its direct children.
    return list(root_element)
def merge(single_file_name, multiple_file_name, output_file_name):
    """Merge two dictionary files into one file ordered by entry ``sid``.

    The entries of both inputs are concatenated (single-token entries
    first), sorted by the integer that follows the leading ``s`` in their
    ``sid`` attribute (``s<number>.<number>``), stripped of that attribute
    and written under a fresh ``<dictionary>`` root element.

    Args:
        single_file_name: Path of the single-token dictionary XML.
        multiple_file_name: Path of the multiple-token dictionary XML.
        output_file_name: Path the merged XML document is written to.

    Raises:
        TypeError: If an entry has no ``sid`` attribute.
        AttributeError: If a ``sid`` does not match ``s<digits>.<digits>``.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a plain literal and
    # trigger a SyntaxWarning on recent Python versions. Compiled once so
    # the sort key does not look the pattern up for every entry.
    sid_pattern = re.compile(r'^s(\d+)\.\d+$')

    def sid_number(entry):
        # Only the first number takes part in the ordering.
        return int(sid_pattern.search(entry.get('sid')).group(1))

    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # list.sort is stable, so entries sharing the same leading number keep
    # their concatenation order (single-token entries before multi-token).
    entries.sort(key=sid_number)

    root = lxml.Element('dictionary')
    for entry in entries:
        # 'sid' is only needed for ordering; it is not kept in the output.
        del entry.attrib['sid']
        root.append(entry)

    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
# Command-line entry point: read the two input dictionaries named by
# -single and -multiple and write the merged result to -outfile.
if __name__ == '__main__':
    # The description previously read "Split input TEI into single and
    # multiple token units.", which describes the opposite operation.
    arg_parser = argparse.ArgumentParser(
        description='Merge single and multiple token dictionaries into one dictionary.')
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()
    merge(arguments.single, arguments.multiple, arguments.outfile)