25 lines
1.0 KiB
Python
25 lines
1.0 KiB
Python
|
import argparse
import re

import lxml.etree as lxml
|
||
|
|
||
|
# Command-line interface: two input dictionaries and one output path, all required.
# The description states what this script does (merge), matching the option help.
arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries into one.')
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
arguments = arg_parser.parse_args()

# Module-level names read by the rest of the script.
single_file_name = arguments.single
multiple_file_name = arguments.multiple
output_file_name = arguments.outfile
|
||
|
|
||
|
def get_entries(input_file_name):
    """Parse an XML file and return the direct children of its root element.

    Args:
        input_file_name: Path of the XML file to parse.

    Returns:
        A list holding every direct child of the document's root element,
        in document order.
    """
    document = lxml.parse(input_file_name)
    document_root = document.getroot()
    return list(document_root)
|
||
|
|
||
|
# lxml exposes the `xml:id` attribute under its namespace-qualified name;
# the prefixed spelling 'xml:id' never matches an attribute of a parsed element.
XML_ID_ATTRIBUTE = '{http://www.w3.org/XML/1998/namespace}id'

# Entry ids have the form 's<number>.<number>'; the first number is the sort key.
ENTRY_ID_PATTERN = re.compile(r'^s(\d+)\.\d+$')


def _entry_number(entry):
    """Return the integer that precedes the dot in the entry's xml:id.

    Args:
        entry: An element carrying an xml:id of the form 's<number>.<number>'.

    Returns:
        The first number of the id as an int.

    Raises:
        ValueError: If the xml:id attribute is missing or malformed.
    """
    entry_id = entry.get(XML_ID_ATTRIBUTE)
    match = ENTRY_ID_PATTERN.search(entry_id) if entry_id is not None else None
    if match is None:
        raise ValueError('Entry has a missing or malformed xml:id: %r' % (entry_id,))
    return int(match.group(1))


# Combine both inputs and order them by entry number. list.sort is stable, so
# entries sharing a number keep their relative order (single before multiple).
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
entries.sort(key=_entry_number)

# Build the output document; appending moves each element out of its source tree.
root = lxml.Element('dictionary')
for entry in entries:
    # The xml:id is only needed for ordering, so it is dropped from the output.
    del entry.attrib[XML_ID_ATTRIBUTE]
    root.append(entry)

tree = lxml.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|