2020-12-15 16:00:09 +00:00
|
|
|
import argparse
|
2021-01-13 18:07:35 +00:00
|
|
|
import re
|
2020-12-15 16:00:09 +00:00
|
|
|
import lxml.etree as lxml
|
|
|
|
|
|
|
|
# Command-line interface: two input dictionaries and one output path, all
# required. The description states what this script does (it merges the two
# inputs into a single output file).
arg_parser = argparse.ArgumentParser(
    description='Merge single and multiple token dictionaries into one dictionary.'
)
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
arguments = arg_parser.parse_args()

# Module-level names used by the rest of the script.
single_file_name = arguments.single
multiple_file_name = arguments.multiple
output_file_name = arguments.outfile
|
|
|
|
|
|
|
|
def get_entries(input_file_name):
    """Parse an XML file and return the root element's direct children as a list."""
    document = lxml.parse(input_file_name)
    top_element = document.getroot()
    # Iterating an element yields its direct children in document order.
    return [child for child in top_element]
|
|
|
|
|
|
|
|
# Matches sid values of the form "s<number>.<number>". Written as a raw string
# so the backslashes reach the regex engine instead of being treated as
# (invalid) string escape sequences.
_SID_PATTERN = re.compile(r'^s(\d+)\.\d+$')


def _sid_number(entry):
    """Return the first number in an entry's ``sid`` attribute as an int.

    Raises:
        ValueError: if the entry has no ``sid`` attribute, or its value is not
            of the form ``s<number>.<number>``.
    """
    sid = entry.get('sid')
    match = _SID_PATTERN.search(sid) if sid is not None else None
    if match is None:
        raise ValueError(
            "entry has a missing or malformed 'sid' attribute: {!r}".format(sid)
        )
    return int(match.group(1))


# Collect the entries of both input files, single-token file first.
entries = get_entries(single_file_name) + get_entries(multiple_file_name)

# list.sort is stable, so entries sharing the same sid number keep their
# relative input order.
entries.sort(key=_sid_number)

root = lxml.Element('dictionary')
for entry in entries:
    # The sid attribute is only needed for ordering; strip it before output.
    del entry.attrib['sid']
    root.append(entry)

tree = lxml.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|