"""Collapse each paragraph of a CoNLL-U-style file into a single sentence.

Every ``# newpar id = N`` comment starts a new paragraph.  For each paragraph
the script emits one ``# newpar id`` line (id prefixed with ``p``), one
``# sent_id`` line (taken from the sentence whose id ends in ``.1``, prefixed
with ``s``), one ``# text`` line (all sentence texts joined by a space) and
all token lines renumbered consecutively from 1.
"""
import argparse
import codecs  # noqa: F401 -- no longer used here; import kept as it was
import re

# Matches '# name = value' comment lines.
_COMMENT_RE = re.compile(r'^# (.+?) = (.+)$')
# Raw strings: '\d' in a plain literal is an invalid escape sequence.
_PARAGRAPH_ID_RE = re.compile(r'^(# newpar id = )(\d+)$')
_SENTENCE_ID_RE = re.compile(r'^(# sent_id = )(\d+\.1)$')


def write(output_file, line):
    """Write *line* to *output_file* followed by a newline."""
    output_file.write(line + '\n')


def write_paragraph(output_file, output_map):
    """Write one collected paragraph to *output_file*.

    *output_map* is either ``None`` (nothing collected yet; nothing is
    written) or a dict with the keys ``'paragraph'`` and ``'sentence'``
    (complete comment lines), ``'texts'`` (list of sentence texts) and
    ``'tokens'`` (list of tab-separated token lines).

    Only the first column of each token line is replaced by its 1-based
    position in the paragraph; all other columns are copied unchanged.

    Raises:
        ValueError: if no sentence id was recorded for the paragraph.
    """
    if output_map is None:
        return
    # Validate before writing anything so a bad paragraph is not half-emitted.
    if output_map['sentence'] is None:
        raise ValueError(
            'paragraph %r has no sent_id ending in ".1"'
            % output_map['paragraph']
        )

    write(output_file, output_map['paragraph'])
    write(output_file, output_map['sentence'])
    write(output_file, '# text = ' + ' '.join(output_map['texts']))
    for index, token_line in enumerate(output_map['tokens'], start=1):
        columns = token_line.split('\t')[1:]
        write(output_file, '\t'.join([str(index)] + columns))
    # Blank line terminates the (merged) sentence.
    write(output_file, '')


def _require_paragraph(output_map, line_number):
    """Return *output_map*, or raise if no paragraph has been opened yet."""
    if output_map is None:
        raise ValueError(
            'line %d: content found before any "# newpar id" comment'
            % line_number
        )
    return output_map


def tweak(input_file_name, output_file_name):
    """Read *input_file_name* and write the merged result to *output_file_name*.

    Lines starting with a digit are treated as token lines.  Of the
    ``# name = value`` comments only ``newpar id``, ``sent_id`` and ``text``
    are used; every other line is ignored.

    Raises:
        ValueError: if a token, ``sent_id`` or ``text`` line appears before
            the first ``# newpar id`` comment, or a paragraph has no
            ``sent_id`` ending in ``.1``.
    """
    # The input is opened first so that a missing input file does not
    # truncate an existing output file.
    # NOTE(review): no encoding is passed (same as before), so the platform
    # default applies -- consider an explicit encoding if inputs are UTF-8.
    with open(input_file_name, 'r') as input_file, \
            open(output_file_name, 'w') as output_file:
        output_map = None
        for line_number, line in enumerate(input_file, start=1):
            if line[:1].isdigit():
                current = _require_paragraph(output_map, line_number)
                current['tokens'].append(line.strip())
                continue

            match = _COMMENT_RE.search(line)
            if not match:
                continue
            name, value = match.groups()

            if name == 'newpar id':
                # A new paragraph begins: flush the previous one first.
                write_paragraph(output_file, output_map)
                output_map = {
                    'paragraph': _PARAGRAPH_ID_RE.sub(r'\1p\2', line.strip()),
                    'sentence': None,
                    'texts': [],
                    'tokens': [],
                }
            elif name == 'sent_id':
                current = _require_paragraph(output_map, line_number)
                # Only the sentence numbered '.1' supplies the merged id.
                if value.endswith('.1'):
                    current['sentence'] = _SENTENCE_ID_RE.sub(
                        r'\1s\2', line.strip()
                    )
            elif name == 'text':
                current = _require_paragraph(output_map, line_number)
                current['texts'].append(value)

        # Flush the final paragraph (no-op for input without paragraphs).
        write_paragraph(output_file, output_map)


def main():
    """Parse the command line and run :func:`tweak`."""
    arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
    # Both paths are mandatory; without them open() would receive None.
    arg_parser.add_argument('-infile', type=str, required=True,
                            help='Input file')
    arg_parser.add_argument('-outfile', type=str, required=True,
                            help='Output file')
    arguments = arg_parser.parse_args()
    tweak(arguments.infile, arguments.outfile)


if __name__ == '__main__':
    main()