From 822ce25add2fc8e4cf5b60230f762f7c770ae362 Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski
Date: Wed, 2 Dec 2020 23:51:28 +0100
Subject: [PATCH] IssueID #1487: expanded conllu postprocessing and cleaned a bit

---
Review notes (not part of the commit; text in this section is ignored by git am):
 * Line structure and Python indentation in this patch were reconstructed
   (4 spaces per level); hunk line counts and the diffstat were re-checked,
   but leading whitespace should be confirmed against the repository.
 * scripts/tweak_conllu.py: output_map starts as None, so a token line that
   appears before the first "# newpar id" comment is indexed on None.
 * scripts/tweak_conllu.py: output_map['sentence'] stays None unless a
   sent_id ending in ".1" is seen; write() then concatenates None + '\n'.
 * scripts/tweak_conllu.py: the argparse description still reads
   'Fix invalid XML ids.', carried over from the deleted fix_xml_ids.py.
 * scripts/tweak_conllu.py: the re.sub pattern containing \d and \. is not a
   raw string.
 * scripts/pipeline1.py: external commands are assembled as plain strings
   and run through os.system, so file names with spaces would be split.

 scripts/constants.py    |  1 +
 scripts/fix_xml_ids.py  | 21 -------------------
 scripts/pipeline1.py    | 23 +++++++--------------
 scripts/tweak_conllu.py | 46 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 54 insertions(+), 37 deletions(-)
 delete mode 100644 scripts/fix_xml_ids.py
 create mode 100644 scripts/tweak_conllu.py

diff --git a/scripts/constants.py b/scripts/constants.py
index 62c4b11..dd9ad6c 100644
--- a/scripts/constants.py
+++ b/scripts/constants.py
@@ -2,6 +2,7 @@ TMP_DIRECTORY = '../tmp/structure_assignment'
 
 
 # scripts
+CONLLU_TWEAK_SCRIPT_NAME = 'tweak_conllu.py'
 CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
 MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
 STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
diff --git a/scripts/fix_xml_ids.py b/scripts/fix_xml_ids.py
deleted file mode 100644
index 7b0ceea..0000000
--- a/scripts/fix_xml_ids.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import argparse
-import codecs
-import re
-
-arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
-arg_parser.add_argument('-infile', type=str, help='Input file')
-arg_parser.add_argument('-outfile', type=str, help='Output file')
-arguments = arg_parser.parse_args()
-input_file_name = arguments.infile
-output_file_name = arguments.outfile
-
-output_file = codecs.open(output_file_name, 'w')
-input_file = codecs.open(input_file_name, 'r')
-
-for line in input_file:
-    line = re.sub('xml:id="(?=\d)','xml:id="s', line)
-    line = line.replace('#', '#s')
-    output_file.write(line)
-
-input_file.close()
-output_file.close()
diff --git a/scripts/pipeline1.py b/scripts/pipeline1.py
index 18b1aed..d6ea313 100644
--- a/scripts/pipeline1.py
+++ b/scripts/pipeline1.py
@@ -2,7 +2,6 @@ import argparse
 import os
 import shutil
 import codecs
-import re
 import classla
 from classla import Document
 
@@ -31,28 +30,20 @@ def run_pipeline(input_file_name, output_file_name):
     os.makedirs(TMP_DIRECTORY, exist_ok=True)
     shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)
     run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
-    fix_xml_ids(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
+    tweak_conllu(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
     run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
     run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)
     shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
 
-def run_obeliks4J(obeliks_file_name, classla_file_name):
+def run_obeliks4J(list_file_name, conllu_file_name):
     print('Running obeliks ...')
-    obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + STRING_LIST_FILE_NAME + ' -o ' + OBELIKS_RAW_FILE_NAME
+    obeliks_command = 'java -jar ' + OBELIKS_JAR_FILE_NAME + ' -d -if ' + list_file_name + ' -o ' + conllu_file_name
     os.system(obeliks_command)
 
-def fix_xml_ids(input_file_name, output_file_name):
-    print('Fixing xml ids ...')
-    output_file = codecs.open(output_file_name, 'w')
-    input_file = codecs.open(input_file_name, 'r')
-    regexp = r'^(# sent_id = )(\d+\.\d+)$'
-    for line in input_file:
-        match = re.search(regexp, line)
-        if (match):
-            line = match.group(1) + XML_ID_PREFIX + match.group(2) + '\n'
-        output_file.write(line)
-    input_file.close()
-    output_file.close()
+def tweak_conllu(input_file_name, output_file_name):
+    print('Tweaking conllu results ...')
+    tweak_command = ' '.join(['python', CONLLU_TWEAK_SCRIPT_NAME, '-infile', input_file_name, '-outfile', output_file_name])
+    os.system(tweak_command)
 
 def run_classla(obeliks_file_name, classla_file_name):
     print('Running classla ...')
diff --git a/scripts/tweak_conllu.py b/scripts/tweak_conllu.py
new file mode 100644
index 0000000..96215fd
--- /dev/null
+++ b/scripts/tweak_conllu.py
@@ -0,0 +1,46 @@
+import argparse
+import codecs
+import re
+
+arg_parser = argparse.ArgumentParser(description='Fix invalid XML ids.')
+arg_parser.add_argument('-infile', type=str, help='Input file')
+arg_parser.add_argument('-outfile', type=str, help='Output file')
+arguments = arg_parser.parse_args()
+input_file_name = arguments.infile
+output_file_name = arguments.outfile
+
+output_file = codecs.open(output_file_name, 'w')
+input_file = codecs.open(input_file_name, 'r')
+
+def write(output_file, line):
+    output_file.write(line + '\n')
+
+def write_paragraph(output_file, output_map):
+    if (output_map is not None):
+        write(output_file, output_map['paragraph'])
+        write(output_file, output_map['sentence'])
+        write(output_file, '# text = ' + ' '.join(output_map['texts']))
+        for (index, token_line) in enumerate(output_map['tokens'], start=1):
+            write(output_file, '\t'.join([str(index)] + token_line.split('\t')[1:]))
+        write(output_file, '')
+
+output_map = None
+for line in input_file:
+    if (line[0].isdigit()):
+        output_map['tokens'].append(line.strip())
+    else:
+        match = re.search('^# (.+) = (.+)$', line)
+        if (match):
+            (name, value) = match.groups()
+            if (name == 'newpar id'):
+                write_paragraph(output_file, output_map)
+                output_map = {'paragraph': line.strip(), 'sentence':None, 'texts':[], 'tokens':[]}
+            elif (name == 'sent_id'):
+                if (value.endswith('.1')):
+                    output_map['sentence'] = re.sub('^(# sent_id = )(\d+\.1)$', r'\1s\2', line.strip())
+            elif (name == 'text'):
+                output_map['texts'].append(value)
+write_paragraph(output_file, output_map)
+
+input_file.close()
+output_file.close()
\ No newline at end of file