diff --git a/scripts/merge_dictionaries.py b/scripts/merge_dictionaries.py
index 085f69d..b6e9117 100644
--- a/scripts/merge_dictionaries.py
+++ b/scripts/merge_dictionaries.py
@@ -1,4 +1,5 @@
 import argparse
+import re
 import lxml.etree as lxml
 
 arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
@@ -14,11 +15,11 @@ def get_entries(input_file_name):
     return list(lxml.parse(input_file_name).getroot())
 
 entries = get_entries(single_file_name) + get_entries(multiple_file_name)
-sort(entries, key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('xml:id')).group(1)))
+entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
 
 root = lxml.Element('dictionary')
 for entry in entries:
-    del entry.attrib['xml:id']
+    del entry.attrib['sid']
     root.append(entry)
 tree = lxml.ElementTree(root)
 tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
diff --git a/scripts/pipeline2.py b/scripts/pipeline2.py
index 6c1f730..37e0292 100644
--- a/scripts/pipeline2.py
+++ b/scripts/pipeline2.py
@@ -43,7 +43,7 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure
 
 def split_tei_input(input_file_name, single_file_name, multiple_file_name):
     print('Splitting TEI input file ...')
-    split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '--input', input_file_name, '--single', single_file_name, '--multiple', multiple_file_name])
+    split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '-infile', input_file_name, '-single', single_file_name, '-multiple', multiple_file_name])
     os.system(split_command)
 
 def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
@@ -70,12 +70,12 @@ def run_structure_creation(input_file_name, tei_file_name, output_file_name):
 
 def run_dictionary_conversion(tei_file_name, xml_file_name):
     print('Converting to dictionary xml format ...')
-    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name, '--keepids', 'true'])
+    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name])
     os.system(convert_command)
 
-def merge_dictionaries(single_file_name, multiple_file_name, joint_file_name):
+def merge_dictionaries(single_file_name, multiple_file_name, output_file_name):
     print('Merging dictionary files ...')
-    merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '--multiple', multiple_file_name, '--joint', joint_file_name])
+    merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '-multiple', multiple_file_name, '-outfile', output_file_name])
     os.system(merge_command)
 
 def validate_dictionary(dictionary_file_name):
diff --git a/scripts/split_tei.py b/scripts/split_tei.py
index 8c3211d..d42599e 100644
--- a/scripts/split_tei.py
+++ b/scripts/split_tei.py
@@ -19,7 +19,7 @@ def count_tokens(paragraph):
 
 tree = lxml.parse(input_file_name)
 root = tree.getroot()
-paragraphs = xpath_find('.//tei:p')
+paragraphs = xpath_find(root, './/tei:p')
 for paragraph in paragraphs:
     if (count_tokens(paragraph) > 1):
         paragraph.getparent().remove(paragraph)
@@ -27,7 +27,7 @@ tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
 
 tree = lxml.parse(input_file_name)
 root = tree.getroot()
-paragraphs = xpath_find('.//tei:p')
+paragraphs = xpath_find(root, './/tei:p')
 for paragraph in paragraphs:
     if (count_tokens(paragraph) == 1):
         paragraph.getparent().remove(paragraph)