IssueID #1487: fixed basic bugs
This commit is contained in:
parent
364acf58bc
commit
aae67a3070
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import re
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
||||||
|
@ -14,11 +15,11 @@ def get_entries(input_file_name):
|
||||||
return list(lxml.parse(input_file_name).getroot())
|
return list(lxml.parse(input_file_name).getroot())
|
||||||
|
|
||||||
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
|
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
|
||||||
sort(entries, key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('xml:id')).group(1)))
|
entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
|
||||||
|
|
||||||
root = lxml.Element('dictionary')
|
root = lxml.Element('dictionary')
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
del entry.attrib['xml:id']
|
del entry.attrib['sid']
|
||||||
root.append(entry)
|
root.append(entry)
|
||||||
tree = lxml.ElementTree(root)
|
tree = lxml.ElementTree(root)
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
|
|
|
@ -43,7 +43,7 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure
|
||||||
|
|
||||||
def split_tei_input(input_file_name, single_file_name, multiple_file_name):
|
def split_tei_input(input_file_name, single_file_name, multiple_file_name):
|
||||||
print('Splitting TEI input file ...')
|
print('Splitting TEI input file ...')
|
||||||
split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '--input', input_file_name, '--single', single_file_name, '--multiple', multiple_file_name])
|
split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '-infile', input_file_name, '-single', single_file_name, '-multiple', multiple_file_name])
|
||||||
os.system(split_command)
|
os.system(split_command)
|
||||||
|
|
||||||
def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
|
def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
|
||||||
|
@ -70,12 +70,12 @@ def run_structure_creation(input_file_name, tei_file_name, output_file_name):
|
||||||
|
|
||||||
def run_dictionary_conversion(tei_file_name, xml_file_name):
|
def run_dictionary_conversion(tei_file_name, xml_file_name):
|
||||||
print('Converting to dictionary xml format ...')
|
print('Converting to dictionary xml format ...')
|
||||||
convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name, '--keepids', 'true'])
|
convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name])
|
||||||
os.system(convert_command)
|
os.system(convert_command)
|
||||||
|
|
||||||
def merge_dictionaries(single_file_name, multiple_file_name, joint_file_name):
|
def merge_dictionaries(single_file_name, multiple_file_name, output_file_name):
|
||||||
print('Merging dictionary files ...')
|
print('Merging dictionary files ...')
|
||||||
merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '--multiple', multiple_file_name, '--joint', joint_file_name])
|
merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '-multiple', multiple_file_name, '-outfile', output_file_name])
|
||||||
os.system(merge_command)
|
os.system(merge_command)
|
||||||
|
|
||||||
def validate_dictionary(dictionary_file_name):
|
def validate_dictionary(dictionary_file_name):
|
||||||
|
|
|
@ -19,7 +19,7 @@ def count_tokens(paragraph):
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
tree = lxml.parse(input_file_name)
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
paragraphs = xpath_find('.//tei:p')
|
paragraphs = xpath_find(root, './/tei:p')
|
||||||
for paragraph in paragraphs:
|
for paragraph in paragraphs:
|
||||||
if (count_tokens(paragraph) > 1):
|
if (count_tokens(paragraph) > 1):
|
||||||
paragraph.getparent().remove(paragraph)
|
paragraph.getparent().remove(paragraph)
|
||||||
|
@ -27,7 +27,7 @@ tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
tree = lxml.parse(input_file_name)
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
paragraphs = xpath_find('.//tei:p')
|
paragraphs = xpath_find(root, './/tei:p')
|
||||||
for paragraph in paragraphs:
|
for paragraph in paragraphs:
|
||||||
if (count_tokens(paragraph) == 1):
|
if (count_tokens(paragraph) == 1):
|
||||||
paragraph.getparent().remove(paragraph)
|
paragraph.getparent().remove(paragraph)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user