Redmine #1835: turned pipeline2 scripts into modules
parent
5395d8def0
commit
f5d4a009ea
@ -0,0 +1,27 @@
|
|||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import lxml.etree as lxml
|
||||||
|
|
||||||
|
def get_entries(input_file_name):
    """Parse an XML dictionary file and return its top-level entry elements.

    The file's root element is iterated directly, so every immediate child
    (each dictionary entry) is collected into a list.
    """
    tree = lxml.parse(input_file_name)
    return [entry for entry in tree.getroot()]
|
||||||
|
|
||||||
|
|
||||||
|
def merge(single_file_name, multiple_file_name, output_file_name):
    """Merge two token dictionaries into one ordered output dictionary.

    Entries from both input files are sorted by the sentence number embedded
    in their 'sid' attribute (of the form ``s<sentence>.<token>``), the 'sid'
    attribute is dropped, and the result is written under a single
    <dictionary> root element.

    :param single_file_name: input single token dictionary file
    :param multiple_file_name: input multiple token dictionary file
    :param output_file_name: output merged dictionary file
    """
    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # Raw string for the regex (avoids invalid-escape warnings on '\d') and
    # compiled once instead of being re-searched from scratch per entry.
    sid_pattern = re.compile(r'^s(\d+)\.\d+$')
    entries.sort(
        key=lambda entry: int(sid_pattern.search(entry.get('sid')).group(1)))

    root = lxml.Element('dictionary')
    for entry in entries:
        # 'sid' was only needed for ordering; it is not part of the output.
        del entry.attrib['sid']
        root.append(entry)
    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # NOTE: the description previously said "Split input TEI into single and
    # multiple token units." — copy-pasted from the split script; this is the
    # merge entry point.
    arg_parser = argparse.ArgumentParser(
        description='Merge single and multiple token dictionaries.')
    arg_parser.add_argument('-single', type=str, required=True,
                            help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True,
                            help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True,
                            help='Output merged dictionary')
    arguments = arg_parser.parse_args()
    merge(arguments.single, arguments.multiple, arguments.outfile)
|
@ -0,0 +1,38 @@
|
|||||||
|
import argparse
|
||||||
|
import lxml.etree as lxml
|
||||||
|
|
||||||
|
|
||||||
|
def xpath_find(element, expression):
    """Run an XPath query on *element* with the TEI namespace bound to 'tei'."""
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=namespaces)
|
||||||
|
|
||||||
|
|
||||||
|
def count_tokens(paragraph):
    """Return how many word (tei:w) and punctuation (tei:pc) tokens the
    paragraph contains, at any depth."""
    tokens = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_paragraph_subset(input_file_name, output_file_name, keep):
    """Write a copy of the input TEI keeping only paragraphs whose token
    count satisfies *keep*; all other tei:p elements are detached."""
    tree = lxml.parse(input_file_name)
    root = tree.getroot()
    for paragraph in xpath_find(root, './/tei:p'):
        if not keep(count_tokens(paragraph)):
            paragraph.getparent().remove(paragraph)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


def split(input_file_name, single_file_name, multiple_file_name):
    """Split a TEI file into single-token and multiple-token paragraph files.

    The input is parsed twice so each output starts from a pristine tree.
    Paragraphs with at most one token go to *single_file_name* (this keeps
    zero-token paragraphs, matching the original ``> 1`` removal test);
    paragraphs with any count other than exactly one go to
    *multiple_file_name*.

    :param input_file_name: input TEI file
    :param single_file_name: output single token TEI file
    :param multiple_file_name: output multiple token TEI file
    """
    # The two passes previously duplicated the whole parse/filter/write
    # pipeline verbatim; only the predicate differs.
    _write_paragraph_subset(input_file_name, single_file_name,
                            lambda count: count <= 1)
    _write_paragraph_subset(input_file_name, multiple_file_name,
                            lambda count: count != 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Split input TEI into single and multiple token units.')
    # All three options share the same type/required settings.
    for flag, description in [
            ('-infile', 'Input TEI file'),
            ('-single', 'Output single token TEI file'),
            ('-multiple', 'Output multiple token TEI file')]:
        parser.add_argument(flag, type=str, required=True, help=description)
    args = parser.parse_args()
    split(args.infile, args.single, args.multiple)
|
@ -1,25 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import re
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
|
||||||
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
|
|
||||||
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
|
|
||||||
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
|
|
||||||
arguments = arg_parser.parse_args()
|
|
||||||
single_file_name = arguments.single
|
|
||||||
multiple_file_name = arguments.multiple
|
|
||||||
output_file_name = arguments.outfile
|
|
||||||
|
|
||||||
def get_entries(input_file_name):
|
|
||||||
return list(lxml.parse(input_file_name).getroot())
|
|
||||||
|
|
||||||
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
|
|
||||||
entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
|
|
||||||
|
|
||||||
root = lxml.Element('dictionary')
|
|
||||||
for entry in entries:
|
|
||||||
del entry.attrib['sid']
|
|
||||||
root.append(entry)
|
|
||||||
tree = lxml.ElementTree(root)
|
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
@ -1,34 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
|
||||||
arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
|
|
||||||
arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
|
|
||||||
arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
|
|
||||||
arguments = arg_parser.parse_args()
|
|
||||||
input_file_name = arguments.infile
|
|
||||||
single_file_name = arguments.single
|
|
||||||
multiple_file_name = arguments.multiple
|
|
||||||
|
|
||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
|
||||||
def xpath_find(element,expression):
|
|
||||||
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
|
||||||
|
|
||||||
def count_tokens(paragraph):
|
|
||||||
return len(xpath_find(paragraph, './/tei:w|.//tei:pc'))
|
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
|
||||||
root = tree.getroot()
|
|
||||||
paragraphs = xpath_find(root, './/tei:p')
|
|
||||||
for paragraph in paragraphs:
|
|
||||||
if (count_tokens(paragraph) > 1):
|
|
||||||
paragraph.getparent().remove(paragraph)
|
|
||||||
tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
|
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
|
||||||
root = tree.getroot()
|
|
||||||
paragraphs = xpath_find(root, './/tei:p')
|
|
||||||
for paragraph in paragraphs:
|
|
||||||
if (count_tokens(paragraph) == 1):
|
|
||||||
paragraph.getparent().remove(paragraph)
|
|
||||||
tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True)
|
|
Loading…
Reference in new issue