Redmine #2198: Adapted pipeline to limit wani.py to collocation structures
parent
3c38cdbcae
commit
616e7823df
@ -1,27 +0,0 @@
|
||||
import argparse
|
||||
import re
|
||||
import lxml.etree as lxml
|
||||
|
||||
def get_entries(input_file_name):
    """Return the child nodes of the XML root of ``input_file_name`` as a list.

    The file is parsed with ``lxml.parse`` (``lxml`` is this module's alias
    for ``lxml.etree``) and the root element's direct children are collected
    in iteration order.
    """
    document = lxml.parse(input_file_name)
    root = document.getroot()
    return list(root)
|
||||
|
||||
|
||||
def merge(single_file_name, multiple_file_name, output_file_name):
    """Merge the entries of two XML files into one sorted dictionary file.

    Entries are read from both inputs with ``get_entries``, ordered by the
    integer ``N`` of their ``sid`` attribute (which must look like ``sN.M``),
    stripped of that attribute, and written under a new ``dictionary`` root.

    Args:
        single_file_name: Path of the first input XML file.
        multiple_file_name: Path of the second input XML file.
        output_file_name: Path the merged XML is written to (UTF-8,
            pretty-printed).

    Raises:
        TypeError: If an entry has no ``sid`` attribute.
        AttributeError: If an entry's ``sid`` does not match ``sN.M``.
    """
    # Raw string: in a plain literal '\d' and '\.' are invalid escape
    # sequences, which recent Python versions warn about.
    sid_pattern = re.compile(r'^s(\d+)\.\d+$')

    def _leading_sid_number(entry):
        return int(sid_pattern.search(entry.get('sid')).group(1))

    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # list.sort is stable, so entries sharing the same number keep the
    # order in which they were read (first file before second file).
    entries.sort(key=_leading_sid_number)

    root = lxml.Element('dictionary')
    for entry in entries:
        # The sid attribute is only needed for ordering; drop it from the output.
        del entry.attrib['sid']
        root.append(entry)
    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: two input dictionaries and one output path,
    # all mandatory. The description states what this script does (merging).
    arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries into one dictionary.')
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()
    merge(arguments.single, arguments.multiple, arguments.outfile)
|
@ -1,38 +0,0 @@
|
||||
import argparse
|
||||
import lxml.etree as lxml
|
||||
|
||||
|
||||
def xpath_find(element, expression):
    """Evaluate XPath ``expression`` on ``element`` with the ``tei`` prefix bound.

    The prefix ``tei`` is mapped to ``http://www.tei-c.org/ns/1.0``; whatever
    ``element.xpath`` returns is handed back unchanged.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)
|
||||
|
||||
|
||||
def count_tokens(paragraph):
    """Return how many ``tei:w`` and ``tei:pc`` descendants ``paragraph`` has."""
    token_nodes = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(token_nodes)
|
||||
|
||||
|
||||
def split(input_file_name, single_file_name, multiple_file_name):
    """Write two filtered copies of a TEI file, split by tokens per paragraph.

    The copy written to ``single_file_name`` drops every ``tei:p`` with more
    than one token; the copy written to ``multiple_file_name`` drops every
    ``tei:p`` with exactly one token. Tokens are counted by ``count_tokens``.
    A paragraph with no tokens is dropped by neither rule, so it stays in
    both outputs.

    Args:
        input_file_name: Path of the TEI file to read.
        single_file_name: Output path for the first filtered copy.
        multiple_file_name: Output path for the second filtered copy.
    """
    passes = (
        (single_file_name, lambda token_count: token_count > 1),
        (multiple_file_name, lambda token_count: token_count == 1),
    )
    for target_file_name, should_drop in passes:
        # Parse afresh for each output so removals made for one copy
        # never leak into the other.
        document = lxml.parse(input_file_name)
        for paragraph in xpath_find(document.getroot(), './/tei:p'):
            if should_drop(count_tokens(paragraph)):
                paragraph.getparent().remove(paragraph)
        document.write(target_file_name, encoding='UTF-8', pretty_print=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: one input TEI file and two output paths,
    # all of them mandatory string options.
    parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
    for flag, help_text in (
        ('-infile', 'Input TEI file'),
        ('-single', 'Output single token TEI file'),
        ('-multiple', 'Output multiple token TEI file'),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    options = parser.parse_args()
    split(options.infile, options.single, options.multiple)
|
Loading…
Reference in new issue