You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

58 lines
2.4 KiB

import argparse
import csv
import codecs
import re
import lxml.etree as lxml
def xpath_find(element, expression):
    """Evaluate an XPath expression on *element* with the ``tei`` prefix bound.

    The ``tei`` prefix is mapped to the TEI namespace URI so expressions can
    be written as ``tei:s``, ``tei:w`` and so on. Whatever ``element.xpath``
    returns is passed back unchanged.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    matches = element.xpath(expression, namespaces=tei_namespaces)
    return matches
def get_xml_id(element):
    """Return the value of the ``xml:id`` attribute of *element*.

    The attribute is looked up under its namespace-qualified name; the
    result is whatever ``element.get`` yields for that key.
    """
    qualified_name = '{http://www.w3.org/XML/1998/namespace}id'
    xml_id = element.get(qualified_name)
    return xml_id
def get_id_counter(xml_id):
    """Extract the leading numeric counter from an id string.

    The id must have the form ``s<counter>.<n>`` or ``s<counter>.<n>.<m>``;
    the ``<counter>`` part is returned as an ``int``.
    """
    # The pattern is anchored at both ends, so matching from the start of the
    # string is equivalent to searching it.
    matched = re.match(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id)
    counter_text = matched.group(1)
    return int(counter_text)
def assign(input_file_name, csv_file_name, output_file_name):
    """Annotate TEI sentences with collocation structure ids taken from a CSV.

    The tab-separated CSV is read with a header row. Each row contributes its
    ``Structure_ID`` value together with the number of non-empty
    ``*_Token_ID`` columns, keyed by the numeric counter parsed from the first
    token id (see ``get_id_counter``). Rows without any token id are skipped.

    Every ``tei:s`` element of the input document is then looked up by the
    counter parsed from its ``xml:id``. Structure ids whose component count
    equals the number of ``tei:w``/``tei:pc`` children are candidates:

    * exactly one candidate is stored in a ``structure_id`` attribute;
    * more than one candidate is reported on standard output and the element
      is left without the attribute;
    * no candidate leaves the element untouched.

    Args:
        input_file_name: Path of the TEI XML file to read.
        csv_file_name: Path of the tab-separated structure file.
        output_file_name: Path the annotated TEI XML is written to
            (UTF-8, pretty-printed).
    """
    mwe_map = {}
    # The context manager closes the file even when a row fails to parse.
    with codecs.open(csv_file_name, 'r') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for row in reader:
            structure_id = row['Structure_ID']
            # DictReader uses a None key for surplus cells and None values for
            # missing ones; neither can be sorted or measured, so filter both.
            token_columns = sorted(
                key for key in row
                if key is not None and key.endswith('_Token_ID')
            )
            token_ids = [row[key] for key in token_columns if row[key]]
            if not token_ids:
                # Nothing to derive the lexical-unit counter from.
                continue
            index = get_id_counter(token_ids[0])
            mwe_map.setdefault(index, set()).add((structure_id, len(token_ids)))

    xml_tree = lxml.parse(input_file_name)
    for mwe_xml in xpath_find(xml_tree.getroot(), './/tei:s'):
        index = get_id_counter(get_xml_id(mwe_xml))
        # Query the tokens once; they are needed for the count and the text.
        tokens = xpath_find(mwe_xml, 'tei:w|tei:pc')
        token_count = len(tokens)
        structure_ids = set(
            int(structure_id)
            for (structure_id, component_count) in mwe_map.get(index, ())
            if component_count == token_count
        )
        if len(structure_ids) > 1:
            # Rebuild the surface string only when it is actually reported.
            # Tokens flagged join="right" are glued to the following token;
            # an element without text contributes an empty string.
            mwe_string = ''.join(
                (token.text or '') if token.get('join') == 'right'
                else (token.text or '') + ' '
                for token in tokens
            ).strip()
            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
        elif len(structure_ids) == 1:
            mwe_xml.set('structure_id', str(next(iter(structure_ids))))

    xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
if __name__ == '__main__':
    # Command-line entry point: read the three file paths and run assign().
    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
    # All three paths are needed by assign(), so let argparse report a missing
    # one instead of failing later on a None path.
    arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI')
    arg_parser.add_argument('-csv', type=str, required=True, help='CSV file')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output TEI')
    arguments = arg_parser.parse_args()
    # argparse stores the value of '-csv' under the attribute name 'csv';
    # there is no 'structures' attribute on the parsed namespace.
    assign(arguments.infile, arguments.csv, arguments.outfile)