"""Assign collocation structure ids to parsed lexical units in a TEI file."""

import argparse
import codecs
import csv
import re
from collections import defaultdict

import lxml.etree as lxml

# Sentence ids look like "s<counter>.<n>"; only the counter is captured.
MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$')


def xpath_find(element, expression):
    """Evaluate an XPath expression on `element` with the `tei` prefix bound."""
    return element.xpath(expression, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})


def get_xml_id(element):
    """Return the `xml:id` attribute of `element`, or None when it is absent."""
    return element.get('{http://www.w3.org/XML/1998/namespace}id')


def get_id_counter(xml_id):
    """Return the integer counter embedded in an id of the form "s<counter>.<n>".

    Raises:
        ValueError: if `xml_id` is missing or does not match MWE_INDEX_PATTERN.
    """
    match = MWE_INDEX_PATTERN.search(xml_id) if xml_id is not None else None
    if match is None:
        raise ValueError('Unexpected sentence id: {!r}'.format(xml_id))
    return int(match.group(1))


def get_mwe_components_map(input_file_name):
    """Map each sentence counter in the TEI file to its token count.

    Every `tei:s` element is one entry; its tokens are its direct `tei:w`
    and `tei:pc` children.
    """
    root = lxml.parse(input_file_name).getroot()
    mwe_components_map = {}
    for mwe_xml in xpath_find(root, './/tei:s'):
        index = get_id_counter(get_xml_id(mwe_xml))
        mwe_components_map[index] = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
    return mwe_components_map


def get_structure_components_map(structure_file_name):
    """Map each structure id to its number of core components.

    Only `syntactic_structure` elements whose type is "collocation" or
    "formal" are included.
    """
    tree = lxml.parse(structure_file_name)
    structures = tree.xpath('syntactic_structure[@type="collocation" or @type="formal"]')
    structure_components_map = {}
    for structure in structures:
        structure_id = int(structure.get('id'))
        core_count = len(structure.xpath('components/component[@type="core"]'))
        structure_components_map[structure_id] = core_count
    return structure_components_map


def get_mwe_index_map(mapper_file_name):
    """Read the tab-separated mapper file into {collocation id: {sentence counters}}.

    The file must have `Collocation_id` and `Sentence_id` columns.
    """
    mwe_index_map = defaultdict(set)
    # The context manager closes the file even if a row fails to parse.
    with open(mapper_file_name, 'r', newline='') as mapper_file:
        reader = csv.DictReader(mapper_file, delimiter='\t')
        for row in reader:
            collocation_id = int(row['Collocation_id'])
            index = get_id_counter(row['Sentence_id'])
            mwe_index_map[collocation_id].add(index)
    return mwe_index_map


def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map):
    """Build {sentence counter: {structure ids}} from the tab-separated CSV file.

    The file must have `Structure_ID` and `Collocation_ID` columns. A structure
    is attached to a sentence only when the sentence's token count equals the
    structure's core component count.

    Raises:
        KeyError: if a mapped sentence counter is missing from
            `mwe_components_map`, or a structure id that has mapped sentences
            is missing from `structure_components_map`.
    """
    mwe_structure_map = defaultdict(set)
    with open(csv_file_name, 'r', newline='') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for row in reader:
            structure_id = int(row['Structure_ID'])
            collocation_id = int(row['Collocation_ID'])
            # .get() avoids inserting empty sets into the caller's defaultdict.
            for index in mwe_index_map.get(collocation_id, ()):
                if mwe_components_map[index] == structure_components_map[structure_id]:
                    mwe_structure_map[index].add(structure_id)
    return mwe_structure_map


def _get_mwe_string(mwe_xml):
    """Join the token texts of a sentence, honouring `join="right"` (no space after)."""
    parts = []
    for token in xpath_find(mwe_xml, 'tei:w|tei:pc'):
        text = token.text or ''  # tolerate empty token elements
        parts.append(text if token.get('join') == 'right' else text + ' ')
    return ''.join(parts).strip()


def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name):
    """Write a copy of the TEI file with `structure_id` set on matched sentences.

    Sentences with more than one candidate structure are reported on stdout.
    """
    tree = lxml.parse(input_file_name)
    root = tree.getroot()
    for mwe_xml in xpath_find(root, './/tei:s'):
        index = get_id_counter(get_xml_id(mwe_xml))
        if index not in mwe_structure_map:
            continue
        structure_ids = mwe_structure_map[index]
        # With several candidates the first in set iteration order is used;
        # the ambiguity is reported below.
        mwe_xml.set('structure_id', str(next(iter(structure_ids))))
        if len(structure_ids) > 1:
            print('MWE #{}: {}: MULTIPLE_COLLOCATION_STRUCTURES ({})'.format(
                index, _get_mwe_string(mwe_xml), structure_ids))
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name):
    """Run the whole pipeline: load the maps, match structures, write the output TEI."""
    structure_components_map = get_structure_components_map(structure_file_name)
    mwe_components_map = get_mwe_components_map(input_file_name)
    mwe_index_map = get_mwe_index_map(mapper_file_name)
    mwe_structure_map = get_mwe_structure_map(
        csv_file_name, mwe_components_map, mwe_index_map, structure_components_map)
    insert_structure_ids(input_file_name, mwe_structure_map, output_file_name)


def main():
    """Parse the command line and run `assign`."""
    arg_parser = argparse.ArgumentParser(
        description='Assign collocation structure ids to parsed lexical units.')
    arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI')
    # -structures and -mapper were missing, although assign() needs all five paths.
    arg_parser.add_argument('-structures', type=str, required=True,
                            help='Syntactic structures XML file')
    arg_parser.add_argument('-csv', type=str, required=True, help='CSV file')
    arg_parser.add_argument('-mapper', type=str, required=True,
                            help='Tab-separated collocation-to-sentence mapper file')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output TEI')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.structures, arguments.csv,
           arguments.mapper, arguments.outfile)


if __name__ == '__main__':
    main()