structure_assignment/structure_assignment/assign_collocation_structur...

import argparse
import csv
import codecs
import re
import lxml.etree as lxml
from collections import defaultdict

# Matches a whole id of the form "s<digits>.<digits>" (e.g. "s12.3");
# group 1 captures the digits between the leading "s" and the dot.
MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$')

def xpath_find(element, expression):
    """Evaluate an XPath expression on ``element`` with the ``tei`` prefix bound.

    The prefix ``tei`` is mapped to ``http://www.tei-c.org/ns/1.0`` so that
    expressions can be written as ``tei:s``, ``tei:w`` and so on.

    Returns whatever ``element.xpath`` returns for the expression.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)

def get_xml_id(element):
    """Return the value of the ``xml:id`` attribute of ``element``.

    The attribute is looked up under its namespace-qualified name; the result
    is whatever ``element.get`` yields for that name.
    """
    xml_id_attribute = '{http://www.w3.org/XML/1998/namespace}id'
    return element.get(xml_id_attribute)

def get_id_counter(xml_id):
    """Extract the leading counter from an id such as ``s12.3`` as an int.

    The id is matched against ``MWE_INDEX_PATTERN`` and the first captured
    group is converted to ``int``. An id that does not match the pattern
    makes the ``.group`` call fail with ``AttributeError``, because the
    search result is ``None``.
    """
    match = MWE_INDEX_PATTERN.search(xml_id)
    counter_text = match.group(1)
    return int(counter_text)

def get_mwe_components_map(input_file_name):
    """Map each sentence counter in the TEI file to its token count.

    Every ``tei:s`` element below the document root contributes one entry:
    the key is the counter extracted from its ``xml:id`` and the value is the
    number of its direct ``tei:w`` and ``tei:pc`` children. If two sentences
    yield the same counter, the later one wins.
    """
    document_root = lxml.parse(input_file_name).getroot()
    return {
        get_id_counter(get_xml_id(sentence)): len(xpath_find(sentence, 'tei:w|tei:pc'))
        for sentence in xpath_find(document_root, './/tei:s')
    }

def get_structure_components_map(structure_file_name):
    """Map each structure id in the structures file to its core-component count.

    Only ``syntactic_structure`` elements whose ``type`` is ``collocation`` or
    ``formal`` are considered. The key is the element's ``id`` attribute as an
    int; the value is the number of ``components/component`` descendants with
    ``type="core"``.
    """
    document = lxml.parse(structure_file_name)
    selected = document.xpath('syntactic_structure[@type="collocation" or @type="formal"]')
    return {
        int(entry.get('id')): len(entry.xpath('components/component[@type="core"]'))
        for entry in selected
    }

def get_mwe_index_map(mapper_file_name):
    """Map each collocation id to the set of sentence counters it occurs in.

    The mapper file is read as tab-delimited text with a header row that
    provides the columns ``Collocation_id`` and ``Sentence_id``.

    Args:
        mapper_file_name: Path of the tab-delimited mapper file.

    Returns:
        A ``defaultdict(set)`` keyed by integer collocation id; each value is
        the set of counters extracted from the matching ``Sentence_id`` cells.

    Raises:
        KeyError: If a required column is missing from a row.
        ValueError: If a ``Collocation_id`` cell is not an integer.
    """
    mwe_index_map = defaultdict(set)
    # The context manager guarantees the handle is released even when a row
    # is malformed and one of the conversions below raises.
    with codecs.open(mapper_file_name, 'r') as mapper_file:
        reader = csv.DictReader(mapper_file, delimiter='\t')
        for row in reader:
            collocation_id = int(row['Collocation_id'])
            index = get_id_counter(row['Sentence_id'])
            mwe_index_map[collocation_id].add(index)
    return mwe_index_map

def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map):
    """Collect, per sentence counter, the structure ids whose size matches it.

    The CSV file is read as tab-delimited text with a header row providing
    ``Structure_ID`` and ``Collocation_ID``. For every row, each sentence
    counter mapped to the row's collocation receives the row's structure id,
    provided the sentence's token count equals the structure's core-component
    count.

    Args:
        csv_file_name: Path of the tab-delimited structure/collocation file.
        mwe_components_map: Sentence counter -> token count.
        mwe_index_map: Collocation id -> iterable of sentence counters.
            Collocations absent from this map are skipped; the map itself is
            not modified.
        structure_components_map: Structure id -> core-component count.

    Returns:
        A ``defaultdict(set)`` from sentence counter to matching structure ids.
        Only counters with at least one match are present.

    Raises:
        KeyError: If a required column is missing, or a counter / structure id
            that has to be compared is absent from its components map.
        ValueError: If an id cell is not an integer.
    """
    mwe_structure_map = defaultdict(set)
    # The context manager closes the file even if a row fails to convert.
    with codecs.open(csv_file_name, 'r') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for row in reader:
            structure_id = int(row['Structure_ID'])
            collocation_id = int(row['Collocation_ID'])
            # .get() instead of [] so that a defaultdict argument is not
            # silently grown with empty sets for unknown collocation ids.
            for index in mwe_index_map.get(collocation_id, ()):
                if mwe_components_map[index] == structure_components_map[structure_id]:
                    mwe_structure_map[index].add(structure_id)
    return mwe_structure_map

def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name):
    """Write a copy of the TEI file with ``structure_id`` set on matched sentences.

    Each ``tei:s`` element whose counter is a key of ``mwe_structure_map`` gets
    a ``structure_id`` attribute holding the first id yielded when iterating
    its set of candidates. When a sentence has more than one candidate, a
    ``MULTIPLE_COLLOCATION_STRUCTURES`` line with the sentence text and all
    candidates is printed. The tree is then serialized to
    ``output_file_name`` as pretty-printed UTF-8.
    """
    document = lxml.parse(input_file_name)
    for sentence in xpath_find(document.getroot(), './/tei:s'):
        sentence_index = get_id_counter(get_xml_id(sentence))
        if sentence_index not in mwe_structure_map:
            continue
        candidates = mwe_structure_map[sentence_index]
        sentence.set('structure_id', str(list(candidates)[0]))
        if len(candidates) <= 1:
            continue
        # Rebuild the surface text: a token marked join="right" is glued to
        # the following token, every other token is followed by a space.
        pieces = []
        for token in xpath_find(sentence, 'tei:w|tei:pc'):
            if token.get('join') == 'right':
                pieces.append(token.text)
            else:
                pieces.append(token.text + ' ')
        mwe_string = ''.join(pieces).strip()
        print('MWE #' + str(sentence_index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(candidates) + ')')
    document.write(output_file_name, encoding='UTF-8', pretty_print=True)

def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name):
    """Run the full pipeline: load the lookup maps, match, and write the output TEI.

    Loads the structure sizes, the sentence sizes and the collocation-to-
    sentence mapping (in that order), combines them with the CSV file into a
    sentence-to-structures map, and writes the annotated TEI document.
    """
    core_counts = get_structure_components_map(structure_file_name)
    token_counts = get_mwe_components_map(input_file_name)
    sentences_by_collocation = get_mwe_index_map(mapper_file_name)
    structures_by_sentence = get_mwe_structure_map(
        csv_file_name,
        token_counts,
        sentences_by_collocation,
        core_counts,
    )
    insert_structure_ids(input_file_name, structures_by_sentence, output_file_name)

if (__name__ == '__main__'):

    # Command-line entry point. assign() needs five paths, so each of them is
    # exposed as a required option; previously only three were declared and
    # the call read an undeclared attribute and passed too few arguments.
    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
    arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI')
    arg_parser.add_argument('-structures', type=str, required=True, help='Structures XML file')
    arg_parser.add_argument('-csv', type=str, required=True, help='CSV file')
    arg_parser.add_argument('-mapper', type=str, required=True, help='Tab-delimited file mapping Collocation_id to Sentence_id')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output TEI')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.structures, arguments.csv, arguments.mapper, arguments.outfile)