You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

92 lines
4.0 KiB

import argparse
import csv
import codecs
import re
import lxml.etree as lxml
from collections import defaultdict
# Matches a whole id of the form "s<digits>.<digits>" and captures the first
# run of digits; get_id_counter() converts that captured group to an int.
MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$')
def xpath_find(element, expression):
    """Evaluate ``expression`` on ``element`` with the ``tei`` prefix bound.

    The ``tei`` prefix is mapped to the TEI namespace URI so callers can write
    expressions such as ``.//tei:s``. Returns whatever ``element.xpath``
    returns for the expression.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)
def get_xml_id(element):
    """Return the value ``element.get`` yields for the ``xml:id`` attribute.

    The attribute is looked up by its fully qualified name in the XML
    namespace.
    """
    xml_id_attribute = '{http://www.w3.org/XML/1998/namespace}id'
    return element.get(xml_id_attribute)
def get_id_counter(xml_id):
    """Return the first number of an id shaped like ``s<number>.<number>``.

    Args:
        xml_id: The id string to parse (for example ``'s12.3'``).

    Returns:
        The integer captured by the first group of ``MWE_INDEX_PATTERN``
        (``12`` for ``'s12.3'``).

    Raises:
        ValueError: If ``xml_id`` is ``None`` or does not match
            ``MWE_INDEX_PATTERN``.
    """
    match = None if xml_id is None else MWE_INDEX_PATTERN.search(xml_id)
    if match is None:
        # Fail with the offending value instead of an opaque AttributeError
        # from calling .group() on None.
        raise ValueError('Unexpected id format: {!r}'.format(xml_id))
    return int(match.group(1))
def get_mwe_components_map(input_file_name):
    """Map each ``tei:s`` element's id counter to its token count.

    Parses ``input_file_name`` as XML, visits every ``tei:s`` descendant of
    the root, and counts that element's direct ``tei:w`` and ``tei:pc``
    children. The key is the number extracted from the element's ``xml:id``
    by ``get_id_counter``. If several elements share a key, the last one wins.
    """
    document_root = lxml.parse(input_file_name).getroot()
    return {
        get_id_counter(get_xml_id(sentence)): len(xpath_find(sentence, 'tei:w|tei:pc'))
        for sentence in xpath_find(document_root, './/tei:s')
    }
def get_structure_components_map(structure_file_name):
    """Map each selected structure id to its number of ``core`` components.

    Parses ``structure_file_name`` as XML and selects the
    ``syntactic_structure`` elements whose ``type`` is ``collocation`` or
    ``formal``. For each one, the integer value of its ``id`` attribute is
    mapped to the count of ``components/component`` elements with
    ``type="core"``. Structures of any other type are not included.
    """
    document = lxml.parse(structure_file_name)
    selected = document.xpath('syntactic_structure[@type="collocation" or @type="formal"]')
    return {
        int(node.get('id')): len(node.xpath('components/component[@type="core"]'))
        for node in selected
    }
def get_mwe_index_map(mapper_file_name):
    """Read the mapper file and group sentence id counters by collocation id.

    The file is read as tab-separated text with a header row providing the
    ``Collocation_id`` and ``Sentence_id`` columns.

    Args:
        mapper_file_name: Path of the tab-separated mapper file.

    Returns:
        A ``defaultdict(set)`` mapping each integer ``Collocation_id`` to the
        set of counters extracted from its ``Sentence_id`` values by
        ``get_id_counter``.
    """
    mwe_index_map = defaultdict(set)
    # The built-in open() is what codecs.open() delegates to when no encoding
    # is given; the with-block closes the file even if a row fails to parse.
    with open(mapper_file_name, 'r') as mapper_file:
        for row in csv.DictReader(mapper_file, delimiter='\t'):
            collocation_id = int(row['Collocation_id'])
            mwe_index_map[collocation_id].add(get_id_counter(row['Sentence_id']))
    return mwe_index_map
def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map):
    """Collect, per MWE index, the structure ids whose core count matches.

    The file is read as tab-separated text with a header row providing the
    ``Structure_ID`` and ``Collocation_ID`` columns. For every row, each index
    registered for the row's collocation id in ``mwe_index_map`` is given the
    row's structure id when the index's token count equals the structure's
    core-component count.

    Args:
        csv_file_name: Path of the tab-separated file.
        mwe_components_map: Mapping of MWE index to token count.
        mwe_index_map: Mapping of collocation id to an iterable of MWE
            indexes. It is only read; collocation ids missing from it are
            treated as having no indexes.
        structure_components_map: Mapping of structure id to core count.

    Returns:
        A ``defaultdict(set)`` mapping MWE index to the set of matching
        structure ids.

    Raises:
        KeyError: If a referenced index or structure id is missing from
            ``mwe_components_map`` or ``structure_components_map``.
    """
    mwe_structure_map = defaultdict(set)
    # The with-block closes the file even if a row fails to parse.
    with open(csv_file_name, 'r') as csv_file:
        for row in csv.DictReader(csv_file, delimiter='\t'):
            structure_id = int(row['Structure_ID'])
            collocation_id = int(row['Collocation_ID'])
            # .get() rather than indexing: indexing a defaultdict would insert
            # an empty set into the caller's mapping for every unknown id.
            for index in mwe_index_map.get(collocation_id, ()):
                if mwe_components_map[index] == structure_components_map[structure_id]:
                    mwe_structure_map[index].add(structure_id)
    return mwe_structure_map
def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name):
    """Write a copy of the input XML with ``structure_id`` attributes added.

    Every ``tei:s`` element whose id counter is a key of ``mwe_structure_map``
    receives a ``structure_id`` attribute holding the first element obtained
    from iterating its set of candidate ids. When there is more than one
    candidate, a line naming the MWE index, its token text and all candidates
    is printed. The tree is then written to ``output_file_name`` as
    pretty-printed UTF-8.
    """
    document = lxml.parse(input_file_name)
    for sentence in xpath_find(document.getroot(), './/tei:s'):
        index = get_id_counter(get_xml_id(sentence))
        if index not in mwe_structure_map:
            continue
        candidates = mwe_structure_map[index]
        sentence.set('structure_id', str(list(candidates)[0]))
        if len(candidates) <= 1:
            continue
        # Rebuild the token text: tokens flagged join="right" are glued to
        # the following token, all others are followed by a space.
        pieces = []
        for token in xpath_find(sentence, 'tei:w|tei:pc'):
            if token.get('join') == 'right':
                pieces.append(token.text)
            else:
                pieces.append(token.text + ' ')
        mwe_string = ''.join(pieces).strip()
        message = 'MWE #' + str(index) + ': ' + mwe_string
        message += ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(candidates) + ')'
        print(message)
    document.write(output_file_name, encoding='UTF-8', pretty_print=True)
def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name):
    """Run the whole pipeline: build the lookup maps, then annotate the input.

    The structure, input and mapper files are read (in that order) into
    lookup maps, the CSV file is used to match structures to MWE indexes, and
    the annotated XML is written to ``output_file_name``.
    """
    core_counts = get_structure_components_map(structure_file_name)
    token_counts = get_mwe_components_map(input_file_name)
    indexes_by_collocation = get_mwe_index_map(mapper_file_name)
    structures_by_index = get_mwe_structure_map(
        csv_file_name,
        token_counts,
        indexes_by_collocation,
        core_counts,
    )
    insert_structure_ids(input_file_name, structures_by_index, output_file_name)
if __name__ == '__main__':
    # Command-line entry point. assign() needs five file names, so every one
    # of them is exposed as a required option; previously the structure and
    # mapper files had no option and the call passed too few arguments.
    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
    arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI')
    arg_parser.add_argument('-structures', type=str, required=True, help='Structures XML file')
    arg_parser.add_argument('-csv', type=str, required=True, help='CSV file')
    arg_parser.add_argument('-mapper', type=str, required=True, help='Mapper file')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output TEI')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.structures, arguments.csv, arguments.mapper, arguments.outfile)