|
|
|
@ -3,6 +3,9 @@ import csv
|
|
|
|
|
import codecs
|
|
|
|
|
import re
|
|
|
|
|
import lxml.etree as lxml
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
# Matches ids of the form ``s<digits>.<digits>`` (anchored at both ends) and
# captures the first run of digits in group 1.
MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$')
|
|
|
|
|
|
|
|
|
|
def xpath_find(element, expression):
    """Evaluate an XPath expression on *element* with the ``tei`` prefix bound.

    The ``tei`` prefix is mapped to ``http://www.tei-c.org/ns/1.0`` so that
    expressions can be written as ``tei:s``, ``tei:w`` and so on.

    Returns whatever ``element.xpath`` returns for the expression.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)
|
|
|
|
@ -11,41 +14,72 @@ def get_xml_id(element):
|
|
|
|
|
return element.get('{http://www.w3.org/XML/1998/namespace}id')
|
|
|
|
|
|
|
|
|
|
def get_id_counter(xml_id):
    """Return the leading counter of an id as an ``int``.

    Accepted ids have the form ``s<N>.<M>`` or ``s<N>.<M>.<K>``; the value of
    ``<N>`` is returned.

    Raises:
        ValueError: if *xml_id* does not have one of the accepted forms.
    """
    match = re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id)
    if match is None:
        # Fail with a message that names the offending id instead of an
        # AttributeError on ``None``.
        raise ValueError('Unexpected id format: {!r}'.format(xml_id))
    return int(match.group(1))
|
|
|
|
|
|
|
|
|
|
def get_mwe_components_map(input_file_name):
    """Count the ``tei:w``/``tei:pc`` children of every ``tei:s`` element.

    The file is parsed with lxml. Each ``tei:s`` element found anywhere below
    the root contributes one entry: the key is the counter extracted from its
    ``xml:id`` (via ``get_id_counter``) and the value is the number of direct
    ``tei:w`` and ``tei:pc`` children. If two elements yield the same counter,
    the later one wins.

    Returns:
        dict mapping counter (int) to token count (int).
    """
    document_root = lxml.parse(input_file_name).getroot()
    return {
        get_id_counter(get_xml_id(sentence)): len(xpath_find(sentence, 'tei:w|tei:pc'))
        for sentence in xpath_find(document_root, './/tei:s')
    }
|
|
|
|
|
|
|
|
|
|
def get_structure_components_map(structure_file_name):
    """Count the core components of each collocation/formal structure.

    Parses the structure file with lxml and selects the
    ``syntactic_structure`` elements whose ``type`` attribute is
    ``collocation`` or ``formal``. For each one, the integer value of its
    ``id`` attribute is mapped to the number of
    ``components/component[@type="core"]`` elements it contains.

    Returns:
        dict mapping structure id (int) to core component count (int).
    """
    tree = lxml.parse(structure_file_name)
    selected = tree.xpath('syntactic_structure[@type="collocation" or @type="formal"]')
    return {
        int(node.get('id')): len(node.xpath('components/component[@type="core"]'))
        for node in selected
    }
|
|
|
|
|
|
|
|
|
|
def get_mwe_index_map(mapper_file_name):
    """Group sentence counters by collocation id from a tab-separated file.

    Every row must provide a ``Collocation_id`` column (parsed as ``int``)
    and a ``Sentence_id`` column (reduced to its counter with
    ``get_id_counter``).

    Returns:
        ``defaultdict(set)`` mapping collocation id (int) to the set of
        sentence counters (int) listed for it.
    """
    mwe_index_map = defaultdict(set)
    # The context manager closes the file even if a row fails to parse.
    with codecs.open(mapper_file_name, 'r') as mapper_file:
        for row in csv.DictReader(mapper_file, delimiter='\t'):
            collocation_id = int(row['Collocation_id'])
            index = get_id_counter(row['Sentence_id'])
            mwe_index_map[collocation_id].add(index)
    return mwe_index_map
|
|
|
|
|
|
|
|
|
|
def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map):
    """Find, for each MWE counter, the structures whose core count matches.

    The comma-separated file must provide ``Structure_ID`` and
    ``Collocation_ID`` columns, both parsed as ``int``. For every row, each
    counter registered for the row's collocation in *mwe_index_map* is
    compared: when its token count in *mwe_components_map* equals the
    structure's core count in *structure_components_map*, the structure id is
    recorded for that counter.

    Args:
        csv_file_name: path of the comma-separated collocation file.
        mwe_components_map: counter -> token count.
        mwe_index_map: collocation id -> iterable of counters.
        structure_components_map: structure id -> core component count.

    Returns:
        ``defaultdict(set)`` mapping counter (int) to matching structure ids.

    Raises:
        KeyError: if a counter or structure id that needs to be compared is
            missing from its map.
    """
    mwe_structure_map = defaultdict(set)
    # The context manager closes the file even if a row fails to parse.
    with codecs.open(csv_file_name, 'r') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            structure_id = int(row['Structure_ID'])
            collocation_id = int(row['Collocation_ID'])
            # ``.get`` keeps a caller-supplied defaultdict from growing an
            # empty entry for every collocation that has no counters.
            for index in mwe_index_map.get(collocation_id, ()):
                if mwe_components_map[index] == structure_components_map[structure_id]:
                    mwe_structure_map[index].add(structure_id)
    return mwe_structure_map
|
|
|
|
|
|
|
|
|
|
def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name):
    """Write a copy of the TEI file with ``structure_id`` set on matched MWEs.

    Each ``tei:s`` element whose ``xml:id`` counter has a non-empty entry in
    *mwe_structure_map* receives a ``structure_id`` attribute. When several
    structure ids are available, one of them is still written and a
    ``MULTIPLE_COLLOCATION_STRUCTURES`` line is printed for manual review.

    Args:
        input_file_name: path of the TEI XML file to read.
        mwe_structure_map: counter (int) -> collection of structure ids.
        output_file_name: path the annotated XML is written to (UTF-8,
            pretty-printed).
    """
    tree = lxml.parse(input_file_name)
    root = tree.getroot()
    for mwe_xml in xpath_find(root, './/tei:s'):
        index = get_id_counter(get_xml_id(mwe_xml))
        structure_ids = mwe_structure_map.get(index)
        if not structure_ids:
            # No entry, or an empty one: nothing to annotate.
            continue
        # NOTE(review): with several candidates the first one in iteration
        # order is used, which is arbitrary for a set - confirm this is the
        # intended tie-break.
        mwe_xml.set('structure_id', str(list(structure_ids)[0]))
        if len(structure_ids) > 1:
            tokens = xpath_find(mwe_xml, 'tei:w|tei:pc')
            # Tokens flagged join="right" are glued to the following token;
            # an element without text contributes an empty string.
            mwe_string = ''.join(
                (token.text or '') if token.get('join') == 'right' else (token.text or '') + ' '
                for token in tokens
            ).strip()
            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
|
|
|
|
|
|
|
|
def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name):
    """Run the whole pipeline: build the lookup maps, then annotate the TEI file.

    The structure file, the TEI input and the mapper file are read (in that
    order) into their lookup maps, the collocation CSV is matched against
    them, and the resulting counter -> structure ids map is used to write
    the annotated copy of *input_file_name* to *output_file_name*.
    """
    # Keyword arguments are evaluated left to right, so the files are read in
    # the order structure file, TEI input, mapper file.
    mwe_structure_map = get_mwe_structure_map(
        csv_file_name,
        structure_components_map=get_structure_components_map(structure_file_name),
        mwe_components_map=get_mwe_components_map(input_file_name),
        mwe_index_map=get_mwe_index_map(mapper_file_name),
    )
    insert_structure_ids(input_file_name, mwe_structure_map, output_file_name)
|
|
|
|
|
|
|
|
|
|
if (__name__ == '__main__'):
|
|
|
|
|
|
|
|
|
|