From 258a7a12a5347f42120609dd45d871687f5406cc Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Mon, 27 Feb 2023 16:26:29 +0100 Subject: [PATCH] Redmine #1461: used mapper to get indexes and split function into cleaner parts --- .../assign_collocation_structures.py | 82 +++++++++++++------ structure_assignment/constants.py | 1 + structure_assignment/pipeline.py | 8 +- 3 files changed, 64 insertions(+), 27 deletions(-) diff --git a/structure_assignment/assign_collocation_structures.py b/structure_assignment/assign_collocation_structures.py index 7c1e36f..c153d78 100644 --- a/structure_assignment/assign_collocation_structures.py +++ b/structure_assignment/assign_collocation_structures.py @@ -3,6 +3,9 @@ import csv import codecs import re import lxml.etree as lxml +from collections import defaultdict + +MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$') def xpath_find(element,expression): return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'}) @@ -11,41 +14,72 @@ def get_xml_id(element): return element.get('{http://www.w3.org/XML/1998/namespace}id') def get_id_counter(xml_id): - return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1)) + return int(MWE_INDEX_PATTERN.search(xml_id).group(1)) + +def get_mwe_components_map(input_file_name): + mwe_components_map = {} + root = lxml.parse(input_file_name).getroot() + mwes_xml = xpath_find(root, './/tei:s') + for mwe_xml in mwes_xml: + index = get_id_counter(get_xml_id(mwe_xml)) + token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc')) + mwe_components_map[index] = token_count + return mwe_components_map + +def get_structure_components_map(structure_file_name): + structure_components_map = {} + root = lxml.parse(structure_file_name) + structures = root.xpath('syntactic_structure[@type="collocation" or @type="formal"]') + for structure in structures: + structure_id = int(structure.get('id')) + core_count = len(structure.xpath('components/component[@type="core"]')) + structure_components_map[structure_id] = core_count + return structure_components_map -def assign(input_file_name, csv_file_name, output_file_name): +def get_mwe_index_map(mapper_file_name): + mwe_index_map = defaultdict(set) + mapper_file = codecs.open(mapper_file_name, 'r') + reader = csv.DictReader(mapper_file, delimiter='\t') + for row in reader: + collocation_id = int(row['Collocation_id']) + index = get_id_counter(row['Sentence_id']) + mwe_index_map[collocation_id].add(index) + mapper_file.close() + return mwe_index_map +def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map): + mwe_structure_map = defaultdict(set) csv_file = codecs.open(csv_file_name, 'r') reader = csv.DictReader(csv_file, delimiter=',') - mwe_map = {} for row in reader: - structure_id = row['Structure_ID'] - token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0] - index = get_id_counter(token_ids[0]) - component_count = len(token_ids) - if (index not in mwe_map): - mwe_map[index] = set() - mwe_map[index].add((structure_id, component_count)) + structure_id = int(row['Structure_ID']) + collocation_id = int(row['Collocation_ID']) + for index in mwe_index_map[collocation_id]: + if (mwe_components_map[index] == structure_components_map[structure_id]): + mwe_structure_map[index].add(structure_id) csv_file.close() + return mwe_structure_map - xml_tree = lxml.parse(input_file_name) - xml_root = xml_tree.getroot() - mwes_xml = xpath_find(xml_root, './/tei:s') +def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name): + tree = lxml.parse(input_file_name) + root = tree.getroot() + mwes_xml = xpath_find(root, './/tei:s') for mwe_xml in mwes_xml: index = get_id_counter(get_xml_id(mwe_xml)) - mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip() - token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc')) - structure_ids = set() - if (index in mwe_map): - for (structure_id, component_count) in mwe_map[index]: - if (component_count == token_count): - structure_ids.add(int(structure_id)) - if (len(structure_ids) > 1): - print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')') - elif (len(structure_ids) == 1): + if (index in mwe_structure_map): + structure_ids = mwe_structure_map[index] mwe_xml.set('structure_id', str(list(structure_ids)[0])) - xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True) + if (len(structure_ids) > 1): + mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip() + print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')') + tree.write(output_file_name, encoding='UTF-8', pretty_print=True) +def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name): + structure_components_map = get_structure_components_map(structure_file_name) + mwe_components_map = get_mwe_components_map(input_file_name) + mwe_index_map = get_mwe_index_map(mapper_file_name) + mwe_structure_map = get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map) + insert_structure_ids(input_file_name, mwe_structure_map, output_file_name) if (__name__ == '__main__'): diff --git a/structure_assignment/constants.py b/structure_assignment/constants.py index 57ad992..336aaa9 100644 --- a/structure_assignment/constants.py +++ b/structure_assignment/constants.py @@ -8,6 +8,7 @@ FILE_MAP = {'strings-list': 'strings.txt', 'tei-ids-collocation': 'tei_ids_collocations.xml', 'tei-ids-all': 'tei_ids_all.xml', 'collocations': 'collocation_matches.csv', + 'collocation-mapper': 'mapper.txt', 'structures-old': 'structures_old.xml', 'structures-new': 'structures_new.xml', 'dictionary': 'dictionary.xml', diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index 787afb1..6b167b5 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -144,16 +144,18 @@ class Pipeline: input_file_name = self.file_map['tei-initial'] output_file_name = self.file_map['collocations'] - extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None) + extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=self.tmp_directory) extraction = extractor(input_file_name) - extraction.write(output_file_name, token_output=True) + extraction.write(output_file_name) def do_assign_collocation_structures(self): print('Assigning ids of collocation structures ...') input_file_name = self.file_map['tei-initial'] + structure_file_name = self.file_map['structures-old'] collocations_file_name = self.file_map['collocations'] + mapper_file_name = self.file_map['collocation-mapper'] output_file_name = self.file_map['tei-ids-collocation'] - assign_collocation_structures(input_file_name, collocations_file_name, output_file_name) + assign_collocation_structures(input_file_name, structure_file_name, collocations_file_name, mapper_file_name, output_file_name) def do_assign_other_structures(self): print('Assigning ids of single and other structures, creating if necessary ...')