From 258a7a12a5347f42120609dd45d871687f5406cc Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski <cyp@cjvt.si>
Date: Mon, 27 Feb 2023 16:26:29 +0100
Subject: [PATCH] Redmine #1461: use mapper file to get MWE indexes and split
 assign() into smaller functions

---
 .../assign_collocation_structures.py          | 82 +++++++++++++------
 structure_assignment/constants.py             |  1 +
 structure_assignment/pipeline.py              |  8 +-
 3 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/structure_assignment/assign_collocation_structures.py b/structure_assignment/assign_collocation_structures.py
index 7c1e36f..c153d78 100644
--- a/structure_assignment/assign_collocation_structures.py
+++ b/structure_assignment/assign_collocation_structures.py
@@ -3,6 +3,9 @@ import csv
 import codecs
 import re
 import lxml.etree as lxml
+from collections import defaultdict
+
+MWE_INDEX_PATTERN = re.compile(r'^s(\d+)\.\d+$')
 
 def xpath_find(element,expression):
     return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})
@@ -11,41 +14,72 @@ def get_xml_id(element):
     return element.get('{http://www.w3.org/XML/1998/namespace}id')
 
 def get_id_counter(xml_id):
-    return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1))
+    return int(MWE_INDEX_PATTERN.search(xml_id).group(1))
+
+def get_mwe_components_map(input_file_name):
+    mwe_components_map = {}
+    root = lxml.parse(input_file_name).getroot()
+    mwes_xml = xpath_find(root, './/tei:s')
+    for mwe_xml in mwes_xml:
+        index = get_id_counter(get_xml_id(mwe_xml))
+        token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
+        mwe_components_map[index] = token_count
+    return mwe_components_map
+
+def get_structure_components_map(structure_file_name):
+    structure_components_map = {}
+    root = lxml.parse(structure_file_name)
+    structures = root.xpath('syntactic_structure[@type="collocation" or @type="formal"]')
+    for structure in structures:
+        structure_id = int(structure.get('id'))
+        core_count = len(structure.xpath('components/component[@type="core"]'))
+        structure_components_map[structure_id] = core_count
+    return structure_components_map
 
-def assign(input_file_name, csv_file_name, output_file_name):
+def get_mwe_index_map(mapper_file_name):
+    mwe_index_map = defaultdict(set)
+    mapper_file = codecs.open(mapper_file_name, 'r')
+    reader = csv.DictReader(mapper_file, delimiter='\t')
+    for row in reader:
+        collocation_id = int(row['Collocation_id'])
+        index = get_id_counter(row['Sentence_id'])
+        mwe_index_map[collocation_id].add(index)
+    mapper_file.close()
+    return mwe_index_map
 
+def get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map):
+    mwe_structure_map = defaultdict(set)
     csv_file = codecs.open(csv_file_name, 'r')
     reader = csv.DictReader(csv_file, delimiter=',')
-    mwe_map = {}
     for row in reader:
-        structure_id = row['Structure_ID']
-        token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0]
-        index = get_id_counter(token_ids[0])
-        component_count = len(token_ids)
-        if (index not in mwe_map):
-            mwe_map[index] = set()
-        mwe_map[index].add((structure_id, component_count))
+        structure_id = int(row['Structure_ID'])
+        collocation_id = int(row['Collocation_ID'])
+        for index in mwe_index_map[collocation_id]:
+            if (mwe_components_map[index] == structure_components_map[structure_id]):
+                mwe_structure_map[index].add(structure_id)
     csv_file.close()
+    return mwe_structure_map
 
-    xml_tree = lxml.parse(input_file_name)
-    xml_root = xml_tree.getroot()
-    mwes_xml = xpath_find(xml_root, './/tei:s')
+def insert_structure_ids(input_file_name, mwe_structure_map, output_file_name):
+    tree = lxml.parse(input_file_name)
+    root = tree.getroot()
+    mwes_xml = xpath_find(root, './/tei:s')
     for mwe_xml in mwes_xml:
         index = get_id_counter(get_xml_id(mwe_xml))
-        mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip()
-        token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
-        structure_ids = set()
-        if (index in mwe_map):
-            for (structure_id, component_count) in mwe_map[index]:
-                if (component_count == token_count):
-                    structure_ids.add(int(structure_id))
-        if (len(structure_ids) > 1):
-            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
-        elif (len(structure_ids) == 1):
+        if (index in mwe_structure_map):
+            structure_ids = mwe_structure_map[index]
             mwe_xml.set('structure_id', str(list(structure_ids)[0]))
-    xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+            if (len(structure_ids) > 1):
+                mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip()
+                print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
+    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
 
+def assign(input_file_name, structure_file_name, csv_file_name, mapper_file_name, output_file_name):
+    structure_components_map = get_structure_components_map(structure_file_name)
+    mwe_components_map = get_mwe_components_map(input_file_name)
+    mwe_index_map = get_mwe_index_map(mapper_file_name)
+    mwe_structure_map = get_mwe_structure_map(csv_file_name, mwe_components_map, mwe_index_map, structure_components_map)
+    insert_structure_ids(input_file_name, mwe_structure_map, output_file_name)
 
 if (__name__ == '__main__'):
 
diff --git a/structure_assignment/constants.py b/structure_assignment/constants.py
index 57ad992..336aaa9 100644
--- a/structure_assignment/constants.py
+++ b/structure_assignment/constants.py
@@ -8,6 +8,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
             'tei-ids-collocation': 'tei_ids_collocations.xml',
             'tei-ids-all': 'tei_ids_all.xml',
             'collocations': 'collocation_matches.csv',
+            'collocation-mapper': 'mapper.txt',
             'structures-old': 'structures_old.xml',
             'structures-new': 'structures_new.xml',
             'dictionary': 'dictionary.xml',
diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index 787afb1..6b167b5 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -144,16 +144,18 @@ class Pipeline:
         input_file_name = self.file_map['tei-initial']
         output_file_name = self.file_map['collocations']
 
-        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None)
+        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=self.tmp_directory)
         extraction = extractor(input_file_name)
-        extraction.write(output_file_name, token_output=True)
+        extraction.write(output_file_name)
 
     def do_assign_collocation_structures(self):
         print('Assigning ids of collocation structures ...')
         input_file_name = self.file_map['tei-initial']
+        structure_file_name = self.file_map['structures-old']
         collocations_file_name = self.file_map['collocations']
+        mapper_file_name = self.file_map['collocation-mapper']
         output_file_name = self.file_map['tei-ids-collocation']
-        assign_collocation_structures(input_file_name, collocations_file_name, output_file_name)
+        assign_collocation_structures(input_file_name, structure_file_name, collocations_file_name, mapper_file_name, output_file_name)
 
     def do_assign_other_structures(self):
         print('Assigning ids of single and other structures, creating if necessary ...')