Redmine #1461: switched from luscenje_struktur to cordex

2023-02-27 14:45:04 +01:00
parent e9eff0658f
commit 24824cd459
5 changed files with 16 additions and 55 deletions
@@ -3,7 +3,6 @@ import argparse
 from structure_assignment.pipeline import Runner

 classla_directory = '../resources/classla'
-wani_file_name = '../resources/wani.py'  # TODO: remove once luscenje_struktur incorporates wani in package

 if (__name__ == '__main__'):

@@ -22,7 +21,7 @@ if (__name__ == '__main__'):
    output_structure_file_name = arguments.outstructs

    nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
-    runner = Runner(nlp_needed, classla_directory, wani_file_name)
+    runner = Runner(nlp_needed, classla_directory)
    if (mode == 'strings_to_parse'):
        runner.strings_to_parse(input_file_name, output_file_name)
    elif (mode == 'strings_to_dictionary'):
@@ -10,8 +10,6 @@ setup(name='structure_assignment',
      install_requires=['lxml',
                        'classla',
                        'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
-                        'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198',  # TODO: switch to master once luscenje_struktur's i2198 is merged into master
-                        'psycopg2cffi',  # TODO: remove once luscenje_struktur takes care of it
-                        'sqlalchemy',  # TODO: remove once luscenje_struktur takes care of it
+                        'cordex @ git+https://github.com/clarinsi/cordex.git',
      ],
      zip_safe=True)
@@ -16,7 +16,7 @@ def get_id_counter(xml_id):
 def assign(input_file_name, csv_file_name, output_file_name):

    csv_file = codecs.open(csv_file_name, 'r')
-    reader = csv.DictReader(csv_file, delimiter='\t')
+    reader = csv.DictReader(csv_file, delimiter=',')
    mwe_map = {}
    for row in reader:
        structure_id = row['Structure_ID']
@@ -97,11 +97,11 @@ def parse_xml_structures(root):
        syntactic_structure.id = int(structure_element.get('id'))
        syntactic_structure.type = structure_element.get('type')
        dependency_tuples = []
-        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
+        for dependency in structure_element.xpath('dependencies/dependency'):
            dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
        syntactic_structure.set_dependencies(dependency_tuples)
        component_maps = []
-        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
+        for component in structure_element.xpath('definition/component'):
            morphology_features = component.xpath('restriction[@type="morphology"]/feature')
            component_map = {}
            for feature in morphology_features:
@@ -174,9 +174,7 @@ def create_xml_structure(syntactic_structure):
    comment = lxml.Comment(' example: ' + syntactic_structure.example)
    structure_element.append(comment)
    structure_element.set('type', 'other')
-    system = lxml.SubElement(structure_element, 'system')
-    system.set('type', 'JOS')
-    components = lxml.SubElement(system, 'components')
+    components = lxml.SubElement(structure_element, 'components')
    components.set('order', 'fixed')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(components, 'component')
@@ -184,7 +182,7 @@ def create_xml_structure(syntactic_structure):
        component.set('type', 'core')
        component.set('label', component_map['label'])
    structure_element.set('label', '-'.join([c.get('label') for c in components]))
-    dependencies = lxml.SubElement(system, 'dependencies')
+    dependencies = lxml.SubElement(structure_element, 'dependencies')
    for dependency_map in syntactic_structure.dependencies:
        dependency = lxml.SubElement(dependencies, 'dependency')
        [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
@@ -193,7 +191,7 @@ def create_xml_structure(syntactic_structure):
        dependency.set('from', from_index)
        dependency.set('label', label)
        dependency.set('to', to_index)
-    definition = lxml.SubElement(system, 'definition')
+    definition = lxml.SubElement(structure_element, 'definition')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(definition, 'component')
        component.set('cid', str(index))
@@ -1,9 +1,9 @@
 import shutil
 import tempfile
-from types import SimpleNamespace
 import lxml.etree as lxml

 import classla
+import cordex

 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
@@ -15,19 +15,11 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary

 class Runner:

-    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
+    def __init__(self, nlp_needed, classla_directory=None):
        self.classla_directory = classla_directory
        if (nlp_needed):
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
-        if (wani_file_name is not None):
-            self._provide_wani(wani_file_name)
-
-    def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package
-        self.wani_directory = tempfile.mkdtemp()
-        shutil.copy(wani_file_name, self.wani_directory)
-        import sys
-        sys.path.insert(0, self.wani_directory)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
@@ -93,7 +85,6 @@ class Runner:
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
-        shutil.rmtree(self.wani_directory, True)
        pipeline.cleanup()


@@ -149,38 +140,13 @@ class Pipeline:
    def do_find_collocation_structure_units(self):
        print('Finding units for existing collocation structures ...')

-        from wani import main as wani_main
-        namespace = SimpleNamespace()
+        structure_file_name = self.file_map['structures-old']
+        input_file_name = self.file_map['tei-initial']
+        output_file_name = self.file_map['collocations']

-        # relevant values
-        namespace.structures = self.file_map['structures-old']
-        namespace.input = [self.file_map['tei-initial']]
-        namespace.all = self.file_map['collocations']
-        namespace.skip_id_check = True
-        namespace.fixed_restriction_order = True
-        namespace.new_tei = True
-
-        # default values
-        namespace.sloleks_db = None
-        namespace.out = None
-        namespace.out_no_stat = None
-        namespace.stats = None
-        namespace.no_msd_translate = False
-        namespace.min_freq = 0
-        namespace.verbose = 'info'
-        namespace.count_files = False
-        namespace.multiple_output = False
-        namespace.load_sloleks = False
-        namespace.sort_by = -1
-        namespace.sort_reversed = False
-        namespace.db = None
-        namespace.collocation_sentence_map_dest = None
-        namespace.new_db = False
-        namespace.pc_tag = 'pc'
-        namespace.separator = '\t'
-        namespace.ignore_punctuations = False
-         
-        wani_main(namespace)
+        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None)
+        extraction = extractor(input_file_name)
+        extraction.write(output_file_name, token_output=True)

    def do_assign_collocation_structures(self):
        print('Assigning ids of collocation structures ...')