diff --git a/scripts/process.py b/scripts/process.py
index 3b1c6e9..02e62d8 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -3,7 +3,6 @@ import argparse
 from structure_assignment.pipeline import Runner
 
 classla_directory = '../resources/classla'
-wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
 
 
 if (__name__ == '__main__'):
@@ -22,7 +21,7 @@ if (__name__ == '__main__'):
     output_structure_file_name = arguments.outstructs
 
     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
-    runner = Runner(nlp_needed, classla_directory, wani_file_name)
+    runner = Runner(nlp_needed, classla_directory)
     if (mode == 'strings_to_parse'):
         runner.strings_to_parse(input_file_name, output_file_name)
     elif (mode == 'strings_to_dictionary'):
diff --git a/setup.py b/setup.py
index ef5ed7d..89fa021 100644
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,6 @@ setup(name='structure_assignment',
       install_requires=['lxml',
                         'classla',
                         'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
-                        'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198', # TODO: switch to master once luscenje_struktur's i2198 is merged into master
-                        'psycopg2cffi', # TODO: remove once luscenje_struktur takes care of it
-                        'sqlalchemy', # TODO: remove once luscenje_struktur takes care of it
+                        'cordex @ git+https://github.com/clarinsi/cordex.git',
                         ],
       zip_safe=True)
diff --git a/structure_assignment/assign_collocation_structures.py b/structure_assignment/assign_collocation_structures.py
index 5744a54..7c1e36f 100644
--- a/structure_assignment/assign_collocation_structures.py
+++ b/structure_assignment/assign_collocation_structures.py
@@ -16,7 +16,7 @@ def get_id_counter(xml_id):
 
 def assign(input_file_name, csv_file_name, output_file_name):
     csv_file = codecs.open(csv_file_name, 'r')
-    reader = csv.DictReader(csv_file, delimiter='\t')
+    reader = csv.DictReader(csv_file, delimiter=',')
     mwe_map = {}
     for row in reader:
         structure_id = row['Structure_ID']
diff --git a/structure_assignment/assign_other_structures.py b/structure_assignment/assign_other_structures.py
index 0551646..997e343 100644
--- a/structure_assignment/assign_other_structures.py
+++ b/structure_assignment/assign_other_structures.py
@@ -97,11 +97,11 @@ def parse_xml_structures(root):
         syntactic_structure.id = int(structure_element.get('id'))
         syntactic_structure.type = structure_element.get('type')
         dependency_tuples = []
-        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
+        for dependency in structure_element.xpath('dependencies/dependency'):
             dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
         syntactic_structure.set_dependencies(dependency_tuples)
         component_maps = []
-        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
+        for component in structure_element.xpath('definition/component'):
            morphology_features = component.xpath('restriction[@type="morphology"]/feature')
            component_map = {}
            for feature in morphology_features:
@@ -174,9 +174,7 @@
     comment = lxml.Comment(' example: ' + syntactic_structure.example)
     structure_element.append(comment)
     structure_element.set('type', 'other')
-    system = lxml.SubElement(structure_element, 'system')
-    system.set('type', 'JOS')
-    components = lxml.SubElement(system, 'components')
+    components = lxml.SubElement(structure_element, 'components')
     components.set('order', 'fixed')
     for (index, component_map) in enumerate(syntactic_structure.components, start=1):
         component = lxml.SubElement(components, 'component')
@@ -184,7 +182,7 @@
         component.set('type', 'core')
         component.set('label', component_map['label'])
     structure_element.set('label', '-'.join([c.get('label') for c in components]))
-    dependencies = lxml.SubElement(system, 'dependencies')
+    dependencies = lxml.SubElement(structure_element, 'dependencies')
     for dependency_map in syntactic_structure.dependencies:
         dependency = lxml.SubElement(dependencies, 'dependency')
         [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
@@ -193,7 +191,7 @@
         dependency.set('from', from_index)
         dependency.set('label', label)
         dependency.set('to', to_index)
-    definition = lxml.SubElement(system, 'definition')
+    definition = lxml.SubElement(structure_element, 'definition')
     for (index, component_map) in enumerate(syntactic_structure.components, start=1):
         component = lxml.SubElement(definition, 'component')
         component.set('cid', str(index))
diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py
index d940b84..787afb1 100644
--- a/structure_assignment/pipeline.py
+++ b/structure_assignment/pipeline.py
@@ -1,9 +1,9 @@
 import shutil
 import tempfile
-from types import SimpleNamespace
 
 import lxml.etree as lxml
 import classla
+import cordex
 
 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
@@ -15,19 +15,11 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
 
 class Runner:
 
-    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
+    def __init__(self, nlp_needed, classla_directory=None):
         self.classla_directory = classla_directory
         if (nlp_needed):
             NLP_CONFIG_MAP['dir'] = classla_directory
             self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
-        if (wani_file_name is not None):
-            self._provide_wani(wani_file_name)
-
-    def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
-        self.wani_directory = tempfile.mkdtemp()
-        shutil.copy(wani_file_name, self.wani_directory)
-        import sys
-        sys.path.insert(0, self.wani_directory)
 
     def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline(self.nlp)
@@ -93,7 +85,6 @@ class Runner:
         pipeline.do_tei_to_dictionary()
 
     def cleanup(self, pipeline):
-        shutil.rmtree(self.wani_directory, True)
         pipeline.cleanup()
 
 
@@ -149,38 +140,13 @@ class Pipeline:
 
     def do_find_collocation_structure_units(self):
         print('Finding units for existing collocation structures ...')
-        from wani import main as wani_main
-        namespace = SimpleNamespace()
+        structure_file_name = self.file_map['structures-old']
+        input_file_name = self.file_map['tei-initial']
+        output_file_name = self.file_map['collocations']
-
-        # relevant values
-        namespace.structures = self.file_map['structures-old']
-        namespace.input = [self.file_map['tei-initial']]
-        namespace.all = self.file_map['collocations']
-        namespace.skip_id_check = True
-        namespace.fixed_restriction_order = True
-        namespace.new_tei = True
-
-        # default values
-        namespace.sloleks_db = None
-        namespace.out = None
-        namespace.out_no_stat = None
-        namespace.stats = None
-        namespace.no_msd_translate = False
-        namespace.min_freq = 0
-        namespace.verbose = 'info'
-        namespace.count_files = False
-        namespace.multiple_output = False
-        namespace.load_sloleks = False
-        namespace.sort_by = -1
-        namespace.sort_reversed = False
-        namespace.db = None
-        namespace.collocation_sentence_map_dest = None
-        namespace.new_db = False
-        namespace.pc_tag = 'pc'
-        namespace.separator = '\t'
-        namespace.ignore_punctuations = False
-
-        wani_main(namespace)
+
+        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None)
+        extraction = extractor(input_file_name)
+        extraction.write(output_file_name, token_output=True)
 
     def do_assign_collocation_structures(self):
         print('Assigning ids of collocation structures ...')
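Usage note (outside the diff): the last hunk replaces the wani invocation with the cordex API that the new setup.py dependency provides. Run on its own, the same calls look roughly like the sketch below; the file paths are illustrative, and only the cordex calls themselves are taken from the hunk above.

import cordex

# Build an extractor from a structure inventory file (path is illustrative),
# mirroring the keyword arguments used in do_find_collocation_structure_units.
extractor = cordex.Pipeline('structures.xml', fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None)

# Apply it to a TEI input file and write the extracted collocations, including token-level output.
extraction = extractor('input-tei.xml')
extraction.write('collocations.csv', token_output=True)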