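"""Pipeline for assigning syntactic structures to Slovene strings:
obeliks tokenisation, classla parsing, JOS tag translation, TEI
conversion, structure assignment, and dictionary creation/merging
with XML schema validation."""
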
import os
import shutil
import sys
import tempfile
from types import SimpleNamespace

import lxml.etree as lxml
import obeliks
import classla
from classla import Document
from classla.models.common.conll import CoNLLFile  # needed by do_parse (older classla API)

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from structure_assignment.split_tei import split as split_tei
from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.create_structures import create as create_structures
from structure_assignment.merge_dictionaries import merge as merge_dictionaries


def create_nlp(resource_directory):
    NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
    return classla.Pipeline('sl', **NLP_CONFIG_MAP)


class Pipeline:

    def __init__(self, nlp, resource_directory):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
        for resource_file_name in resource_file_names:
            if os.path.isfile(resource_file_name):
                shutil.copy(resource_file_name, self.tmp_directory)
        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + file_name for (key, file_name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)

    def do_tweak_conllu(self):
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        # Relies on the older classla CoNLLFile API (import restored above):
        # wrap the tokenised CoNLL-U file in an empty Document, run the
        # pipeline, and write the parsed result back out as CoNLL-U.
        doc = Document(text=None)
        doc.conll_file = CoNLLFile(filename=input_file_name)
        result = self.nlp(doc)
        result.conll_file.write_conll(output_file_name)

    def do_translate_jos(self):
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)

    def do_conllu_to_tei(self):
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_split_tei(self):
        input_file_name = self.file_map['tei-initial']
        output_single_file_name = self.file_map['tei-single']
        output_multiple_file_name = self.file_map['tei-multiple']
        split_tei(input_file_name, output_single_file_name, output_multiple_file_name)

    def do_assign_single(self):
        input_file_name = self.file_map['tei-single']
        structure_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-single-ids']
        assign_single(input_file_name, structure_file_name, output_file_name)

    def do_tei_to_dictionary_single(self):
        input_file_name = self.file_map['tei-single-ids']
        output_file_name = self.file_map['dictionary-single']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_tei_to_dictionary_multiple(self):
        input_file_name = self.file_map['tei-multiple-ids-2']
        output_file_name = self.file_map['dictionary-multiple']
        tei_to_dictionary(input_file_name, output_file_name)
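
    # Note: the find/assign pairs below appear to implement a two-pass flow:
    # pass one matches units against the existing inventory (structures-old),
    # do_create_structures then derives structures-new from the first-pass
    # assignments, and pass two re-matches against the extended inventory.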
    def do_find_structure_units_first(self):
        self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])

    def do_find_structure_units_second(self):
        self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])

    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
        # imported here because wani.py is one of the resource files copied
        # into tmp_directory, which __init__ puts on sys.path
        from wani import main as wani_main
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = structure_file_name
        namespace.input = [tei_file_name]
        namespace.all = csv_file_name
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def _find_min_other_id(self, key):
        try:
            root = lxml.parse(self.file_map[key])
            other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')]
            min_id = min(other_ids)
        except Exception:
            min_id = 109  # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ...
        return min_id

    def do_assign_multiple_first(self):
        min_other_id = self._find_min_other_id('structures-old')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)

    def do_assign_multiple_second(self):
        min_other_id = self._find_min_other_id('structures-new')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)

    def do_create_structures(self):
        input_file_name = self.file_map['structures-old']
        tei_file_name = self.file_map['tei-multiple-ids-1']
        output_file_name = self.file_map['structures-new']
        create_structures(input_file_name, tei_file_name, output_file_name)

    def do_merge_dictionaries(self):
        single_file_name = self.file_map['dictionary-single']
        multiple_file_name = self.file_map['dictionary-multiple']
        output_file_name = self.file_map['dictionary']
        merge_dictionaries(single_file_name, multiple_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, ignore_errors=True)
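

if __name__ == '__main__':
    # Minimal usage sketch. The paths and input file name are placeholders,
    # and the step order (up to the initial TEI output) is inferred from the
    # method definitions above; adjust both for a real run.
    resource_directory = '/path/to/resources'  # hypothetical resource directory
    nlp = create_nlp(resource_directory)
    pipeline = Pipeline(nlp, resource_directory)
    pipeline.import_file('strings.txt', 'strings-list')  # hypothetical input list
    pipeline.do_tokenise()
    pipeline.do_tweak_conllu()
    pipeline.do_parse()
    pipeline.do_translate_jos()
    pipeline.do_conllu_to_tei()
    pipeline.export_file('tei_initial.xml', 'tei-initial')  # hypothetical output path
    pipeline.cleanup()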