# luscenje_struktur/luscenje_struktur/sloleks_db.py

import gc

from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES


class SloleksDatabase:
    def __init__(self, db, load_sloleks):
        """Connect to the lexicon database and map its reflected tables.

        Args:
            db: Connection descriptor of the form
                ``user:password:database:host`` (split on ``':'``).
            load_sloleks: When truthy, ``init_load_sloleks()`` is called at
                the end of construction to build the in-memory lemma index.

        Raises:
            ValueError: If ``db`` does not split into exactly four
                ``':'``-separated parts.
            KeyError: If one of the expected ``jedro_*`` tables is missing
                from the reflected metadata.
        """
        # NOTE(review): presumably this makes psycopg2cffi usable wherever
        # psycopg2 is expected (e.g. by SQLAlchemy) -- confirm against the
        # psycopg2cffi documentation.
        from psycopg2cffi import compat
        compat.register()

        from sqlalchemy.ext.declarative import declarative_base
        from sqlalchemy.orm import Session
        from sqlalchemy import create_engine

        # The mapped classes below are published as module-level names so the
        # other methods of this class can reference them without qualification.
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
        # NOTE(review): a ':' inside any field (e.g. the password) breaks this
        # four-way unpacking, and the parts are concatenated into the URL
        # below without any escaping.
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        # Reflection must happen before the classes are declared: each class
        # binds to an already-reflected table through ``__table__``.
        Base.metadata.reflect(engine)

        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        class FormEncoding(Base):
            __table__ = Base.metadata.tables['jedro_formencoding']

        # One session is kept for the lifetime of the instance; see close().
        self.session = Session(engine)

        self.load_sloleks = load_sloleks
        if self.load_sloleks:
            self.init_load_sloleks()

    # def init_load_sloleks2(self):


    def init_load_sloleks(self):
        """Load the lexicon into memory and build ``self.connected_lemmas``.

        Five tables are read in full (word-form features, form
        representations, form encodings, word forms, lexemes) and joined in
        Python.  The result, ``self.connected_lemmas``, maps each lemma to a
        list of ``(feature_values, form_text)`` tuples, one per word form that
        has both at least one accepted feature value and an encoded text.
        ``feature_values`` is a set restricted to
        ``POSSIBLE_WORD_FORM_FEATURE_VALUES``.

        The intermediate indexes are local variables and are released before
        returning; only ``self.connected_lemmas`` is left on the instance.
        """
        session = self.session
        word_form_features = session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
        # The representation id is required to join encodings (keyed by
        # form_representation_id) back to their word form.
        form_representations = session.query(FormRepresentation.id, FormRepresentation.word_form_id).all()
        form_encodings = session.query(FormEncoding.form_representation_id, FormEncoding.text).all()
        word_forms = session.query(WordForm.id, WordForm.lexeme_id).all()
        lexemes = session.query(Lexeme.id, Lexeme.lemma).all()

        # lemma -> ids of all lexemes sharing that lemma
        lexeme_ids_by_lemma = {}
        for lexeme in lexemes:
            lexeme_ids_by_lemma.setdefault(lexeme.lemma, []).append(lexeme.id)

        # word form id -> set of accepted feature values
        features_by_word_form = {}
        for feature in word_form_features:
            if feature.value in POSSIBLE_WORD_FORM_FEATURE_VALUES:
                features_by_word_form.setdefault(feature.word_form_id, set()).add(feature.value)

        # form representation id -> encoded text
        text_by_representation = {
            encoding.form_representation_id: encoding.text for encoding in form_encodings
        }

        # word form id -> encoded text.  Representations without an encoding
        # are skipped instead of raising KeyError; such word forms are simply
        # not offered as candidates.
        text_by_word_form = {
            representation.word_form_id: text_by_representation[representation.id]
            for representation in form_representations
            if representation.id in text_by_representation
        }

        # lexeme id -> ids of its word forms
        word_form_ids_by_lexeme = {}
        for word_form in word_forms:
            word_form_ids_by_lexeme.setdefault(word_form.lexeme_id, []).append(word_form.id)

        self.connected_lemmas = {}
        for lemma, lexeme_ids in lexeme_ids_by_lemma.items():
            for lexeme_id in lexeme_ids:
                for word_form_id in word_form_ids_by_lexeme.get(lexeme_id, ()):
                    if word_form_id in features_by_word_form and word_form_id in text_by_word_form:
                        self.connected_lemmas.setdefault(lemma, []).append(
                            (features_by_word_form[word_form_id], text_by_word_form[word_form_id]))

        # Drop the raw rows and helper indexes before collecting so the memory
        # they hold can be reclaimed right away.
        del word_form_features, form_representations, form_encodings, word_forms, lexemes
        del lexeme_ids_by_lemma, features_by_word_form, text_by_representation
        del text_by_word_form, word_form_ids_by_lexeme
        gc.collect()


    def close(self):
        """Close the database session held in ``self.session``."""
        session = self.session
        session.close()

    def decypher_msd(self, msd):
        """Translate selected positions of an MSD tag into feature values.

        The first element of ``msd`` selects the category.  For each supported
        category a fixed set of positions is looked up in
        ``CODES_TRANSLATION[category][position]``:

        * ``'N'``: positions 3 and 4 (number, case)
        * ``'V'``: positions 3 and 5 (vform, number), followed by the constant
          ``'third'`` as person
        * ``'A'``: positions 3, 4 and 5 (gender, number, case)

        Gender is not extracted for ``'N'`` and ``'V'``.

        Args:
            msd: Indexable sequence of single-character codes (string or list).

        Returns:
            List of translated feature values; an empty list for any other
            category.
        """
        # IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
        positions_by_category = {
            'N': (3, 4),
            'V': (3, 5),
            'A': (3, 4, 5),
        }

        category = msd[0]
        positions = positions_by_category.get(category)
        if positions is None:
            return []

        translations = CODES_TRANSLATION[category]
        values = [translations[position][msd[position]] for position in positions]
        if category == 'V':
            # Person is not read from the tag; it is always reported as third.
            values.append('third')
        return values

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Find the surface form of ``lemma`` matching an adjusted MSD tag.

        The tag is first modified:

        * every ``attribute -> value`` pair in ``data['msd']`` (if present)
          overwrites the corresponding position with ``CODES[value]``;
        * if ``align_msd`` is given and ``data`` has ``'agreement'``, each
          listed attribute is copied from ``align_msd`` into ``msd``.

        Positions are located through ``TAGSET[category].index(attribute)``;
        a tag that is too short is padded with ``'-'``.

        The adjusted tag is translated with ``decypher_msd`` and the matching
        form is looked up in ``self.connected_lemmas`` when the in-memory
        index is loaded and knows the lemma, otherwise through a database
        query.

        Args:
            lemma: Lemma whose form is requested.
            msd: MSD tag (string or sequence of characters).
            data: Dict that may contain ``'msd'`` and/or ``'agreement'``.
            align_msd: MSD tag of the word to agree with, or a falsy value.

        Returns:
            ``(msd_string, lemma, form_text)`` on success, otherwise
            ``(None, None, None)`` (unsupported category or no matching form).
        """
        msd = list(msd)
        category = msd[0]

        if 'msd' in data:
            for key, value in data['msd'].items():
                position = TAGSET[category].index(key.lower()) + 1
                # Pad short tags so that ``position`` is a valid index.
                msd.extend('-' * (position + 1 - len(msd)))
                msd[position] = CODES[value]

        if align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            align_category = align_msd[0]

            for attribute in data['agreement']:
                name = attribute.lower()
                align_position = TAGSET[align_category].index(name) + 1
                position = TAGSET[category].index(name) + 1
                # fix for verbs with short msds
                msd.extend('-' * (position + 1 - len(msd)))
                msd[position] = align_msd[align_position]

        required_features = self.decypher_msd(msd)
        if not required_features:
            return None, None, None

        if self.load_sloleks and lemma in self.connected_lemmas:
            for word_form_features, form_text in self.connected_lemmas[lemma]:
                if all(feature in word_form_features for feature in required_features):
                    return ''.join(msd), lemma, form_text
            # No candidate carries every required feature.  Report a miss, as
            # the database path does, rather than returning an arbitrary form.
            return None, None, None

        # Imported here because only the database path needs SQLAlchemy.
        from sqlalchemy.orm import aliased

        # One alias of the feature table per required value, so that every
        # value can be constrained independently on the same word form.
        feature_aliases = [aliased(WordFormFeature) for _ in required_features]
        query = self.session.query(FormEncoding.text) \
            .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

        for feature_alias in feature_aliases:
            query = query.join(feature_alias, feature_alias.word_form_id == WordForm.id)

        query = query.filter(Lexeme.lemma == lemma)

        for feature_alias, feature_value in zip(feature_aliases, required_features):
            query = query.filter(feature_alias.value == feature_value)

        rows = query.limit(1).all()
        if rows:
            return ''.join(msd), lemma, rows[0][0]
        return None, None, None