Improved representations speed + Fixed bug in representations

2020-07-22 11:16:28 +02:00
parent 4c84873ff5
commit f330a37764
6 changed files with 137 additions and 80 deletions
@@ -1 +1 @@
-pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output 
+pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output  --load-sloleks
@@ -1,16 +1,41 @@
+# Word-form feature values that the in-memory Sloleks loader keeps;
+# SloleksDatabase.init_load_sloleks skips every value not listed here.
+POSSIBLE_WORD_FORM_FEATURE_VALUES = {
+    # number
+    "singular", "dual", "plural",
+    # case
+    "nominative", "genitive", "dative", "accusative", "locative", "instrumental",
+    # verb-related values
+    "infinitive", "supine", "participle", "present", "future", "conditional", "imperative",
+    # gender
+    "masculine", "feminine", "neuter",
+}
+
 CODES_TRANSLATION = {
    "N": {
        2: {
+            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        3: {
+            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        4: {
+            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
@@ -21,10 +46,12 @@ CODES_TRANSLATION = {
    },
    "V": {
        1: {
+            "-": "main",
            "m": "main",
            "a": "auxiliary",
        },
        3: {
+            "-": "infinitive",
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
@@ -34,46 +61,55 @@ CODES_TRANSLATION = {
            "m": "imperative",
        },
        4: {
+            "-": "first",
            "1": "first",
            "2": "second",
            "3": "third",
        },
        5: {
+            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        6: {
+            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        8: {
+            "-": "no",
            "n": "no",
            "y": "yes",
        },
    },
    "A": {
        1: {
+            "-": "general",
            "g": "general",
            "s": "possessive",
        },
        2: {
+            "-": "positive",
            "p": "positive",
            "c": "comparative",
            "s": "superlative",
        },
        3: {
+            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        4: {
+            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        5: {
+            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
@@ -82,46 +118,6 @@ CODES_TRANSLATION = {
            "i": "instrumental",
        },
    }
-    # "R": "Adverb",
-    # "P": "Pronoun",
-    # "M": "Numeral",
-    # "S": "Preposition",
-    # "C": "Conjunction",
-    # "Q": "Particle",
-    # "I": "Interjection",
-    # "Y": "Abbreviation",
-    # "X": "Residual",
-    #
-    #
-    # "e": "perfective",
-    # "p": "progressive",
-    # "b": "biaspectual",
-    #
-    #
-    # "p": "personal",
-    # "d": "demonstrative",
-    # "r": "relative",
-    # "x": "reflexive",
-    # "q": "interrogative",
-    # "i": "indefinite",
-    # "z": "negative",
-    # "b": "bound",
-    # "d": "digit",
-    # "r": "roman",
-    # "l": "letter",
-    # "c": "cardinal",
-    # "o": "ordinal",
-    # "p": "pronominal",
-    # "s": "special",
-    # "c": "coordinating",
-    # "s": "subordinating",
-    # "f": "foreign",
-    # "t": "typo",
-    # "p": "program",
-    # "w": "web",
-    # "e": "emo",
-    # "h": "hashtag",
-    # "a: "at""
 }

 CODES = {
@@ -1,5 +1,7 @@
+import gc
 from collections import defaultdict
 from ast import literal_eval
+from time import time

 from match import StructureMatch
 from representation_assigner import RepresentationAssigner
@@ -110,6 +112,7 @@ class MatchStore:

        structures_dict = {s.id: s for s in structures}
        num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
+        start_time = time()
        for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
@@ -119,7 +122,9 @@ class MatchStore:
            if len(inserts) > num_inserts:
                self.add_inserts(inserts)
                inserts = []
-
+            if time() - start_time > 5:
+                start_time = time()
+                gc.collect()
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)

@@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
 class WordFormAllCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
        if len(self.words) == 0:
-            return None
+            return None, None
        else:
            forms = [w.text.lower() for w in self.words]
            msds = [w.msd for w in self.words]
@@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
            if not all(agreements_matched):
                if sloleks_db is None:
                    raise Exception('sloleks_db not properly setup!')
-                for agr in self.agreement:
+                for i, agr in enumerate(self.agreement):
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
                        if msd is not None:
                            agr.msds[0] = msd
                            agr.words.append(WordDummy(msd, lemma, text))
-                            # agr.words[0].msd = msd
-                            # agr.words[0].text = text
-                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                            # When sloleks returns a form, accept it right away: no second agr.match() check
+                            # is needed, because the returned msd is tailored to pass it by default.
+                            agr.rendition_candidate = text
+                            agr.rendition_msd_candidate = msd
+                            agreements_matched[i] = True
                        else:
                            break

@@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
                    agr.confirm_match()

                return text_forms[(word_msd, word_lemma)], word_msd
+        return None, None


 class WordFormMsdCR(WordFormAnyCR):
@@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
        common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) 
                      else msds[0][idx] for idx in range(len(msds[0]))]
        common_msd = "".join(common_msd)
-        iommon_msd = "".join(common_msd)
        return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
    

@@ -1,3 +1,5 @@
+import gc
+
 from psycopg2cffi import compat
 compat.register()

@@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine

-from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
+from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES


 class SloleksDatabase:
-    def __init__(self, db):
+    def __init__(self, db, load_sloleks):
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

@@ -71,12 +73,65 @@ class SloleksDatabase:

        self.session = Session(engine)

+        self.load_sloleks = load_sloleks
+        if self.load_sloleks:
+            self.init_load_sloleks()
+
+    def init_load_sloleks(self):
+        """Load the Sloleks word-form data into memory as ``self.connected_lemmas``.
+
+        Runs four full-table queries (word-form features, form representations,
+        word forms, lexemes) and joins them in Python. The result maps each
+        lemma to a list of ``(feature_values, form)`` tuples, one tuple per word
+        form that has both a form representation and at least one feature value
+        listed in POSSIBLE_WORD_FORM_FEATURE_VALUES.
+        """
+        word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
+        form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form).all()
+        word_forms = self.session.query(WordForm.id, WordForm.lexeme_id).all()
+        lexemes = self.session.query(Lexeme.id, Lexeme.lemma).all()
+
+        # lemma -> ids of every lexeme carrying that lemma
+        lexeme_ids_by_lemma = {}
+        for lexeme in lexemes:
+            lexeme_ids_by_lemma.setdefault(lexeme.lemma, []).append(lexeme.id)
+
+        # word_form_id -> set of feature values; values outside the whitelist are dropped
+        features_by_word_form = {}
+        for feature in word_form_features:
+            if feature.value in POSSIBLE_WORD_FORM_FEATURE_VALUES:
+                features_by_word_form.setdefault(feature.word_form_id, set()).add(feature.value)
+
+        # word_form_id -> form; a later row overwrites an earlier one with the same id
+        form_by_word_form = {row.word_form_id: row.form for row in form_representations}
+
+        # lexeme_id -> ids of its word forms
+        word_form_ids_by_lexeme = {}
+        for word_form in word_forms:
+            word_form_ids_by_lexeme.setdefault(word_form.lexeme_id, []).append(word_form.id)
+
+        # The raw query rows are fully indexed now; release them so the
+        # collection at the end of this method can reclaim them.
+        del word_form_features, form_representations, word_forms, lexemes
+
+        self.connected_lemmas = {}
+        for lemma, lexeme_ids in lexeme_ids_by_lemma.items():
+            for lexeme_id in lexeme_ids:
+                for word_form_id in word_form_ids_by_lexeme.get(lexeme_id, ()):
+                    if word_form_id in features_by_word_form and word_form_id in form_by_word_form:
+                        self.connected_lemmas.setdefault(lemma, []).append(
+                            (features_by_word_form[word_form_id], form_by_word_form[word_form_id]))
+
+        # Only connected_lemmas is kept; the intermediate indexes are locals
+        # (never attributes of self) and are dropped before forcing a collection.
+        del lexeme_ids_by_lemma, features_by_word_form, form_by_word_form, word_form_ids_by_lexeme
+        gc.collect()
+
+
    def close(self):
        self.session.close()

    def decypher_msd(self, msd):
        t = msd[0]
        decypher = []
+        # NOTE: when adding or changing attributes here, also update POSSIBLE_WORD_FORM_FEATURE_VALUES,
+        # because init_load_sloleks discards every feature value that is not listed there.
        if t == 'N':
            # gender = CODES_TRANSLATION[t][2][msd[2]]
            number = CODES_TRANSLATION[t][3][msd[3]]
@@ -118,7 +173,6 @@ class SloleksDatabase:
                # fix for verbs with short msds
                if v + 1 >= len(msd):
                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
-                    # return None, None, None

                msd[v + 1] = align_msd[v_align_msd + 1]

@@ -127,6 +181,17 @@ class SloleksDatabase:
        if not decypher_msd:
            return None, None, None

+        if self.load_sloleks and lemma in self.connected_lemmas:
+            # In-memory lookup: return the first stored form whose feature set
+            # contains every requested feature value. When no form qualifies,
+            # fall through to the final `return None, None, None` instead of
+            # returning whichever candidate the loop happened to visit last.
+            for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
+                if all(d_m in word_form_features for d_m in decypher_msd):
+                    return ''.join(msd), lemma, form_representations
+        else:
            wfs = [aliased(WordFormFeature) for _ in decypher_msd]
            query_preposition = self.session.query(FormRepresentation.form) \
                .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
@@ -140,7 +205,7 @@ class SloleksDatabase:
            for wf, msd_el in zip(wfs, decypher_msd):
                query_preposition = query_preposition.filter(wf.value == msd_el)

-        pattern_translation_hws = query_preposition.all()
+            pattern_translation_hws = query_preposition.limit(1).all()
            if len(pattern_translation_hws) > 0:
                return ''.join(msd), lemma, pattern_translation_hws[0][0]
        return None, None, None
@@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
-                # colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                match, collocation_id = postprocessor.process(match, colocation_id)
                colocation_id = tuple(colocation_id)

                matches[s].append((match, colocation_id))
-                # for key, val in matches.items():
-                #     if key.id == '15':
-                #         for el in val:
-                #             if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
-                #                 word_id = '.'.join(words[0].id.split('.')[:-1])
-                #                 print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
-                #                 print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
-
-                # if s.id == '15':
-                #     if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
-                #         word_id = '.'.join(match['1'].id.split('.')[:-1])
-                #         print(f"ID: {word_id}")
-                #         print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))

    return matches


 def main(args):
-    sloleks_db = SloleksDatabase(args.sloleks_db)
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

@@ -95,7 +80,9 @@ def main(args):

    # figure out representations!
    if args.out or args.out_no_stat:
+        sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
+        sloleks_db.close()

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
@@ -106,8 +93,6 @@ def main(args):
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)

-    # sloleks_db.get_word_form(lemma, gender, number, case)
-    sloleks_db.close()


 if __name__ == '__main__':
@@ -144,6 +129,10 @@ if __name__ == '__main__':
                        help='Generate one output for each syntactic structure',
                        action='store_true')

+    # Opt-in flag; main() passes it to SloleksDatabase as load_sloleks.
+    parser.add_argument('--load-sloleks',
+                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
+                        action='store_true')
+
    parser.add_argument('--sort-by',
                        help="Sort by a this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',