From f330a37764f102f86d761645f955b1cb76b37b3d Mon Sep 17 00:00:00 2001
From: Luka
Date: Wed, 22 Jul 2020 11:16:28 +0200
Subject: [PATCH] Improved representations speed + Fixed bug in representations

---
 run.sh                |  2 +-
 src/codes_tagset.py   | 76 ++++++++++++++++------------------
 src/match_store.py    |  7 +++-
 src/representation.py | 14 ++++---
 src/sloleks_db.py     | 95 ++++++++++++++++++++++++++++++++++++-------
 src/wani.py           | 23 +++-------
 6 files changed, 137 insertions(+), 80 deletions(-)

diff --git a/run.sh b/run.sh
index 6593b89..7137826 100755
--- a/run.sh
+++ b/run.sh
@@ -1 +1 @@
-pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output
+pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks
\ No newline at end of file
diff --git a/src/codes_tagset.py b/src/codes_tagset.py
index be2ffcf..75d0cdc 100644
--- a/src/codes_tagset.py
+++ b/src/codes_tagset.py
@@ -1,16 +1,41 @@
+POSSIBLE_WORD_FORM_FEATURE_VALUES = {
+    "singular",
+    "dual",
+    "plural",
+    "nominative",
+    "genitive",
+    "dative",
+    "accusative",
+    "locative",
+    "instrumental",
+    "infinitive",
+    "supine",
+    "participle",
+    "present",
+    "future",
+    "conditional",
+    "imperative",
+    'masculine',
+    'feminine',
+    'neuter',
+}
+
 CODES_TRANSLATION = {
     "N": {
         2: {
+            '-': 'masculine',
             'm': 'masculine',
             'f': 'feminine',
             'n': 'neuter',
         },
         3: {
+            "-": "singular",
             "s": "singular",
             "d": "dual",
             "p": "plural",
         },
         4: {
+            "-": "nominative",
             "n": "nominative",
             "g": "genitive",
             "d": "dative",
@@ -21,10 +46,12 @@ CODES_TRANSLATION = {
     },
     "V": {
         1: {
+            "-": "main",
             "m": "main",
             "a": "auxiliary",
         },
         3: {
+            "-": "infinitive",
             "n": "infinitive",
             "u": "supine",
             "p": "participle",
@@ -34,46 +61,55 @@ CODES_TRANSLATION = {
             "m": "imperative",
         },
         4: {
+            "-": "first",
             "1": "first",
             "2": "second",
             "3": "third",
         },
         5: {
+            "-": "singular",
             "s": "singular",
             "d": "dual",
             "p": "plural",
         },
         6: {
+            '-': 'masculine',
             'm': 'masculine',
             'f': 'feminine',
             'n': 'neuter',
         },
         8: {
+            "-": "no",
             "n": "no",
             "y": "yes",
         },
     },
     "A": {
         1: {
+            "-": "general",
             "g": "general",
             "s": "possessive",
         },
         2: {
+            "-": "positive",
             "p": "positive",
             "c": "comparative",
             "s": "superlative",
         },
         3: {
+            '-': 'masculine',
             'm': 'masculine',
             'f': 'feminine',
             'n': 'neuter',
         },
         4: {
+            "-": "singular",
             "s": "singular",
             "d": "dual",
             "p": "plural",
         },
         5: {
+            "-": "nominative",
             "n": "nominative",
             "g": "genitive",
             "d": "dative",
@@ -82,46 +118,6 @@ CODES_TRANSLATION = {
             "i": "instrumental",
         },
     }
-    # "R": "Adverb",
-    # "P": "Pronoun",
-    # "M": "Numeral",
-    # "S": "Preposition",
-    # "C": "Conjunction",
-    # "Q": "Particle",
-    # "I": "Interjection",
-    # "Y": "Abbreviation",
-    # "X": "Residual",
-    #
-    #
-    # "e": "perfective",
-    # "p": "progressive",
-    # "b": "biaspectual",
-    #
-    #
-    # "p": "personal",
-    # "d": "demonstrative",
-    # "r": "relative",
-    # "x": "reflexive",
-    # "q": "interrogative",
-    # "i": "indefinite",
-    # "z": "negative",
-    # "b": "bound",
-    # "d": "digit",
-    # "r": "roman",
-    # "l": "letter",
-    # "c": "cardinal",
-    # "o": "ordinal",
-    # "p": "pronominal",
-    # "s": "special",
-    # "c": "coordinating",
-    # "s": "subordinating",
-    # "f": "foreign",
-    # "t": "typo",
-    # "p": "program",
-    # "w": "web",
-    # "e": "emo",
-    # "h": "hashtag",
-    # "a: "at""
 }
 
 CODES = {
diff --git a/src/match_store.py b/src/match_store.py
index 6025200..ed5bcb0 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -1,5 +1,7 @@
+import gc
 from collections import defaultdict
 from ast import literal_eval
+from time import time
 
 from match import StructureMatch
 from representation_assigner import RepresentationAssigner
@@ -110,6 +112,7 @@ class MatchStore:
 
         structures_dict = {s.id: s for s in structures}
         num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
+        start_time = time()
         for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
             structure = structures_dict[sid]
             match = StructureMatch.from_db(self.db, cid, structure)
@@ -119,7 +122,9 @@ class MatchStore:
             if len(inserts) > num_inserts:
                 self.add_inserts(inserts)
                 inserts = []
-
+            if time() - start_time > 5:
+                start_time = time()
+                gc.collect()
         self.add_inserts(inserts)
         self.db.step_is_done(step_name)
 
diff --git a/src/representation.py b/src/representation.py
index 2a7842a..f205d62 100644
--- a/src/representation.py
+++ b/src/representation.py
@@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
 class WordFormAllCR(ComponentRepresentation):
     def _render(self, sloleks_db=None):
         if len(self.words) == 0:
-            return None
+            return None, None
         else:
             forms = [w.text.lower() for w in self.words]
             msds = [w.msd for w in self.words]
@@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
             if not all(agreements_matched):
                 if sloleks_db is None:
                     raise Exception('sloleks_db not properly setup!')
-                for agr in self.agreement:
+                for i, agr in enumerate(self.agreement):
                     if not agr.match(word_msd):
                         msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
                         if msd is not None:
                             agr.msds[0] = msd
                             agr.words.append(WordDummy(msd, lemma, text))
-                            # agr.words[0].msd = msd
-                            # agr.words[0].text = text
-                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                            # when we find element in sloleks automatically add it (no need for second checks, since msd
+                            # is tailored to pass tests by default)
+                            agr.rendition_candidate = text
+                            agr.rendition_msd_candidate = msd
+                            agreements_matched[i] = True
                         else:
                             break
 
@@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
                 agr.confirm_match()
 
             return text_forms[(word_msd, word_lemma)], word_msd
+        return None, None
 
 
 class WordFormMsdCR(WordFormAnyCR):
@@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
 
         common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) else msds[0][idx] for idx in range(len(msds[0]))]
         common_msd = "".join(common_msd)
-        iommon_msd = "".join(common_msd)
         return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
 
 
diff --git a/src/sloleks_db.py b/src/sloleks_db.py
index e20b537..ba3d3b6 100644
--- a/src/sloleks_db.py
+++ b/src/sloleks_db.py
@@ -1,3 +1,5 @@
+import gc
+
 from psycopg2cffi import compat
 compat.register()
 
@@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
 
-from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
+from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
 
 
 class SloleksDatabase:
-    def __init__(self, db):
+    def __init__(self, db, load_sloleks):
         global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
 
         [db_user, db_password, db_database, db_host] = db.split(':')
@@ -71,12 +73,65 @@ class SloleksDatabase:
 
         self.session = Session(engine)
 
+        self.load_sloleks = load_sloleks
+        if self.load_sloleks:
+            self.init_load_sloleks()
+
+    def init_load_sloleks(self):
+        query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
+        word_form_features = query_word_form_features.all()
+        query_form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form)
+        form_representations = query_form_representations.all()
+        query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
+        word_forms = query_word_forms.all()
+        query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
+        lexemes = query_lexemes.all()
+
+        self.lemmas = {}
+        for lexeme in lexemes:
+            if lexeme.lemma not in self.lemmas:
+                self.lemmas[lexeme.lemma] = []
+            self.lemmas[lexeme.lemma].append(lexeme.id)
+
+        self.word_form_features = {}
+        for word_form_feature in word_form_features:
+            if word_form_feature.value not in POSSIBLE_WORD_FORM_FEATURE_VALUES:
+                continue
+            if word_form_feature.word_form_id not in self.word_form_features:
+                self.word_form_features[word_form_feature.word_form_id] = set()
+            self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)
+
+        self.form_representations = {form_representation.word_form_id: form_representation.form for form_representation
+                                     in form_representations}
+
+        self.word_forms = {}
+        for word_form in word_forms:
+            if word_form.lexeme_id not in self.word_forms:
+                self.word_forms[word_form.lexeme_id] = []
+            self.word_forms[word_form.lexeme_id].append(word_form.id)
+
+
+        self.connected_lemmas = {}
+        for lemma, lemma_ids in self.lemmas.items():
+            for lemma_id in lemma_ids:
+                if lemma_id in self.word_forms:
+                    for word_form_id in self.word_forms[lemma_id]:
+                        if word_form_id in self.word_form_features and word_form_id in self.form_representations:
+                            if lemma not in self.connected_lemmas:
+                                self.connected_lemmas[lemma] = []
+                            self.connected_lemmas[lemma].append((self.word_form_features[word_form_id], self.form_representations[word_form_id]))
+
+        del self.lemmas, self.word_form_features, self.form_representations, self.word_forms
+        gc.collect()
+
+
     def close(self):
         self.session.close()
 
     def decypher_msd(self, msd):
         t = msd[0]
        decypher = []
+        # IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
         if t == 'N':
             # gender = CODES_TRANSLATION[t][2][msd[2]]
             number = CODES_TRANSLATION[t][3][msd[3]]
@@ -118,7 +173,6 @@ class SloleksDatabase:
             # fix for verbs with short msds
             if v + 1 >= len(msd):
                 msd = msd + ['-' for _ in range(v - len(msd) + 2)]
-                # return None, None, None
 
             msd[v + 1] = align_msd[v_align_msd + 1]
 
@@ -127,20 +181,31 @@ class SloleksDatabase:
         if not decypher_msd:
             return None, None, None
 
-        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
-        query_preposition = self.session.query(FormRepresentation.form) \
-            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
-            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
+        if self.load_sloleks and lemma in self.connected_lemmas:
+            for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
+                fits = True
+                for d_m in decypher_msd:
+                    if d_m not in word_form_features:
+                        fits = False
+                        break
+                if fits:
+                    break
+            return ''.join(msd), lemma, form_representations
+        else:
+            wfs = [aliased(WordFormFeature) for _ in decypher_msd]
+            query_preposition = self.session.query(FormRepresentation.form) \
+                .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
+                .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
 
-        for wf in wfs:
-            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
+            for wf in wfs:
+                query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
 
-        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
+            query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
 
-        for wf, msd_el in zip(wfs, decypher_msd):
-            query_preposition = query_preposition.filter(wf.value == msd_el)
+            for wf, msd_el in zip(wfs, decypher_msd):
+                query_preposition = query_preposition.filter(wf.value == msd_el)
 
-        pattern_translation_hws = query_preposition.all()
-        if len(pattern_translation_hws) > 0:
-            return ''.join(msd), lemma, pattern_translation_hws[0][0]
+            pattern_translation_hws = query_preposition.limit(1).all()
+            if len(pattern_translation_hws) > 0:
+                return ''.join(msd), lemma, pattern_translation_hws[0][0]
         return None, None, None
diff --git a/src/wani.py b/src/wani.py
index f10b5ef..b7f5b15 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
         for w in words:
             mhere = s.match(w)
             for match in mhere:
-                # colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                 colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                 colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                 match, collocation_id = postprocessor.process(match, colocation_id)
                 colocation_id = tuple(colocation_id)
 
                 matches[s].append((match, colocation_id))
 
-    # for key, val in matches.items():
-    #     if key.id == '15':
-    #         for el in val:
-    #             if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
-    #                 word_id = '.'.join(words[0].id.split('.')[:-1])
-    #                 print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
-    #                 print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
-
-    # if s.id == '15':
-    #     if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
-    #         word_id = '.'.join(match['1'].id.split('.')[:-1])
-    #         print(f"ID: {word_id}")
-    #         print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
     return matches
 
 
 def main(args):
-    sloleks_db = SloleksDatabase(args.sloleks_db)
     structures, lemma_msds, max_num_components = build_structures(args)
     timeinfo = TimeInfo(len(args.input))
 
@@ -95,7 +80,9 @@ def main(args):
 
     # figure out representations!
    if args.out or args.out_no_stat:
+        sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
         match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
+        sloleks_db.close()
         Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
             structures, match_store)
 
@@ -106,8 +93,6 @@ def main(args):
         Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
             structures, match_store)
 
-    # sloleks_db.get_word_form(lemma, gender, number, case)
-    sloleks_db.close()
 
 
 if __name__ == '__main__':
@@ -144,6 +129,10 @@ if __name__ == '__main__':
                         help='Generate one output for each syntactic structure',
                         action='store_true')
 
+    parser.add_argument('--load-sloleks',
+                        help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
+                        action='store_true')
+
    parser.add_argument('--sort-by',
                         help="Sort by a this column (index)", type=int, default=-1)
     parser.add_argument('--sort-reversed',
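A minimal, illustrative sketch of the lookup strategy that init_load_sloleks and get_word_form introduce in this patch: word forms are grouped by lemma once up front, and each later lookup only scans that lemma's candidates for one whose feature set covers the requested MSD features. The function names and sample data below are hypothetical and independent of the project's SQLAlchemy models; they are not part of the codebase.

    # Sketch only: mirrors the idea behind the in-memory Sloleks index in this
    # patch, using made-up data instead of the Sloleks database.
    from collections import defaultdict

    def build_index(rows):
        # rows: iterable of (lemma, features, form); result maps lemma -> [(features, form)]
        index = defaultdict(list)
        for lemma, features, form in rows:
            index[lemma].append((frozenset(features), form))
        return index

    def find_form(index, lemma, wanted_features):
        # Return the first stored form whose feature set contains every wanted feature.
        for features, form in index.get(lemma, []):
            if all(f in features for f in wanted_features):
                return form
        return None

    rows = [
        ("miza", {"singular", "nominative", "feminine"}, "miza"),
        ("miza", {"plural", "genitive", "feminine"}, "miz"),
    ]
    index = build_index(rows)
    print(find_form(index, "miza", {"plural", "genitive"}))  # -> "miz"

Trading memory for speed this way replaces the per-component SQL query with multiple joins (the non --load-sloleks path in get_word_form) by a dictionary lookup plus a short scan, which is the cost the patch is avoiding.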