Improved representations speed + Fixed bug in representations

2020-07-22 11:16:28 +02:00
parent 4c84873ff5
commit f330a37764
6 changed files with 137 additions and 80 deletions
@@ -1 +1 @@
-pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output 
+pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output  --load-sloleks
@@ -1,16 +1,41 @@
 # Word-form feature values that are kept when Sloleks word-form features are
 # loaded into memory (SloleksDatabase.init_load_sloleks skips every feature
 # value that is not listed here).
 POSSIBLE_WORD_FORM_FEATURE_VALUES = {
     # number
     'singular', 'dual', 'plural',
     # case
     'nominative', 'genitive', 'dative', 'accusative', 'locative', 'instrumental',
     # verb form -- NOTE(review): grouping label inferred from CODES_TRANSLATION["V"][3]
     'infinitive', 'supine', 'participle', 'present', 'future', 'conditional', 'imperative',
     # gender
     'masculine', 'feminine', 'neuter',
 }
 CODES_TRANSLATION = {
    "N": {
        2: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        3: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        4: {
            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
@@ -21,10 +46,12 @@ CODES_TRANSLATION = {
    },
    "V": {
        1: {
            "-": "main",
            "m": "main",
            "a": "auxiliary",
        },
        3: {
            "-": "infinitive",
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
@@ -34,46 +61,55 @@ CODES_TRANSLATION = {
            "m": "imperative",
        },
        4: {
            "-": "first",
            "1": "first",
            "2": "second",
            "3": "third",
        },
        5: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        6: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        8: {
            "-": "no",
            "n": "no",
            "y": "yes",
        },
    },
    "A": {
        1: {
            "-": "general",
            "g": "general",
            "s": "possessive",
        },
        2: {
            "-": "positive",
            "p": "positive",
            "c": "comparative",
            "s": "superlative",
        },
        3: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        4: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        5: {
            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
@@ -82,46 +118,6 @@ CODES_TRANSLATION = {
            "i": "instrumental",
        },
    }
    # "R": "Adverb",
    # "P": "Pronoun",
    # "M": "Numeral",
    # "S": "Preposition",
    # "C": "Conjunction",
    # "Q": "Particle",
    # "I": "Interjection",
    # "Y": "Abbreviation",
    # "X": "Residual",
    #
    #
    # "e": "perfective",
    # "p": "progressive",
    # "b": "biaspectual",
    #
    #
    # "p": "personal",
    # "d": "demonstrative",
    # "r": "relative",
    # "x": "reflexive",
    # "q": "interrogative",
    # "i": "indefinite",
    # "z": "negative",
    # "b": "bound",
    # "d": "digit",
    # "r": "roman",
    # "l": "letter",
    # "c": "cardinal",
    # "o": "ordinal",
    # "p": "pronominal",
    # "s": "special",
    # "c": "coordinating",
    # "s": "subordinating",
    # "f": "foreign",
    # "t": "typo",
    # "p": "program",
    # "w": "web",
    # "e": "emo",
    # "h": "hashtag",
    # "a: "at""
 }
 CODES = {
@@ -1,5 +1,7 @@
 import gc
 from collections import defaultdict
 from ast import literal_eval
 from time import time
 from match import StructureMatch
 from representation_assigner import RepresentationAssigner
@@ -110,6 +112,7 @@ class MatchStore:
        structures_dict = {s.id: s for s in structures}
        num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
        start_time = time()
        for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
@@ -119,7 +122,9 @@ class MatchStore:
            if len(inserts) > num_inserts:
                self.add_inserts(inserts)
                inserts = []
-
+            if time() - start_time > 5:
                start_time = time()
                gc.collect()
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)
@@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
 class WordFormAllCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
        if len(self.words) == 0:
-            return None
+            return None, None
        else:
            forms = [w.text.lower() for w in self.words]
            msds = [w.msd for w in self.words]
@@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
            if not all(agreements_matched):
                if sloleks_db is None:
                    raise Exception('sloleks_db not properly setup!')
-                for agr in self.agreement:
+                for i, agr in enumerate(self.agreement):
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
                        if msd is not None:
                            agr.msds[0] = msd
                            agr.words.append(WordDummy(msd, lemma, text))
-                            # agr.words[0].msd = msd
+                            # when we find element in sloleks automatically add it (no need for second checks, since msd
-                            # agr.words[0].text = text
+                            # is tailored to pass tests by default)
-                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                            agr.rendition_candidate = text
                            agr.rendition_msd_candidate = msd
                            agreements_matched[i] = True
                        else:
                            break
@@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
                    agr.confirm_match()
                return text_forms[(word_msd, word_lemma)], word_msd
        return None, None
 class WordFormMsdCR(WordFormAnyCR):
@@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
        common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) 
                      else msds[0][idx] for idx in range(len(msds[0]))]
        common_msd = "".join(common_msd)
        iommon_msd = "".join(common_msd)
        return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
@@ -1,3 +1,5 @@
 import gc
 from psycopg2cffi import compat
 compat.register()
@@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
-from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
+from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
 class SloleksDatabase:
-    def __init__(self, db):
+    def __init__(self, db, load_sloleks):
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')
@@ -71,12 +73,65 @@ class SloleksDatabase:
        self.session = Session(engine)
        self.load_sloleks = load_sloleks
        if self.load_sloleks:
            self.init_load_sloleks()
    def init_load_sloleks(self):
        """Build the in-memory lookup table ``self.connected_lemmas``.

        Four column pairs are fetched in full through ``self.session`` and
        joined in Python.  The result maps each lemma to a list of
        ``(feature_values, form)`` tuples, one tuple per word form that has
        both at least one retained feature value and a form representation.
        Feature values that are not in ``POSSIBLE_WORD_FORM_FEATURE_VALUES``
        are ignored.  The intermediate indexes are stored on the instance
        while the table is built and removed again before returning.
        """
        # Each query is built and executed immediately, one after another.
        feature_rows = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
        representation_rows = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form).all()
        word_form_rows = self.session.query(WordForm.id, WordForm.lexeme_id).all()
        lexeme_rows = self.session.query(Lexeme.id, Lexeme.lemma).all()

        # lemma -> [lexeme id, ...]
        self.lemmas = {}
        for row in lexeme_rows:
            self.lemmas.setdefault(row.lemma, []).append(row.id)

        # word form id -> {retained feature value, ...}
        self.word_form_features = {}
        for row in feature_rows:
            if row.value in POSSIBLE_WORD_FORM_FEATURE_VALUES:
                self.word_form_features.setdefault(row.word_form_id, set()).add(row.value)

        # word form id -> form (a later row for the same id replaces an earlier one)
        self.form_representations = {}
        for row in representation_rows:
            self.form_representations[row.word_form_id] = row.form

        # lexeme id -> [word form id, ...]
        self.word_forms = {}
        for row in word_form_rows:
            self.word_forms.setdefault(row.lexeme_id, []).append(row.id)

        # lemma -> [(feature value set, form), ...]; a lemma gets an entry only
        # once at least one of its word forms qualifies.
        self.connected_lemmas = {}
        for lemma, lexeme_ids in self.lemmas.items():
            for lexeme_id in lexeme_ids:
                for word_form_id in self.word_forms.get(lexeme_id, ()):
                    if word_form_id not in self.word_form_features:
                        continue
                    if word_form_id not in self.form_representations:
                        continue
                    entry = (self.word_form_features[word_form_id], self.form_representations[word_form_id])
                    self.connected_lemmas.setdefault(lemma, []).append(entry)

        # Drop the intermediate indexes and collect right away
        # (presumably to release their memory promptly).
        for temp_name in ('lemmas', 'word_form_features', 'form_representations', 'word_forms'):
            delattr(self, temp_name)
        gc.collect()
    def close(self):
        """Close the session held in ``self.session``."""
        session = self.session
        session.close()
    def decypher_msd(self, msd):
        t = msd[0]
        decypher = []
        # IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
        if t == 'N':
            # gender = CODES_TRANSLATION[t][2][msd[2]]
            number = CODES_TRANSLATION[t][3][msd[3]]
@@ -118,7 +173,6 @@ class SloleksDatabase:
                # fix for verbs with short msds
                if v + 1 >= len(msd):
                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
                    # return None, None, None
                msd[v + 1] = align_msd[v_align_msd + 1]
@@ -127,20 +181,31 @@ class SloleksDatabase:
        if not decypher_msd:
            return None, None, None
-        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
+        if self.load_sloleks and lemma in self.connected_lemmas:
-        query_preposition = self.session.query(FormRepresentation.form) \
+            for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
-            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
+                fits = True
-            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
+                for d_m in decypher_msd:
                    if d_m not in word_form_features:
                        fits = False
                        break
                if fits:
                    break
            return ''.join(msd), lemma, form_representations
        else:
            wfs = [aliased(WordFormFeature) for _ in decypher_msd]
            query_preposition = self.session.query(FormRepresentation.form) \
                .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
                .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
-        for wf in wfs:
+            for wf in wfs:
-            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
+                query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
-        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
+            query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
-        for wf, msd_el in zip(wfs, decypher_msd):
+            for wf, msd_el in zip(wfs, decypher_msd):
-            query_preposition = query_preposition.filter(wf.value == msd_el)
+                query_preposition = query_preposition.filter(wf.value == msd_el)
-        pattern_translation_hws = query_preposition.all()
+            pattern_translation_hws = query_preposition.limit(1).all()
-        if len(pattern_translation_hws) > 0:
+            if len(pattern_translation_hws) > 0:
-            return ''.join(msd), lemma, pattern_translation_hws[0][0]
+                return ''.join(msd), lemma, pattern_translation_hws[0][0]
        return None, None, None
@@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                match, collocation_id = postprocessor.process(match, colocation_id)
                colocation_id = tuple(colocation_id)
                matches[s].append((match, colocation_id))
                # for key, val in matches.items():
                #     if key.id == '15':
                #         for el in val:
                #             if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
                #                 word_id = '.'.join(words[0].id.split('.')[:-1])
                #                 print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
                #                 print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
                # if s.id == '15':
                #     if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
                #         word_id = '.'.join(match['1'].id.split('.')[:-1])
                #         print(f"ID: {word_id}")
                #         print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
    return matches
 def main(args):
    sloleks_db = SloleksDatabase(args.sloleks_db)
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))
@@ -95,7 +80,9 @@ def main(args):
    # figure out representations!
    if args.out or args.out_no_stat:
        sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        sloleks_db.close()
    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
@@ -106,8 +93,6 @@ def main(args):
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    # sloleks_db.get_word_form(lemma, gender, number, case)
    sloleks_db.close()
 if __name__ == '__main__':
@@ -144,6 +129,10 @@ if __name__ == '__main__':
                        help='Generate one output for each syntactic structure',
                        action='store_true')
    # Opt-in switch; its value is handed to SloleksDatabase as ``load_sloleks``.
    parser.add_argument('--load-sloleks',
                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
                        action='store_true')
    parser.add_argument('--sort-by',
                        help="Sort by a this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
		`@@ -1 +1 @@`
			`pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output`				`pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks`