Added s/z, k/h + fixed bug 90 + connecting with sloleks on lemma_fallback

2020-07-08 19:23:56 +02:00 · 2020-07-08 19:23:56 +02:00 · 777791ad1e
commit 777791ad1e
parent ec113f9cd2
12 changed files with 443 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,4 @@ __pycache__

 prev
 old
+data
--- a/src/init.py
+++ b/src/init.py
--- a/src/codes_tagset.py
+++ b/src/codes_tagset.py
@ -1,3 +1,129 @@
+# Translation table from single-letter MSD codes to full feature names.
+#
+# Layout: CODES_TRANSLATION[category][position][code] -> feature name, where
+#   category is the first character of the MSD ("N", "V" or "A"),
+#   position is the index of the attribute inside the MSD string,
+#   code     is the character found at that position.
+# Only the categories and positions listed here can be translated.
+CODES_TRANSLATION = {
+    "N": {
+        2: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        3: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        4: {
+            "n": "nominative",
+            "g": "genitive",
+            "d": "dative",
+            "a": "accusative",
+            "l": "locative",
+            "i": "instrumental",
+        },
+    },
+    "V": {
+        1: {
+            "m": "main",
+            "a": "auxiliary",
+        },
+        3: {
+            "n": "infinitive",
+            "u": "supine",
+            "p": "participle",
+            "r": "present",
+            "f": "future",
+            "c": "conditional",
+            "m": "imperative",
+        },
+        4: {
+            "1": "first",
+            "2": "second",
+            "3": "third",
+        },
+        5: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        6: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        8: {
+            "n": "no",
+            "y": "yes",
+        },
+    },
+    "A": {
+        1: {
+            "g": "general",
+            "s": "possessive",
+        },
+        2: {
+            "p": "positive",
+            "c": "comparative",
+            "s": "superlative",
+        },
+        3: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        4: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        5: {
+            "n": "nominative",
+            "g": "genitive",
+            "d": "dative",
+            "a": "accusative",
+            "l": "locative",
+            "i": "instrumental",
+        },
+    }
+    # The categories and values below are not part of the table yet; they are
+    # kept only as notes for future extension.
+    # "R": "Adverb",
+    # "P": "Pronoun",
+    # "M": "Numeral",
+    # "S": "Preposition",
+    # "C": "Conjunction",
+    # "Q": "Particle",
+    # "I": "Interjection",
+    # "Y": "Abbreviation",
+    # "X": "Residual",
+    #
+    #
+    # "e": "perfective",
+    # "p": "progressive",
+    # "b": "biaspectual",
+    #
+    #
+    # "p": "personal",
+    # "d": "demonstrative",
+    # "r": "relative",
+    # "x": "reflexive",
+    # "q": "interrogative",
+    # "i": "indefinite",
+    # "z": "negative",
+    # "b": "bound",
+    # "d": "digit",
+    # "r": "roman",
+    # "l": "letter",
+    # "c": "cardinal",
+    # "o": "ordinal",
+    # "p": "pronominal",
+    # "s": "special",
+    # "c": "coordinating",
+    # "s": "subordinating",
+    # "f": "foreign",
+    # "t": "typo",
+    # "p": "program",
+    # "w": "web",
+    # "e": "emo",
+    # "h": "hashtag",
+    # "a": "at"
+}
+
 CODES = {
    "Noun": "N",
    "Verb": "V",
--- a/src/loader.py
+++ b/src/loader.py
@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
                dest = l.get('dep')
            else:
                ana = l.get('ana')
-                if ana[:4] != 'syn:': # dont bother...
+                if ana[:8] != 'jos-syn:': # dont bother...
                    continue
-                ana = ana[4:]
+                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
--- a/src/match_store.py
+++ b/src/match_store.py
@ -91,7 +91,14 @@ class MatchStore:
                                   (structure.id,)):
            yield StructureMatch.from_db(self.db, cid[0], structure)

-    def set_representations(self, word_renderer, structures):
+    def add_inserts(self, inserts):
+        """Store the rendered representations of every match in `inserts`.
+
+        One row (match id, component id, text) is written to the
+        Representations table per entry of each match's `representations`.
+        """
+        for stored in inserts:
+            for component_id, text in stored.representations.items():
+                row = (stored.match_id, component_id, text)
+                self.db.execute("""
+                    INSERT INTO Representations (colocation_id, component_id, text) 
+                    VALUES (?,?,?)""", row)
+
+    def set_representations(self, word_renderer, structures, sloleks_db=None):
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            print("Representation step already done, skipping")
@ -105,17 +112,14 @@ class MatchStore:
        for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
-            RepresentationAssigner.set_representations(match, word_renderer)
+            RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)

            inserts.append(match)
            if len(inserts) > num_inserts:
-                for match in inserts:
-                    for component_id, text in match.representations.items():
-                        self.db.execute("""
-                            INSERT INTO Representations (colocation_id, component_id, text) 
-                            VALUES (?,?,?)""", (match.match_id, component_id, text))
+                self.add_inserts(inserts)
                inserts = []

+        self.add_inserts(inserts)
        self.db.step_is_done(step_name)

    def has_colocation_id_enough_frequency(self, colocation_id):
--- a/src/postprocessor.py
+++ b/src/postprocessor.py
@ -0,0 +1,38 @@
+
+class Postprocessor:
+    """Correct one-letter prepositions (s/z, k/h) inside a matched collocation.
+
+    The variant of each preposition is chosen from the first letter of the
+    word that follows it in the collocation id.
+    """
+
+    # First letters after which the preposition is 's'; anything else takes 'z'.
+    _S_INITIALS = frozenset('cčfhkpsšt')
+    # First letters after which the preposition is 'h'; anything else takes 'k'.
+    _H_INITIALS = frozenset('gk')
+
+    def __init__(self, fix_one_letter_words=True):
+        # When False, process() leaves the prepositions untouched.
+        self.fix_one_letter_words = fix_one_letter_words
+
+    @staticmethod
+    def _initial(next_word):
+        """Return the lower-cased first letter of next_word ('' if empty or None)."""
+        return (next_word or '')[:1].lower()
+
+    @staticmethod
+    def fix_sz(next_word):
+        """Return 's' or 'z', whichever fits in front of next_word."""
+        if Postprocessor._initial(next_word) in Postprocessor._S_INITIALS:
+            return 's'
+        return 'z'
+
+    @staticmethod
+    def fix_kh(next_word):
+        """Return 'h' or 'k', whichever fits in front of next_word."""
+        if Postprocessor._initial(next_word) in Postprocessor._H_INITIALS:
+            return 'h'
+        return 'k'
+
+    def process(self, match, collocation_id):
+        """Fix prepositions in place and return (match, normalised collocation id).
+
+        match: mapping from component id to a word object with a `text` attribute.
+        collocation_id: [structure_id, [component_id, lemma], ...]; the pairs
+            must be mutable lists because corrected lemmas are written back.
+
+        The returned collocation id holds the same data with every pair
+        converted to a tuple.
+        """
+        if self.fix_one_letter_words and len(collocation_id) > 2:
+            # Slot 0 is the structure id and the last component has no
+            # follower, so only the slots in between are inspected.
+            for pos in range(1, len(collocation_id) - 1):
+                col_id, word = collocation_id[pos]
+                next_word = collocation_id[pos + 1][1]
+                if word in ('s', 'z'):
+                    correct_letter = self.fix_sz(next_word)
+                elif word in ('k', 'h'):
+                    correct_letter = self.fix_kh(next_word)
+                else:
+                    continue
+                collocation_id[pos][1] = correct_letter
+                match[col_id].text = correct_letter
+        collocation_id = [collocation_id[0]] + [tuple(line) for line in collocation_id[1:]]
+        return match, collocation_id
--- a/src/representation.py
+++ b/src/representation.py
@ -4,6 +4,9 @@ from collections import Counter
 from codes_tagset import TAGSET, CODES
 from word import WordMsdOnly

+from src.word import WordDummy
+
+
 class ComponentRepresentation:
    def __init__(self, data, word_renderer):
        self.data = data
@ -19,23 +22,23 @@ class ComponentRepresentation:
    def add_word(self, word):
        self.words.append(word)

-    def render(self):
+    def render(self, sloleks_db=None):
        if self.rendition_text is None:
-            self.rendition_text = self._render()
+            self.rendition_text = self._render(sloleks_db=sloleks_db)

-    def _render(self):
+    def _render(self, sloleks_db=None):
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))

 class LemmaCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        return self.words[0].lemma if len(self.words) > 0 else None

 class LexisCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        return self.data['lexis']

 class WordFormAllCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        if len(self.words) == 0:
            return None
        else:
@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
            return "/".join(set(forms))

 class WordFormAnyCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        text_forms = {}
        msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
        for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
@ -59,7 +62,24 @@ class WordFormAnyCR(ComponentRepresentation):
        for word_msd, word_lemma in sorted_words:
            # check if agreements match
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
-            
+
+            # in case all agreements do not match try to get data from sloleks and change properly
+            if not all(agreements_matched):
+                if sloleks_db is None:
+                    raise Exception('sloleks_db not properly setup!')
+                for agr in self.agreement:
+                    if not agr.match(word_msd):
+                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
+                        if msd is not None:
+                            agr.msds[0] = msd
+                            agr.words.append(WordDummy(msd, lemma, text))
+                            # agr.words[0].msd = msd
+                            # agr.words[0].text = text
+                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                        else:
+                            break
+
+
            # if we are at the last "backup word", then confirm matches 
            # that worked for this one and return
            if word_lemma is None:
@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
        if self.check_msd(word.msd):
            super().add_word(word)

-    def _render(self):
+    def _render(self, sloleks_db=None):
+        if len(self.words) == 0:
+            if sloleks_db is None:
+                raise Exception('sloleks_db not properly setup!')
+            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
+            if msd is not None:
+                self.words.append(WordDummy(msd, lemma, text))
        self.words.append(WordMsdOnly(self._common_msd()))
-        return super()._render()
+        return super()._render(sloleks_db)
    
    def _common_msd(self):
        msds = sorted(self.msds, key=len)
@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):

        return True

-    def render(self):
+    def render(self, sloleks_db=None):
        pass
--- a/src/representation_assigner.py
+++ b/src/representation_assigner.py
@ -39,7 +39,7 @@ class RepresentationAssigner:
        return self.representation_factory(self.more, word_renderer)

    @staticmethod
-    def set_representations(match, word_renderer):
+    def set_representations(match, word_renderer, sloleks_db=None):
        representations = {}
        for c in match.structure.components:
            representations[c.idx] = []
@ -70,7 +70,7 @@ class RepresentationAssigner:

        for cid, reps in representations.items():
            for rep in reps:
-                rep.render()
+                rep.render(sloleks_db=sloleks_db)
        
        for cid, reps in representations.items():
            reps = [rep.rendition_text for rep in reps]
--- a/src/sloleks_db.py
+++ b/src/sloleks_db.py
@ -0,0 +1,191 @@
+from collections import defaultdict
+from ast import literal_eval
+
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import Session, aliased
+from sqlalchemy import create_engine
+from sqlalchemy import func
+
+from match import StructureMatch
+from representation_assigner import RepresentationAssigner
+from progress_bar import progress
+
+# Lexeme = None
+# LexemeFeature = None
+# SyntacticStructure = None
+# StructureComponent = None
+# Feature = None
+# LexicalUnitLexeme = None
+# LexicalUnit = None
+# LexicalUnitType = None
+# Category = None
+# Sense = None
+# Measure = None
+# LexicalUnitMeasure = None
+# Corpus = None
+# Definition = None
+# WordForm = None
+# WordFormFeature = None
+# FormRepresentation = None
+from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
+
+
+class SloleksDatabase:
+    """Look up inflected word forms in a Sloleks PostgreSQL database.
+
+    The table classes are reflected from the live database when the first
+    instance is created and are published as module-level globals.
+    """
+
+    def __init__(self, db):
+        """Connect to the database and reflect its tables.
+
+        db: credentials formatted as 'user:password:database:host'.
+
+        Raises ValueError when the credentials are missing or do not have
+        exactly four ':'-separated parts.
+        """
+        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+        if not db:
+            raise ValueError("Sloleks database credentials are required ('user:password:database:host')")
+        credentials = db.split(':')
+        if len(credentials) != 4:
+            raise ValueError("Sloleks database credentials must be 'user:password:database:host'")
+        db_user, db_password, db_database, db_host = credentials
+
+        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
+                               pool_recycle=14400)
+        Base = declarative_base()
+        Base.metadata.reflect(engine)
+
+        class Lexeme(Base):
+            __table__ = Base.metadata.tables['jedro_lexeme']
+
+        class LexemeFeature(Base):
+            __table__ = Base.metadata.tables['jedro_lexeme_feature']
+
+        class SyntacticStructure(Base):
+            __table__ = Base.metadata.tables['jedro_syntacticstructure']
+
+        class StructureComponent(Base):
+            __table__ = Base.metadata.tables['jedro_structurecomponent']
+
+        class Feature(Base):
+            __table__ = Base.metadata.tables['jedro_feature']
+
+        class LexicalUnitLexeme(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
+
+        class LexicalUnit(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunit']
+
+        class LexicalUnitType(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunittype']
+
+        class Category(Base):
+            __table__ = Base.metadata.tables['jedro_category']
+
+        class Sense(Base):
+            __table__ = Base.metadata.tables['jedro_sense']
+
+        class Measure(Base):
+            __table__ = Base.metadata.tables['jedro_measure']
+
+        class LexicalUnitMeasure(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
+
+        class Corpus(Base):
+            __table__ = Base.metadata.tables['jedro_corpus']
+
+        class Definition(Base):
+            __table__ = Base.metadata.tables['jedro_definition']
+
+        class WordForm(Base):
+            __table__ = Base.metadata.tables['jedro_wordform']
+
+        class WordFormFeature(Base):
+            __table__ = Base.metadata.tables['jedro_wordform_feature']
+
+        class FormRepresentation(Base):
+            __table__ = Base.metadata.tables['jedro_formrepresentation']
+
+        self.session = Session(engine)
+
+    def close(self):
+        """Close the database session."""
+        self.session.close()
+
+    def decypher_msd(self, msd):
+        """Translate an MSD into the list of feature names used for the lookup.
+
+        msd: MSD as a string or a list of single characters.
+
+        Returns [number, case] for nouns, [vform, number, 'third'] for verbs
+        and [gender, number, case] for adjectives. Returns [] for any other
+        category, for an MSD too short to hold the needed positions, and for
+        codes missing from CODES_TRANSLATION (e.g. placeholders).
+        """
+        if not msd:
+            return []
+        t = msd[0]
+        try:
+            if t == 'N':
+                number = CODES_TRANSLATION[t][3][msd[3]]
+                case = CODES_TRANSLATION[t][4][msd[4]]
+                return [number, case]
+            if t == 'V':
+                vform = CODES_TRANSLATION[t][3][msd[3]]
+                number = CODES_TRANSLATION[t][5][msd[5]]
+                # NOTE(review): person is always 'third'; msd[4] is ignored - confirm this is intended.
+                return [vform, number, 'third']
+            if t == 'A':
+                gender = CODES_TRANSLATION[t][3][msd[3]]
+                number = CODES_TRANSLATION[t][4][msd[4]]
+                case = CODES_TRANSLATION[t][5][msd[5]]
+                return [gender, number, case]
+        except (IndexError, KeyError):
+            # Short MSD or untranslatable code: report "nothing to look up".
+            return []
+        return []
+
+    def get_word_form(self, lemma, msd, data, align_msd=False):
+        """Find a form of `lemma` that carries the requested features.
+
+        lemma: lemma to look up.
+        msd: starting MSD; it is adjusted before the lookup (see below).
+        data: component description; its 'msd' mapping (attribute -> value)
+            or its 'agreement' list (attribute names) drives the adjustment.
+        align_msd: MSD of the word to agree with. When given, the attributes
+            listed in data['agreement'] are copied from it into `msd`;
+            otherwise the attributes in data['msd'] are forced into `msd`.
+
+        Returns (msd, lemma, form) for the first matching form, or
+        (None, None, None) when the MSD cannot be adjusted or translated or
+        no form is found.
+        """
+        msd = list(msd)
+
+        if not align_msd and 'msd' in data:
+            t = msd[0]
+            for key, value in data['msd'].items():
+                v = TAGSET[t].index(key.lower())
+                msd[v + 1] = CODES[value]
+
+        elif align_msd and 'agreement' in data:
+            align_msd = list(align_msd)
+            t_align_msd = align_msd[0]
+            t = msd[0]
+
+            for att in data['agreement']:
+                v_align_msd = TAGSET[t_align_msd].index(att.lower())
+                v = TAGSET[t].index(att.lower())
+                # Attribute v lives at position v + 1 (position 0 is the
+                # category), so both MSDs must be long enough for that slot.
+                if v + 1 >= len(msd) or v_align_msd + 1 >= len(align_msd):
+                    return None, None, None
+                msd[v + 1] = align_msd[v_align_msd + 1]
+
+        decypher_msd = self.decypher_msd(msd)
+
+        if not decypher_msd:
+            return None, None, None
+
+        query = self.session.query(FormRepresentation.form) \
+            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
+            .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
+            .filter(Lexeme.lemma == lemma)
+
+        # One aliased join per required feature, so that a single word form
+        # has to carry all of them at once.
+        for msd_el in decypher_msd:
+            wf = aliased(WordFormFeature)
+            query = query.join(wf, wf.word_form_id == WordForm.id).filter(wf.value == msd_el)
+
+        row = query.first()
+        if row is not None:
+            return ''.join(msd), lemma, row[0]
+        return None, None, None
--- a/src/syntactic_structure.py
+++ b/src/syntactic_structure.py
@ -14,7 +14,7 @@ class SyntacticStructure:
    @staticmethod
    def from_xml(xml):
        st = SyntacticStructure()
-        st.id = xml.get('id')
+        st.id = xml.get('id_nsss')
        st.lbs = xml.get('LBS')

        assert len(list(xml)) == 1
--- a/src/wani.py
+++ b/src/wani.py
@ -11,6 +11,7 @@ import concurrent.futures
 import tempfile

 from progress_bar import progress
+from sloleks_db import SloleksDatabase
 from word import Word
 from syntactic_structure import build_structures
 from match_store import MatchStore
@ -20,16 +21,20 @@ from loader import load_files
 from database import Database
 from time_info import TimeInfo

+from src.postprocessor import Postprocessor

-def match_file(words, structures):
+
+def match_file(words, structures, postprocessor):
    matches = {s: [] for s in structures}

    for s in progress(structures, "matching"):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
-                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
+                # Pairs are built as lists (not tuples) so the postprocessor can
+                # rewrite lemmas in place; process() returns them as tuples again.
+                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
+                match, colocation_id = postprocessor.process(match, colocation_id)
                colocation_id = tuple(colocation_id)

                matches[s].append((match, colocation_id))
@ -38,6 +43,7 @@ def match_file(words, structures):


 def main(args):
+    sloleks_db = SloleksDatabase(args.sloleks_db)
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

@ -51,7 +57,11 @@ def main(args):
            continue

        start_time = time.time()
-        matches = match_file(words, structures)
+        postprocessor = Postprocessor()
+        matches = match_file(words, structures, postprocessor)
+
+        # matches = .process()
+        # TODO Add postprocessing here or inside previous function!
        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()
@ -74,7 +84,7 @@ def main(args):

    # figure out representations!
    if args.out or args.out_no_stat:
-        match_store.set_representations(word_stats, structures)
+        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
@ -85,6 +95,10 @@ def main(args):
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)

+    # sloleks_db.get_word_form(lemma, gender, number, case)
+    sloleks_db.close()
+
+
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
@ -92,6 +106,7 @@ if __name__ == '__main__':
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
+    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
@ -100,7 +115,7 @@ if __name__ == '__main__':
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')
-
+#
    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
--- a/src/word.py
+++ b/src/word.py
@ -14,6 +14,16 @@ class WordMsdOnly:
        return None


+class WordDummy:
+    """Minimal word stand-in that only carries an MSD, a lemma and a text."""
+
+    def __init__(self, msd, lemma, text):
+        self.msd, self.lemma, self.text = msd, lemma, text
+
+    def most_frequent_text(self, word_renderer):
+        """Return what word_renderer.render gives for this word's lemma and MSD."""
+        lemma, msd = self.lemma, self.msd
+        return word_renderer.render(lemma, msd)
+
+
 class Word:
    def __init__(self, lemma, msd, wid, text, do_msd_translate):
        self.lemma = lemma
@ -29,7 +39,7 @@ class Word:
        self.int_id = int(last_num)

        assert None not in (self.id, self.lemma, self.msd)
-    
+
    @staticmethod
    def from_xml(xml, do_msd_translate):
        lemma = xml.get('lemma')
@ -41,10 +51,10 @@ class Word:
    @staticmethod
    def get_msd(comp):
        d = dict(comp.items())
-        if 'msd' in d:
-            return d['msd']
-        elif 'ana' in d:
+        if 'ana' in d:
            return d['ana'][4:]
+        elif 'msd' in d:
+            return d['msd']
        else:
            logging.error(d)
            raise NotImplementedError("MSD?")