Added s/z, k/h + fixed bug 90 + connecting with sloleks on lemma_fallback

2020-07-08 19:23:56 +02:00 · 2020-07-08 19:23:56 +02:00 · 777791ad1e
commit 777791ad1e
parent ec113f9cd2
12 changed files with 443 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,4 @@ __pycache__
 prev
 old
 data
--- a/src/init.py
+++ b/src/init.py
--- a/src/codes_tagset.py
+++ b/src/codes_tagset.py
@ -1,3 +1,129 @@
 # Translation table from single-letter MSD codes to feature names.
 # Layout: part-of-speech letter (first MSD character) -> character position
 # inside the MSD string -> code letter at that position -> feature name.
 # It is indexed as CODES_TRANSLATION[msd[0]][position][msd[position]].
 # Only nouns ("N"), verbs ("V") and adjectives ("A") are covered, and only
 # the positions listed below.
 CODES_TRANSLATION = {
    "N": {
        2: {  # gender
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        3: {  # number
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        4: {  # case
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
            "a": "accusative",
            "l": "locative",
            "i": "instrumental",
        },
    },
    "V": {
        1: {  # verb type
            "m": "main",
            "a": "auxiliary",
        },
        3: {  # verb form
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
            "r": "present",
            "f": "future",
            "c": "conditional",
            "m": "imperative",
        },
        4: {  # person
            "1": "first",
            "2": "second",
            "3": "third",
        },
        5: {  # number
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        6: {  # gender
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        # NOTE(review): yes/no flag, presumably negation. Position 7 is not
        # listed and nothing visible reads this entry - confirm that 8 is the
        # intended position.
        8: {
            "n": "no",
            "y": "yes",
        },
    },
    "A": {
        1: {  # adjective type
            "g": "general",
            "s": "possessive",
        },
        2: {  # degree
            "p": "positive",
            "c": "comparative",
            "s": "superlative",
        },
        3: {  # gender
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        4: {  # number
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        5: {  # case
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
            "a": "accusative",
            "l": "locative",
            "i": "instrumental",
        },
    }
    # The entries below are commented out, i.e. they are NOT part of the
    # table; they are kept only as a reference list of further tagset values.
    # "R": "Adverb",
    # "P": "Pronoun",
    # "M": "Numeral",
    # "S": "Preposition",
    # "C": "Conjunction",
    # "Q": "Particle",
    # "I": "Interjection",
    # "Y": "Abbreviation",
    # "X": "Residual",
    #
    #
    # "e": "perfective",
    # "p": "progressive",
    # "b": "biaspectual",
    #
    #
    # "p": "personal",
    # "d": "demonstrative",
    # "r": "relative",
    # "x": "reflexive",
    # "q": "interrogative",
    # "i": "indefinite",
    # "z": "negative",
    # "b": "bound",
    # "d": "digit",
    # "r": "roman",
    # "l": "letter",
    # "c": "cardinal",
    # "o": "ordinal",
    # "p": "pronominal",
    # "s": "special",
    # "c": "coordinating",
    # "s": "subordinating",
    # "f": "foreign",
    # "t": "typo",
    # "p": "program",
    # "w": "web",
    # "e": "emo",
    # "h": "hashtag",
    # "a": "at"
 }
 CODES = {
    "Noun": "N",
    "Verb": "V",
--- a/src/loader.py
+++ b/src/loader.py
@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
                dest = l.get('dep')
            else:
                ana = l.get('ana')
-                if ana[:4] != 'syn:': # dont bother...
+                if ana[:8] != 'jos-syn:': # dont bother...
                    continue
-                ana = ana[4:]
+                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()
            if lfrom in words:
--- a/src/match_store.py
+++ b/src/match_store.py
@ -91,7 +91,14 @@ class MatchStore:
                                   (structure.id,)):
            yield StructureMatch.from_db(self.db, cid[0], structure)
-    def set_representations(self, word_renderer, structures):
+    def add_inserts(self, inserts):
        for match in inserts:
            for component_id, text in match.representations.items():
                self.db.execute("""
                    INSERT INTO Representations (colocation_id, component_id, text) 
                    VALUES (?,?,?)""", (match.match_id, component_id, text))
    def set_representations(self, word_renderer, structures, sloleks_db=None):
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            print("Representation step already done, skipping")
@ -105,17 +112,14 @@ class MatchStore:
        for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
-            RepresentationAssigner.set_representations(match, word_renderer)
+            RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
            inserts.append(match)
            if len(inserts) > num_inserts:
-                for match in inserts:
+                self.add_inserts(inserts)
                    for component_id, text in match.representations.items():
                        self.db.execute("""
                            INSERT INTO Representations (colocation_id, component_id, text) 
                            VALUES (?,?,?)""", (match.match_id, component_id, text))
                inserts = []
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)
    def has_colocation_id_enough_frequency(self, colocation_id):
--- a/src/postprocessor.py
+++ b/src/postprocessor.py
@ -0,0 +1,38 @@
 class Postprocessor:
    """Fix the one-letter prepositions s/z and k/h inside a matched collocation.

    The variant of each preposition is chosen from the first letter of the
    word that follows it in the collocation.
    """

    # First letters (lower case) after which "s" is used; otherwise "z".
    _S_INITIALS = frozenset('cčfhkpsšt')
    # First letters (lower case) after which "h" is used; otherwise "k".
    _H_INITIALS = frozenset('gk')

    def __init__(self, fix_one_letter_words=True):
        """
        Args:
            fix_one_letter_words: when false, ``process`` leaves the
                prepositions untouched and only normalises the key.
        """
        self.fix_one_letter_words = fix_one_letter_words

    @staticmethod
    def _initial(next_word):
        """Return the lower-cased first character of next_word ('' if none)."""
        # Lower-casing matters for capitalised words such as proper nouns;
        # slicing instead of indexing keeps empty strings from raising.
        return next_word[:1].lower() if next_word else ''

    @staticmethod
    def fix_sz(next_word):
        """Return 's' or 'z', whichever belongs in front of next_word."""
        if Postprocessor._initial(next_word) in Postprocessor._S_INITIALS:
            return 's'
        return 'z'

    @staticmethod
    def fix_kh(next_word):
        """Return 'h' or 'k', whichever belongs in front of next_word."""
        if Postprocessor._initial(next_word) in Postprocessor._H_INITIALS:
            return 'h'
        return 'k'

    def process(self, match, collocation_id):
        """Correct s/z and k/h in one match and normalise its key.

        Args:
            match: mapping from component id to a word object; the ``text``
                attribute of a corrected preposition is overwritten.
            collocation_id: list ``[structure_id, [component_id, word], ...]``
                whose entries after the first are mutable two-item lists.

        Returns:
            ``(match, collocation_id)``. Every entry after the structure id
            is converted to a tuple. The conversion is done in place as well,
            so a caller that keeps using the list it passed in sees the same
            (hashable) entries as one that uses the return value.
        """
        if self.fix_one_letter_words:
            # Index 0 holds the structure id and the last entry has no
            # following word, so only the positions in between are candidates.
            for pos in range(1, len(collocation_id) - 1):
                col_id, word = collocation_id[pos]
                if word in ('s', 'z'):
                    choose = self.fix_sz
                elif word in ('k', 'h'):
                    choose = self.fix_kh
                else:
                    continue
                correct_letter = choose(collocation_id[pos + 1][1])
                collocation_id[pos][1] = correct_letter
                match[col_id].text = correct_letter

        collocation_id[1:] = [tuple(line) for line in collocation_id[1:]]
        return match, collocation_id
--- a/src/representation.py
+++ b/src/representation.py
@ -4,6 +4,9 @@ from collections import Counter
 from codes_tagset import TAGSET, CODES
 from word import WordMsdOnly
 from src.word import WordDummy
 class ComponentRepresentation:
    def __init__(self, data, word_renderer):
        self.data = data
@ -19,23 +22,23 @@ class ComponentRepresentation:
    def add_word(self, word):
        self.words.append(word)
-    def render(self):
+    def render(self, sloleks_db=None):
        if self.rendition_text is None:
-            self.rendition_text = self._render()
+            self.rendition_text = self._render(sloleks_db=sloleks_db)
-    def _render(self):
+    def _render(self, sloleks_db=None):
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))
 class LemmaCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        return self.words[0].lemma if len(self.words) > 0 else None
 class LexisCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        return self.data['lexis']
 class WordFormAllCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        if len(self.words) == 0:
            return None
        else:
@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
            return "/".join(set(forms))
 class WordFormAnyCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
        text_forms = {}
        msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
        for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
@ -60,6 +63,23 @@ class WordFormAnyCR(ComponentRepresentation):
            # check if agreements match
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
            # in case all agreements do not match try to get data from sloleks and change properly
            if not all(agreements_matched):
                if sloleks_db is None:
                    raise Exception('sloleks_db not properly setup!')
                for agr in self.agreement:
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
                        if msd is not None:
                            agr.msds[0] = msd
                            agr.words.append(WordDummy(msd, lemma, text))
                            # agr.words[0].msd = msd
                            # agr.words[0].text = text
                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
                        else:
                            break
            # if we are at the last "backup word", then confirm matches 
            # that worked for this one and return
            if word_lemma is None:
@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
        if self.check_msd(word.msd):
            super().add_word(word)
-    def _render(self):
+    def _render(self, sloleks_db=None):
        if len(self.words) == 0:
            if sloleks_db is None:
                raise Exception('sloleks_db not properly setup!')
            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
            if msd is not None:
                self.words.append(WordDummy(msd, lemma, text))
        self.words.append(WordMsdOnly(self._common_msd()))
-        return super()._render()
+        return super()._render(sloleks_db)
    def _common_msd(self):
        msds = sorted(self.msds, key=len)
@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):
        return True
-    def render(self):
+    def render(self, sloleks_db=None):
        pass
--- a/src/representation_assigner.py
+++ b/src/representation_assigner.py
@ -39,7 +39,7 @@ class RepresentationAssigner:
        return self.representation_factory(self.more, word_renderer)
    @staticmethod
-    def set_representations(match, word_renderer):
+    def set_representations(match, word_renderer, sloleks_db=None):
        representations = {}
        for c in match.structure.components:
            representations[c.idx] = []
@ -70,7 +70,7 @@ class RepresentationAssigner:
        for cid, reps in representations.items():
            for rep in reps:
-                rep.render()
+                rep.render(sloleks_db=sloleks_db)
        for cid, reps in representations.items():
            reps = [rep.rendition_text for rep in reps]
--- a/src/sloleks_db.py
+++ b/src/sloleks_db.py
@ -0,0 +1,191 @@
 from collections import defaultdict
 from ast import literal_eval
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
 from sqlalchemy import func
 from match import StructureMatch
 from representation_assigner import RepresentationAssigner
 from progress_bar import progress
 # Lexeme = None
 # LexemeFeature = None
 # SyntacticStructure = None
 # StructureComponent = None
 # Feature = None
 # LexicalUnitLexeme = None
 # LexicalUnit = None
 # LexicalUnitType = None
 # Category = None
 # Sense = None
 # Measure = None
 # LexicalUnitMeasure = None
 # Corpus = None
 # Definition = None
 # WordForm = None
 # WordFormFeature = None
 # FormRepresentation = None
 from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
 class SloleksDatabase:
    """Word-form lookup in a Sloleks lexicon stored in a PostgreSQL database.

    The schema is reflected at construction time; ``get_word_form`` then
    looks up the surface form of a lemma for a given (possibly adjusted) MSD.
    """

    def __init__(self, db):
        """Connect to the database and reflect its tables.

        Args:
            db: credentials as one colon-separated string in the order
                ``user:password:database:host``.
        """
        # The mapped classes can only be declared after reflection, yet
        # get_word_form refers to them by bare name, so they are published
        # as module-level globals.
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        Base.metadata.reflect(engine)

        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        self.session = Session(engine)

    def close(self):
        """Close the database session."""
        self.session.close()

    def decypher_msd(self, msd):
        """Translate an MSD into the feature values used to filter word forms.

        Args:
            msd: MSD as a string or a list of characters.

        Returns:
            ``[number, case]`` for nouns, ``[vform, number, 'third']`` for
            verbs, ``[gender, number, case]`` for adjectives. An empty list
            is returned for any other part of speech, and also when the MSD
            is too short or holds a code that CODES_TRANSLATION does not
            list (get_word_form treats an empty list as "no result").
        """
        try:
            t = msd[0]
            if t == 'N':
                number = CODES_TRANSLATION[t][3][msd[3]]
                case = CODES_TRANSLATION[t][4][msd[4]]
                return [number, case]
            if t == 'V':
                vform = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][5][msd[5]]
                # The person is always queried as 'third', whatever the MSD says.
                person = 'third'
                return [vform, number, person]
            if t == 'A':
                gender = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][4][msd[4]]
                case = CODES_TRANSLATION[t][5][msd[5]]
                return [gender, number, case]
        except (IndexError, KeyError):
            # Short MSD or a code without a translation: nothing to query by.
            return []
        return []

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Look up the form of ``lemma`` that carries the requested features.

        Args:
            lemma: lemma to look up.
            msd: starting MSD (string or sequence of characters).
            data: component definition; its 'msd' mapping (attribute name ->
                value name) or its 'agreement' list (attribute names) decides
                how ``msd`` is adjusted before the lookup.
            align_msd: MSD of the word to agree with. When it is falsy the
                'msd' mapping in ``data`` is applied instead.

        Returns:
            ``(msd, lemma, form)`` with the adjusted MSD joined into a
            string, or ``(None, None, None)`` when the MSD cannot be adjusted
            or translated, or when no matching form exists.
        """
        not_found = (None, None, None)
        msd = list(msd)
        if not msd:
            return not_found
        t = msd[0]

        if not align_msd and 'msd' in data:
            for key, value in data['msd'].items():
                v = TAGSET[t].index(key.lower())
                # Attribute i of the tagset lives at MSD position i + 1.
                if v + 1 >= len(msd):
                    return not_found
                msd[v + 1] = CODES[value]
        elif align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # Either MSD may be too short to hold the attribute (e.g.
                # verbs with short MSDs); both the read and the write use
                # position index + 1.
                if v + 1 >= len(msd) or v_align_msd + 1 >= len(align_msd):
                    return not_found
                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)
        if not decypher_msd:
            return not_found

        # One alias of the feature table per required feature value, so that
        # every value can be matched against the same word form.
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]

        query = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
        for wf in wfs:
            query = query.join(wf, wf.word_form_id == WordForm.id)
        query = query.filter(Lexeme.lemma == lemma)
        for wf, msd_el in zip(wfs, decypher_msd):
            query = query.filter(wf.value == msd_el)

        # Only the first hit is used, so do not fetch the rest.
        row = query.first()
        if row is not None:
            return ''.join(msd), lemma, row[0]
        return not_found
--- a/src/syntactic_structure.py
+++ b/src/syntactic_structure.py
@ -14,7 +14,7 @@ class SyntacticStructure:
    @staticmethod
    def from_xml(xml):
        st = SyntacticStructure()
-        st.id = xml.get('id')
+        st.id = xml.get('id_nsss')
        st.lbs = xml.get('LBS')
        assert len(list(xml)) == 1
--- a/src/wani.py
+++ b/src/wani.py
@ -11,6 +11,7 @@ import concurrent.futures
 import tempfile
 from progress_bar import progress
 from sloleks_db import SloleksDatabase
 from word import Word
 from syntactic_structure import build_structures
 from match_store import MatchStore
@ -20,16 +21,20 @@ from loader import load_files
 from database import Database
 from time_info import TimeInfo
 from src.postprocessor import Postprocessor
-def match_file(words, structures):
+
 def match_file(words, structures, postprocessor):
    matches = {s: [] for s in structures}
    for s in progress(structures, "matching"):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
-                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
+                # colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                match, collocation_id = postprocessor.process(match, colocation_id)
                colocation_id = tuple(colocation_id)
                matches[s].append((match, colocation_id))
@ -38,6 +43,7 @@ def match_file(words, structures):
 def main(args):
    sloleks_db = SloleksDatabase(args.sloleks_db)
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))
@ -51,7 +57,11 @@ def main(args):
            continue
        start_time = time.time()
-        matches = match_file(words, structures)
+        postprocessor = Postprocessor()
        matches = match_file(words, structures, postprocessor)
        # matches = .process()
        # TODO Add postprocessing here or inside previous function!
        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()
@ -74,7 +84,7 @@ def main(args):
    # figure out representations!
    if args.out or args.out_no_stat:
-        match_store.set_representations(word_stats, structures)
+        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
@ -85,6 +95,10 @@ def main(args):
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    # sloleks_db.get_word_form(lemma, gender, number, case)
    sloleks_db.close()
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
@ -92,6 +106,7 @@ if __name__ == '__main__':
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
@ -100,7 +115,7 @@ if __name__ == '__main__':
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')
-
+#
    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
--- a/src/word.py
+++ b/src/word.py
@ -14,6 +14,16 @@ class WordMsdOnly:
        return None
 class WordDummy:
    """Minimal word record holding only an MSD, a lemma and a text."""

    def __init__(self, msd, lemma, text):
        """Store the three values unchanged as attributes."""
        self.msd, self.lemma, self.text = msd, lemma, text

    def most_frequent_text(self, word_renderer):
        """Return what ``word_renderer.render`` yields for this lemma and MSD."""
        lemma, msd = self.lemma, self.msd
        return word_renderer.render(lemma, msd)
 class Word:
    def __init__(self, lemma, msd, wid, text, do_msd_translate):
        self.lemma = lemma
@ -41,10 +51,10 @@ class Word:
    @staticmethod
    def get_msd(comp):
        d = dict(comp.items())
-        if 'msd' in d:
+        if 'ana' in d:
            return d['msd']
        elif 'ana' in d:
            return d['ana'][4:]
        elif 'msd' in d:
            return d['msd']
        else:
            logging.error(d)
            raise NotImplementedError("MSD?")