Created new column "Joint_representative_form_variable" + Fixed collocation structures + Fixed bug with wrong lemma_fallback msds

2020-07-16 20:53:59 +02:00 · 2020-07-16 20:53:59 +02:00 · 9a9d344510
commit 9a9d344510
parent de3e52c57c
9 changed files with 55 additions and 103 deletions
--- a/src/formatter.py
+++ b/src/formatter.py
@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
    def header_right(self):
-        return ["Joint_representative_form", "Frequency"]
+        return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
    def content_repeat(self, words, representations, idx, _sidx):
        word = words[idx]
        if idx not in representations:
            return [word.lemma, "", ""]
-        rep = representations[idx]
+        rep_text, rep_msd = representations[idx]
-        if rep is None:
+        if rep_text is None:
            self.representation[idx] = word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]
        else:
-            self.representation[idx] = rep
+            self.representation[idx] = rep_text
-            return [word.lemma, rep, word.msd, "ok"]
+            return [word.lemma, rep_text, rep_msd, "ok"]
-    def content_right(self, freq, best_word_order=None):
+    def content_right(self, freq, variable_word_order=None):
-        if best_word_order is None:
+        fixed_word_order = sorted(self.representation.keys())
-            best_word_order = sorted(self.representation.keys())
+        if variable_word_order is None:
-        rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
+            variable_word_order = fixed_word_order
-        result = [rep, str(freq)]
+        rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
        rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
        result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
        self.representation = {}
        return result
@ -183,13 +185,13 @@ class OutFormatter(Formatter):
    def header_right(self):
        return self.f1.header_right() + self.f2.header_right()
-    def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
+    def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
        cr1 = self.f1.content_repeat(words, representations, idx, sidx)
        cr2 = self.f2.content_repeat(words, representations, idx, sidx)
        return cr1 + cr2
-    def content_right(self, freq, best_word_order=None):
+    def content_right(self, freq, variable_word_order=None):
-        return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
+        return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
    def group(self):
        return self.f1.group() and self.f2.group()
--- a/src/match.py
+++ b/src/match.py
@ -28,8 +28,8 @@ class StructureMatch:
            result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
-        for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
+        for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
-            result.representations[str(component_id)] = text
+            result.representations[str(component_id)] = (text, msd)
        return result
--- a/src/match_store.py
+++ b/src/match_store.py
@ -35,6 +35,7 @@ class MatchStore:
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
            msd varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """)
        self.db.init("""CREATE TABLE Dispersions (
@ -93,10 +94,10 @@ class MatchStore:
    def add_inserts(self, inserts):
        for match in inserts:
-            for component_id, text in match.representations.items():
+            for component_id, (text, msd) in match.representations.items():
                self.db.execute("""
-                    INSERT INTO Representations (colocation_id, component_id, text) 
+                    INSERT INTO Representations (colocation_id, component_id, text, msd) 
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+                    VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
    def set_representations(self, word_renderer, structures, sloleks_db=None):
        step_name = 'representation'
--- a/src/postprocessor.py
+++ b/src/postprocessor.py
@ -16,15 +16,7 @@ class Postprocessor:
        return 'k'
    def process(self, match, collocation_id):
        # self.matches = matches
        # if self.fix_one_letter_words:
        #     for syn_structure_key, syn_structure_value in self.matches.items():
        #         for match, collocation_id in syn_structure_value:
        if len(collocation_id) > 2:
            # a = collocation_id[1:-1]
            # b = enumerate(collocation_id[1:-1])
            # for a, c in b:
            #     print('here')
            for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
                if word in ['s', 'z']:
                    correct_letter = self.fix_sz(collocation_id[idx + 2][1])
--- a/src/representation.py
+++ b/src/representation.py
@ -14,6 +14,7 @@ class ComponentRepresentation:
        self.words = []
        self.rendition_text = None
        self.rendition_msd = None
        self.agreement = []
    def get_agreement(self):
@ -24,18 +25,22 @@ class ComponentRepresentation:
    def render(self, sloleks_db=None):
        if self.rendition_text is None:
-            self.rendition_text = self._render(sloleks_db=sloleks_db)
+            self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)
    def _render(self, sloleks_db=None):
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))
 class LemmaCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
-        return self.words[0].lemma if len(self.words) > 0 else None
+        # TODO: return the MSD of the lemma form here; the code below currently returns the MSD of the first matched word.
        if len(self.words) > 0:
            return self.words[0].lemma, self.words[0].msd
        else:
            return None, None
 class LexisCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
-        return self.data['lexis']
+        return self.data['lexis'], 'Q'
 class WordFormAllCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
            return None
        else:
            forms = [w.text.lower() for w in self.words]
-            return "/".join(set(forms))
+            msds = [w.msd for w in self.words]
            return "/".join(set(forms)), "/".join(set(msds))
 class WordFormAnyCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
                for agr, matched in zip(self.agreement, agreements_matched):
                    if matched:
                        agr.confirm_match()
-                return None
+                return None, None
            # if all agreements match, we win!
            if all(agreements_matched):
                for agr in self.agreement:
                    agr.confirm_match()
-                return text_forms[(word_msd, word_lemma)]
+                return text_forms[(word_msd, word_lemma)], word_msd
 class WordFormMsdCR(WordFormAnyCR):
@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        self.rendition_candidate = None
        self.rendition_msd_candidate = None
    def get_agreement(self):
        return self.data['other']
@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                if self.check_msd(candidate_msd):
                    self.rendition_candidate = candidate_text
                    self.rendition_msd_candidate = candidate_msd
                    return True
        return False
    def confirm_match(self):
        self.rendition_text = self.rendition_candidate
        self.rendition_msd = self.rendition_msd_candidate
    @staticmethod
    def check_agreement(msd1, msd2, agreements):
--- a/src/representation_assigner.py
+++ b/src/representation_assigner.py
@ -73,10 +73,11 @@ class RepresentationAssigner:
                rep.render(sloleks_db=sloleks_db)
        for cid, reps in representations.items():
-            reps = [rep.rendition_text for rep in reps]
+            reps_text = [rep.rendition_text for rep in reps]
-            if reps == []:
+            reps_msd = [rep.rendition_msd for rep in reps]
            if reps_text == []:
                pass
-            elif all(r is None for r in reps):
+            elif all(r is None for r in reps_text):
-                match.representations[cid] = None
+                match.representations[cid] = (None, None)
            else:
-                match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
+                match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))
--- a/src/sloleks_db.py
+++ b/src/sloleks_db.py
@ -1,46 +1,12 @@
 from collections import defaultdict
 from ast import literal_eval
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
 from sqlalchemy import func
 from match import StructureMatch
 from representation_assigner import RepresentationAssigner
 from progress_bar import progress
 # Lexeme = None
 # LexemeFeature = None
 # SyntacticStructure = None
 # StructureComponent = None
 # Feature = None
 # LexicalUnitLexeme = None
 # LexicalUnit = None
 # LexicalUnitType = None
 # Category = None
 # Sense = None
 # Measure = None
 # LexicalUnitMeasure = None
 # Corpus = None
 # Definition = None
 # WordForm = None
 # WordFormFeature = None
 # FormRepresentation = None
 from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
 class SloleksDatabase:
    def __init__(self, db):
        # self.db = db
        # self.dispersions = {}
        # self.min_freq = args.min_freq
        # self.db.init("""CREATE TABLE Colocations (
        #     colocation_id INTEGER PRIMARY KEY,
        #     structure_id varchar(8),
        #     key varchar(256))
        #     """)
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')
@ -130,14 +96,15 @@ class SloleksDatabase:
    def get_word_form(self, lemma, msd, data, align_msd=False):
        # modify msd as required
        msd = list(msd)
-
+        if 'msd' in data:
        if not align_msd and 'msd' in data:
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
                if v + 1 >= len(msd):
                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
                msd[v + 1] = CODES[value]
-        elif 'agreement' in data:
+        if align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]
@ -146,37 +113,24 @@ class SloleksDatabase:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # fix for verbs with short msds
-                if v >= len(msd):
+                if v + 1 >= len(msd):
-                    return None, None, None
+                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
-                # if v >= len(msd) and t == 'V' and att == 'number':
+                    # return None, None, None
-                #     if len(msd) == 4:
+
-                #         msd += ['3']
+                msd[v + 1] = align_msd[v_align_msd + 1]
                #     if len(msd) == 5:
                #         msd += ['_']
                # try:
                msd[v + 1] = align_msd[v_align_msd + 1]
                # except:
                #     print('here')
        # msd = list(msd)
        decypher_msd = self.decypher_msd(msd)
        if not decypher_msd:
            return None, None, None
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
        # wf1 = aliased(WordFormFeature)
        # wf2 = aliased(WordFormFeature)
        # wf3 = aliased(WordFormFeature)
        query_preposition = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
        for wf in wfs:
            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
            # .join(wf1, wf1.word_form_id == WordForm.id) \
            # .join(wf2, wf2.word_form_id == WordForm.id) \
            # .join(wf3, wf3.word_form_id == WordForm.id) \
        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
@ -186,6 +140,4 @@ class SloleksDatabase:
        pattern_translation_hws = query_preposition.all()
        if len(pattern_translation_hws) > 0:
            return ''.join(msd), lemma, pattern_translation_hws[0][0]
        # pattern_translation_hws = [el[0] for el in query_preposition.all()]
        return None, None, None
        # return pattern_translation_hws
--- a/src/wani.py
+++ b/src/wani.py
@ -73,8 +73,6 @@ def main(args):
        postprocessor = Postprocessor()
        matches = match_file(words, structures, postprocessor)
        # matches = .process()
        # TODO Add postprocessing here or inside previous function!
        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()
--- a/src/writer.py
+++ b/src/writer.py
@ -82,15 +82,11 @@ class Writer:
            self.formatter.new_match(match)
-            best_word_order = self.find_best_word_order(match.matches)
+            variable_word_order = self.find_variable_word_order(match.matches)
            for words in match.matches:
                to_write = []
                # TODO: instead of enumerating the components below, first iterate over all words in match.matches,
                #  compare their word.int_id values, take the most frequent order, and append the remaining indices up to len(components).
                for idx, _comp in enumerate(components):
                    idx = str(idx + 1)
                    if idx not in words:
@ -105,7 +101,7 @@ class Writer:
                to_write = [structure.id] + to_write + [match.match_id]
                # header_right
-                to_write.extend(self.formatter.content_right(len(match), best_word_order))
+                to_write.extend(self.formatter.content_right(len(match), variable_word_order))
                rows.append(to_write)
                if self.formatter.group():
@ -148,7 +144,7 @@ class Writer:
            fp_close(fp)
    @staticmethod
-    def find_best_word_order(matches):
+    def find_variable_word_order(matches):
        orders = {}
        for words in matches:
            order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])