Added new column "Joint_representative_form_variable"; fixed collocation structures; fixed bug with wrong lemma_fallback MSDs

2020-07-16 20:53:59 +02:00
parent de3e52c57c
commit 9a9d344510
9 changed files with 55 additions and 103 deletions
@@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
    
    def header_right(self):
-        return ["Joint_representative_form", "Frequency"]
+        return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
    
    def content_repeat(self, words, representations, idx, _sidx):
        word = words[idx]
        if idx not in representations:
            return [word.lemma, "", ""]

-        rep = representations[idx]
-        if rep is None:
+        rep_text, rep_msd = representations[idx]
+        if rep_text is None:
            self.representation[idx] = word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]
        else:
-            self.representation[idx] = rep
-            return [word.lemma, rep, word.msd, "ok"]
+            self.representation[idx] = rep_text
+            return [word.lemma, rep_text, rep_msd, "ok"]

-    def content_right(self, freq, best_word_order=None):
-        if best_word_order is None:
-            best_word_order = sorted(self.representation.keys())
-        rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
-        result = [rep, str(freq)]
+    def content_right(self, freq, variable_word_order=None):
+        fixed_word_order = sorted(self.representation.keys())
+        if variable_word_order is None:
+            variable_word_order = fixed_word_order
+        rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
+        rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
+        result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
        self.representation = {}
        return result

@@ -183,13 +185,13 @@ class OutFormatter(Formatter):
    def header_right(self):
        return self.f1.header_right() + self.f2.header_right()

-    def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
+    def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
        cr1 = self.f1.content_repeat(words, representations, idx, sidx)
        cr2 = self.f2.content_repeat(words, representations, idx, sidx)
        return cr1 + cr2

-    def content_right(self, freq, best_word_order=None):
-        return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
+    def content_right(self, freq, variable_word_order=None):
+        return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)

    def group(self):
        return self.f1.group() and self.f2.group()
@@ -28,8 +28,8 @@ class StructureMatch:

            result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
        
-        for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
-            result.representations[str(component_id)] = text
+        for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
+            result.representations[str(component_id)] = (text, msd)
        
        return result

@@ -35,6 +35,7 @@ class MatchStore:
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
+            msd varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """)
        self.db.init("""CREATE TABLE Dispersions (
@@ -93,10 +94,10 @@ class MatchStore:

    def add_inserts(self, inserts):
        for match in inserts:
-            for component_id, text in match.representations.items():
+            for component_id, (text, msd) in match.representations.items():
                self.db.execute("""
-                    INSERT INTO Representations (colocation_id, component_id, text) 
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+                    INSERT INTO Representations (colocation_id, component_id, text, msd) 
+                    VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))

    def set_representations(self, word_renderer, structures, sloleks_db=None):
        step_name = 'representation'
@@ -16,15 +16,7 @@ class Postprocessor:
        return 'k'

    def process(self, match, collocation_id):
-        # self.matches = matches
-        # if self.fix_one_letter_words:
-        #     for syn_structure_key, syn_structure_value in self.matches.items():
-        #         for match, collocation_id in syn_structure_value:
        if len(collocation_id) > 2:
-            # a = collocation_id[1:-1]
-            # b = enumerate(collocation_id[1:-1])
-            # for a, c in b:
-            #     print('here')
            for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
                if word in ['s', 'z']:
                    correct_letter = self.fix_sz(collocation_id[idx + 2][1])
@@ -14,6 +14,7 @@ class ComponentRepresentation:

        self.words = []
        self.rendition_text = None
+        self.rendition_msd = None
        self.agreement = []

    def get_agreement(self):
@@ -24,18 +25,22 @@ class ComponentRepresentation:

    def render(self, sloleks_db=None):
        if self.rendition_text is None:
-            self.rendition_text = self._render(sloleks_db=sloleks_db)
+            self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)

    def _render(self, sloleks_db=None):
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))

 class LemmaCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
-        return self.words[0].lemma if len(self.words) > 0 else None
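+        # TODO: return the lemma's own MSD here; words[0].msd is the MSD of the first matched word (presumably not the lemma form - confirm)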
+        if len(self.words) > 0:
+            return self.words[0].lemma, self.words[0].msd
+        else:
+            return None, None

 class LexisCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
-        return self.data['lexis']
+        return self.data['lexis'], 'Q'

 class WordFormAllCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
@@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
            return None
        else:
            forms = [w.text.lower() for w in self.words]
-            return "/".join(set(forms))
+            msds = [w.msd for w in self.words]
+
+            return "/".join(set(forms)), "/".join(set(msds))

 class WordFormAnyCR(ComponentRepresentation):
    def _render(self, sloleks_db=None):
@@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
                for agr, matched in zip(self.agreement, agreements_matched):
                    if matched:
                        agr.confirm_match()
-                return None
+                return None, None

            # if all agreements match, we win!
            if all(agreements_matched):
                for agr in self.agreement:
                    agr.confirm_match()

-                return text_forms[(word_msd, word_lemma)]
+                return text_forms[(word_msd, word_lemma)], word_msd


 class WordFormMsdCR(WordFormAnyCR):
@@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        self.rendition_candidate = None
+        self.rendition_msd_candidate = None

    def get_agreement(self):
        return self.data['other']
@@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                if self.check_msd(candidate_msd):
                    self.rendition_candidate = candidate_text
+                    self.rendition_msd_candidate = candidate_msd
                    return True

        return False

    def confirm_match(self):
        self.rendition_text = self.rendition_candidate
+        self.rendition_msd = self.rendition_msd_candidate

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
@@ -73,10 +73,11 @@ class RepresentationAssigner:
                rep.render(sloleks_db=sloleks_db)
        
        for cid, reps in representations.items():
-            reps = [rep.rendition_text for rep in reps]
-            if reps == []:
+            reps_text = [rep.rendition_text for rep in reps]
+            reps_msd = [rep.rendition_msd for rep in reps]
+            if reps_text == []:
                pass
-            elif all(r is None for r in reps):
-                match.representations[cid] = None
+            elif all(r is None for r in reps_text):
+                match.representations[cid] = (None, None)
            else:
-                match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
+                match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))
@@ -1,46 +1,12 @@
-from collections import defaultdict
-from ast import literal_eval
-
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
-from sqlalchemy import func

-from match import StructureMatch
-from representation_assigner import RepresentationAssigner
-from progress_bar import progress
-
-# Lexeme = None
-# LexemeFeature = None
-# SyntacticStructure = None
-# StructureComponent = None
-# Feature = None
-# LexicalUnitLexeme = None
-# LexicalUnit = None
-# LexicalUnitType = None
-# Category = None
-# Sense = None
-# Measure = None
-# LexicalUnitMeasure = None
-# Corpus = None
-# Definition = None
-# WordForm = None
-# WordFormFeature = None
-# FormRepresentation = None
 from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION


 class SloleksDatabase:
    def __init__(self, db):
-        # self.db = db
-        # self.dispersions = {}
-        # self.min_freq = args.min_freq
-
-        # self.db.init("""CREATE TABLE Colocations (
-        #     colocation_id INTEGER PRIMARY KEY,
-        #     structure_id varchar(8),
-        #     key varchar(256))
-        #     """)
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

@@ -130,14 +96,15 @@ class SloleksDatabase:
    def get_word_form(self, lemma, msd, data, align_msd=False):
        # modify msd as required
        msd = list(msd)
-
-        if not align_msd and 'msd' in data:
+        if 'msd' in data:
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
+                if v + 1 >= len(msd):
+                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
                msd[v + 1] = CODES[value]

-        elif 'agreement' in data:
+        if align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]
@@ -146,37 +113,24 @@ class SloleksDatabase:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # fix for verbs with short msds
-                if v >= len(msd):
-                    return None, None, None
-                # if v >= len(msd) and t == 'V' and att == 'number':
-                #     if len(msd) == 4:
-                #         msd += ['3']
-                #     if len(msd) == 5:
-                #         msd += ['_']
-                # try:
-                msd[v + 1] = align_msd[v_align_msd + 1]
-                # except:
-                #     print('here')
+                if v + 1 >= len(msd):
+                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
+                    # return None, None, None
+
+                msd[v + 1] = align_msd[v_align_msd + 1]

-        # msd = list(msd)
        decypher_msd = self.decypher_msd(msd)

        if not decypher_msd:
            return None, None, None

        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
-        # wf1 = aliased(WordFormFeature)
-        # wf2 = aliased(WordFormFeature)
-        # wf3 = aliased(WordFormFeature)
        query_preposition = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

        for wf in wfs:
            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
-            # .join(wf1, wf1.word_form_id == WordForm.id) \
-            # .join(wf2, wf2.word_form_id == WordForm.id) \
-            # .join(wf3, wf3.word_form_id == WordForm.id) \

        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)

@@ -186,6 +140,4 @@ class SloleksDatabase:
        pattern_translation_hws = query_preposition.all()
        if len(pattern_translation_hws) > 0:
            return ''.join(msd), lemma, pattern_translation_hws[0][0]
-        # pattern_translation_hws = [el[0] for el in query_preposition.all()]
        return None, None, None
-        # return pattern_translation_hws
@@ -73,8 +73,6 @@ def main(args):
        postprocessor = Postprocessor()
        matches = match_file(words, structures, postprocessor)

-        # matches = .process()
-        # TODO Add postprocessing here or inside previous function!
        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()
@@ -82,15 +82,11 @@ class Writer:

            self.formatter.new_match(match)

-            best_word_order = self.find_best_word_order(match.matches)
+            variable_word_order = self.find_variable_word_order(match.matches)

            for words in match.matches:
                to_write = []

-                # TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
-                #  word.int_id and return most popular order and append to it remaining numbers to len(components)
-
-
                for idx, _comp in enumerate(components):
                    idx = str(idx + 1)
                    if idx not in words:
@@ -105,7 +101,7 @@ class Writer:
                to_write = [structure.id] + to_write + [match.match_id]

                # header_right
-                to_write.extend(self.formatter.content_right(len(match), best_word_order))
+                to_write.extend(self.formatter.content_right(len(match), variable_word_order))
                rows.append(to_write)

                if self.formatter.group():
@@ -148,7 +144,7 @@ class Writer:
            fp_close(fp)

    @staticmethod
-    def find_best_word_order(matches):
+    def find_variable_word_order(matches):
        orders = {}
        for words in matches:
            order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])