From 9a9d344510f31eae56c1e2fe35a77c2bf344ae64 Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Thu, 16 Jul 2020 20:53:59 +0200
Subject: [PATCH] Add "Joint_representative_form_variable" column; fix
 collocation structures; fix wrong lemma_fallback MSDs

---
 src/formatter.py               | 28 ++++++++-------
 src/match.py                   |  4 +--
 src/match_store.py             |  7 ++--
 src/postprocessor.py           |  8 -----
 src/representation.py          | 22 ++++++++----
 src/representation_assigner.py | 11 +++---
 src/sloleks_db.py              | 66 +++++-----------------------------
 src/wani.py                    |  2 --
 src/writer.py                  | 10 ++----
 9 files changed, 55 insertions(+), 103 deletions(-)

diff --git a/src/formatter.py b/src/formatter.py
index 363ad21..85adc6d 100644
--- a/src/formatter.py
+++ b/src/formatter.py
@@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
         return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
     
     def header_right(self):
-        return ["Joint_representative_form", "Frequency"]
+        return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
     
     def content_repeat(self, words, representations, idx, _sidx):
         word = words[idx]
         if idx not in representations:
             return [word.lemma, "", ""]
 
-        rep = representations[idx]
-        if rep is None:
+        rep_text, rep_msd = representations[idx]
+        if rep_text is None:
             self.representation[idx] = word.lemma
             return [word.lemma, word.lemma, "", "lemma_fallback"]
         else:
-            self.representation[idx] = rep
-            return [word.lemma, rep, word.msd, "ok"]
+            self.representation[idx] = rep_text
+            return [word.lemma, rep_text, rep_msd, "ok"]
 
-    def content_right(self, freq, best_word_order=None):
-        if best_word_order is None:
-            best_word_order = sorted(self.representation.keys())
-        rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
-        result = [rep, str(freq)]
+    def content_right(self, freq, variable_word_order=None):
+        fixed_word_order = sorted(self.representation.keys())
+        if variable_word_order is None:
+            variable_word_order = fixed_word_order
+        rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
+        rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
+        result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
         self.representation = {}
         return result
 
@@ -183,13 +185,13 @@ class OutFormatter(Formatter):
     def header_right(self):
         return self.f1.header_right() + self.f2.header_right()
 
-    def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
+    def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
         cr1 = self.f1.content_repeat(words, representations, idx, sidx)
         cr2 = self.f2.content_repeat(words, representations, idx, sidx)
         return cr1 + cr2
 
-    def content_right(self, freq, best_word_order=None):
-        return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
+    def content_right(self, freq, variable_word_order=None):
+        return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
 
     def group(self):
         return self.f1.group() and self.f2.group()
diff --git a/src/match.py b/src/match.py
index 1d65db0..d3f82ef 100644
--- a/src/match.py
+++ b/src/match.py
@@ -28,8 +28,8 @@ class StructureMatch:
 
             result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
         
-        for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
-            result.representations[str(component_id)] = text
+        for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
+            result.representations[str(component_id)] = (text, msd)
         
         return result
 
diff --git a/src/match_store.py b/src/match_store.py
index ff200ce..6025200 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -35,6 +35,7 @@ class MatchStore:
             colocation_id INTEGER,
             component_id INTEGER,
             text varchar(32),
+            msd varchar(32),
             FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
             """)
         self.db.init("""CREATE TABLE Dispersions (
@@ -93,10 +94,10 @@ class MatchStore:
 
     def add_inserts(self, inserts):
         for match in inserts:
-            for component_id, text in match.representations.items():
+            for component_id, (text, msd) in match.representations.items():
                 self.db.execute("""
-                    INSERT INTO Representations (colocation_id, component_id, text) 
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+                    INSERT INTO Representations (colocation_id, component_id, text, msd) 
+                    VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
 
     def set_representations(self, word_renderer, structures, sloleks_db=None):
         step_name = 'representation'
diff --git a/src/postprocessor.py b/src/postprocessor.py
index 375cb3f..60d15cc 100644
--- a/src/postprocessor.py
+++ b/src/postprocessor.py
@@ -16,15 +16,7 @@ class Postprocessor:
         return 'k'
 
     def process(self, match, collocation_id):
-        # self.matches = matches
-        # if self.fix_one_letter_words:
-        #     for syn_structure_key, syn_structure_value in self.matches.items():
-        #         for match, collocation_id in syn_structure_value:
         if len(collocation_id) > 2:
-            # a = collocation_id[1:-1]
-            # b = enumerate(collocation_id[1:-1])
-            # for a, c in b:
-            #     print('here')
             for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
                 if word in ['s', 'z']:
                     correct_letter = self.fix_sz(collocation_id[idx + 2][1])
diff --git a/src/representation.py b/src/representation.py
index 31bdeff..c059415 100644
--- a/src/representation.py
+++ b/src/representation.py
@@ -14,6 +14,7 @@ class ComponentRepresentation:
 
         self.words = []
         self.rendition_text = None
+        self.rendition_msd = None
         self.agreement = []
 
     def get_agreement(self):
@@ -24,18 +25,22 @@ class ComponentRepresentation:
 
     def render(self, sloleks_db=None):
         if self.rendition_text is None:
-            self.rendition_text = self._render(sloleks_db=sloleks_db)
+            self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)
 
     def _render(self, sloleks_db=None):
         raise NotImplementedError("Not implemented for class: {}".format(type(self)))
 
 class LemmaCR(ComponentRepresentation):
     def _render(self, sloleks_db=None):
-        return self.words[0].lemma if len(self.words) > 0 else None
+        # TODO: this returns the MSD of the first matched word; presumably it should be the MSD of the lemma form instead
+        if len(self.words) > 0:
+            return self.words[0].lemma, self.words[0].msd
+        else:
+            return None, None
 
 class LexisCR(ComponentRepresentation):
     def _render(self, sloleks_db=None):
-        return self.data['lexis']
+        return self.data['lexis'], 'Q'
 
 class WordFormAllCR(ComponentRepresentation):
     def _render(self, sloleks_db=None):
@@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
             return None
         else:
             forms = [w.text.lower() for w in self.words]
-            return "/".join(set(forms))
+            msds = [w.msd for w in self.words]
+
+            return "/".join(set(forms)), "/".join(set(msds))
 
 class WordFormAnyCR(ComponentRepresentation):
     def _render(self, sloleks_db=None):
@@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
                 for agr, matched in zip(self.agreement, agreements_matched):
                     if matched:
                         agr.confirm_match()
-                return None
+                return None, None
 
             # if all agreements match, we win!
             if all(agreements_matched):
                 for agr in self.agreement:
                     agr.confirm_match()
 
-                return text_forms[(word_msd, word_lemma)]
+                return text_forms[(word_msd, word_lemma)], word_msd
 
 
 class WordFormMsdCR(WordFormAnyCR):
@@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
     def __init__(self, data, word_renderer):
         super().__init__(data, word_renderer)
         self.rendition_candidate = None
+        self.rendition_msd_candidate = None
 
     def get_agreement(self):
         return self.data['other']
@@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
             if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                 if self.check_msd(candidate_msd):
                     self.rendition_candidate = candidate_text
+                    self.rendition_msd_candidate = candidate_msd
                     return True
 
         return False
 
     def confirm_match(self):
         self.rendition_text = self.rendition_candidate
+        self.rendition_msd = self.rendition_msd_candidate
 
     @staticmethod
     def check_agreement(msd1, msd2, agreements):
diff --git a/src/representation_assigner.py b/src/representation_assigner.py
index 3c8ca52..822a1b1 100644
--- a/src/representation_assigner.py
+++ b/src/representation_assigner.py
@@ -73,10 +73,11 @@ class RepresentationAssigner:
                 rep.render(sloleks_db=sloleks_db)
         
         for cid, reps in representations.items():
-            reps = [rep.rendition_text for rep in reps]
-            if reps == []:
+            reps_text = [rep.rendition_text for rep in reps]
+            reps_msd = [rep.rendition_msd for rep in reps]
+            if reps_text == []:
                 pass
-            elif all(r is None for r in reps):
-                match.representations[cid] = None
+            elif all(r is None for r in reps_text):
+                match.representations[cid] = (None, None)
             else:
-                match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
+                match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))
diff --git a/src/sloleks_db.py b/src/sloleks_db.py
index 953048d..7f3897f 100644
--- a/src/sloleks_db.py
+++ b/src/sloleks_db.py
@@ -1,46 +1,12 @@
-from collections import defaultdict
-from ast import literal_eval
-
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy import create_engine
-from sqlalchemy import func
 
-from match import StructureMatch
-from representation_assigner import RepresentationAssigner
-from progress_bar import progress
-
-# Lexeme = None
-# LexemeFeature = None
-# SyntacticStructure = None
-# StructureComponent = None
-# Feature = None
-# LexicalUnitLexeme = None
-# LexicalUnit = None
-# LexicalUnitType = None
-# Category = None
-# Sense = None
-# Measure = None
-# LexicalUnitMeasure = None
-# Corpus = None
-# Definition = None
-# WordForm = None
-# WordFormFeature = None
-# FormRepresentation = None
 from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
 
 
 class SloleksDatabase:
     def __init__(self, db):
-        # self.db = db
-        # self.dispersions = {}
-        # self.min_freq = args.min_freq
-
-        # self.db.init("""CREATE TABLE Colocations (
-        #     colocation_id INTEGER PRIMARY KEY,
-        #     structure_id varchar(8),
-        #     key varchar(256))
-        #     """)
         global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
         [db_user, db_password, db_database, db_host] = db.split(':')
 
@@ -130,14 +96,15 @@ class SloleksDatabase:
     def get_word_form(self, lemma, msd, data, align_msd=False):
         # modify msd as required
         msd = list(msd)
-
-        if not align_msd and 'msd' in data:
+        if 'msd' in data:
             for key, value in data['msd'].items():
                 t = msd[0]
                 v = TAGSET[t].index(key.lower())
+                if v + 1 >= len(msd):
+                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
                 msd[v + 1] = CODES[value]
 
-        elif 'agreement' in data:
+        if align_msd and 'agreement' in data:
             align_msd = list(align_msd)
             t_align_msd = align_msd[0]
             t = msd[0]
@@ -146,37 +113,24 @@ class SloleksDatabase:
                 v_align_msd = TAGSET[t_align_msd].index(att.lower())
                 v = TAGSET[t].index(att.lower())
                 # fix for verbs with short msds
-                if v >= len(msd):
-                    return None, None, None
-                # if v >= len(msd) and t == 'V' and att == 'number':
-                #     if len(msd) == 4:
-                #         msd += ['3']
-                #     if len(msd) == 5:
-                #         msd += ['_']
-                # try:
-                msd[v + 1] = align_msd[v_align_msd + 1]
-                # except:
-                #     print('here')
+                if v + 1 >= len(msd):
+                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
+                    # return None, None, None
+
+                msd[v + 1] = align_msd[v_align_msd + 1]
 
-        # msd = list(msd)
         decypher_msd = self.decypher_msd(msd)
 
         if not decypher_msd:
             return None, None, None
 
         wfs = [aliased(WordFormFeature) for _ in decypher_msd]
-        # wf1 = aliased(WordFormFeature)
-        # wf2 = aliased(WordFormFeature)
-        # wf3 = aliased(WordFormFeature)
         query_preposition = self.session.query(FormRepresentation.form) \
             .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
             .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
 
         for wf in wfs:
             query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
-            # .join(wf1, wf1.word_form_id == WordForm.id) \
-            # .join(wf2, wf2.word_form_id == WordForm.id) \
-            # .join(wf3, wf3.word_form_id == WordForm.id) \
 
         query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
 
@@ -186,6 +140,4 @@ class SloleksDatabase:
         pattern_translation_hws = query_preposition.all()
         if len(pattern_translation_hws) > 0:
             return ''.join(msd), lemma, pattern_translation_hws[0][0]
-        # pattern_translation_hws = [el[0] for el in query_preposition.all()]
         return None, None, None
-        # return pattern_translation_hws
diff --git a/src/wani.py b/src/wani.py
index b386697..cde8dc5 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -73,8 +73,6 @@ def main(args):
         postprocessor = Postprocessor()
         matches = match_file(words, structures, postprocessor)
 
-        # matches = .process()
-        # TODO Add postprocessing here or inside previous function!
         match_store.add_matches(matches)
         word_stats.add_words(words)
         database.commit()
diff --git a/src/writer.py b/src/writer.py
index 613cc18..bda8c23 100644
--- a/src/writer.py
+++ b/src/writer.py
@@ -82,15 +82,11 @@ class Writer:
 
             self.formatter.new_match(match)
 
-            best_word_order = self.find_best_word_order(match.matches)
+            variable_word_order = self.find_variable_word_order(match.matches)
 
             for words in match.matches:
                 to_write = []
 
-                # TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
-                #  word.int_id and return most popular order and append to it remaining numbers to len(components)
-
-
                 for idx, _comp in enumerate(components):
                     idx = str(idx + 1)
                     if idx not in words:
@@ -105,7 +101,7 @@ class Writer:
                 to_write = [structure.id] + to_write + [match.match_id]
 
                 # header_right
-                to_write.extend(self.formatter.content_right(len(match), best_word_order))
+                to_write.extend(self.formatter.content_right(len(match), variable_word_order))
                 rows.append(to_write)
 
                 if self.formatter.group():
@@ -148,7 +144,7 @@ class Writer:
             fp_close(fp)
 
     @staticmethod
-    def find_best_word_order(matches):
+    def find_variable_word_order(matches):
         orders = {}
         for words in matches:
             order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])