From 777791ad1e2e413311eff2e4f07baafbc510f7c1 Mon Sep 17 00:00:00 2001
From: Luka
Date: Wed, 8 Jul 2020 19:23:56 +0200
Subject: [PATCH] Added s/z, k/h + fixed bug 90 + connecting with sloleks on lemma_fallback

---
 .gitignore                     |   1 +
 src/__init__.py                |   0
 src/codes_tagset.py            | 126 ++++++++++++++++++++++
 src/loader.py                  |   4 +-
 src/match_store.py             |  18 ++--
 src/postprocessor.py           |  38 +++++++
 src/representation.py          |  48 +++++++--
 src/representation_assigner.py |   4 +-
 src/sloleks_db.py              | 144 +++++++++++++++++++++++++
 src/syntactic_structure.py     |   2 +-
 src/wani.py                    |  25 ++++-
 src/word.py                    |  18 +++-
 12 files changed, 396 insertions(+), 32 deletions(-)
 create mode 100644 src/__init__.py
 create mode 100644 src/postprocessor.py
 create mode 100644 src/sloleks_db.py

diff --git a/.gitignore b/.gitignore
index 20d04bf..3a0f11c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@
 __pycache__
 prev
 old
+data
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/codes_tagset.py b/src/codes_tagset.py
index c407fe6..be2ffcf 100644
--- a/src/codes_tagset.py
+++ b/src/codes_tagset.py
@@ -1,3 +1,129 @@
+CODES_TRANSLATION = {
+    "N": {
+        2: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        3: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        4: {
+            "n": "nominative",
+            "g": "genitive",
+            "d": "dative",
+            "a": "accusative",
+            "l": "locative",
+            "i": "instrumental",
+        },
+    },
+    "V": {
+        1: {
+            "m": "main",
+            "a": "auxiliary",
+        },
+        3: {
+            "n": "infinitive",
+            "u": "supine",
+            "p": "participle",
+            "r": "present",
+            "f": "future",
+            "c": "conditional",
+            "m": "imperative",
+        },
+        4: {
+            "1": "first",
+            "2": "second",
+            "3": "third",
+        },
+        5: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        6: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        8: {
+            "n": "no",
+            "y": "yes",
+        },
+    },
+    "A": {
+        1: {
+            "g": "general",
+            "s": "possessive",
+        },
+        2: {
+            "p": "positive",
+            "c": "comparative",
+            "s": "superlative",
+        },
+        3: {
+            'm': 'masculine',
+            'f': 'feminine',
+            'n': 'neuter',
+        },
+        4: {
+            "s": "singular",
+            "d": "dual",
+            "p": "plural",
+        },
+        5: {
+            "n": "nominative",
+            "g": "genitive",
+            "d": "dative",
+            "a": "accusative",
+            "l": "locative",
+            "i": "instrumental",
+        },
+    }
+    # "R": "Adverb",
+    # "P": "Pronoun",
+    # "M": "Numeral",
+    # "S": "Preposition",
+    # "C": "Conjunction",
+    # "Q": "Particle",
+    # "I": "Interjection",
+    # "Y": "Abbreviation",
+    # "X": "Residual",
+    #
+    #
+    # "e": "perfective",
+    # "p": "progressive",
+    # "b": "biaspectual",
+    #
+    #
+    # "p": "personal",
+    # "d": "demonstrative",
+    # "r": "relative",
+    # "x": "reflexive",
+    # "q": "interrogative",
+    # "i": "indefinite",
+    # "z": "negative",
+    # "b": "bound",
+    # "d": "digit",
+    # "r": "roman",
+    # "l": "letter",
+    # "c": "cardinal",
+    # "o": "ordinal",
+    # "p": "pronominal",
+    # "s": "special",
+    # "c": "coordinating",
+    # "s": "subordinating",
+    # "f": "foreign",
+    # "t": "typo",
+    # "p": "program",
+    # "w": "web",
+    # "e": "emo",
+    # "h": "hashtag",
+    # "a": "at"
+}
+
 CODES = {
     "Noun": "N",
     "Verb": "V",
diff --git a/src/loader.py b/src/loader.py
index 1876352..2ff41c5 100644
--- a/src/loader.py
+++ b/src/loader.py
@@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
             dest = l.get('dep')
         else:
             ana = l.get('ana')
-            if ana[:4] != 'syn:':  # dont bother...
+            if ana[:8] != 'jos-syn:':  # don't bother...
                 continue
-            ana = ana[4:]
+            ana = ana[8:]
             lfrom, dest = l.get('target').replace('#', '').split()
 
         if lfrom in words:
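
A note on the CODES_TRANSLATION table introduced in codes_tagset.py above: the integer keys are positions in an MSD string (position 0 holds the category letter), so decoding walks the tag one character at a time. A minimal sketch, assuming the English-style MSD 'Ncmsn' (common masculine singular nominative noun) and that src/ is on the import path:

    from codes_tagset import CODES_TRANSLATION

    msd = 'Ncmsn'
    category = msd[0]                                  # 'N'
    number = CODES_TRANSLATION[category][3][msd[3]]    # 'singular'
    case = CODES_TRANSLATION[category][4][msd[4]]      # 'nominative'
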
diff --git a/src/match_store.py b/src/match_store.py
index 378278f..ff200ce 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -91,7 +91,14 @@ class MatchStore:
                 (structure.id,)):
             yield StructureMatch.from_db(self.db, cid[0], structure)
 
-    def set_representations(self, word_renderer, structures):
+    def add_inserts(self, inserts):
+        for match in inserts:
+            for component_id, text in match.representations.items():
+                self.db.execute("""
+                    INSERT INTO Representations (colocation_id, component_id, text)
+                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+
+    def set_representations(self, word_renderer, structures, sloleks_db=None):
         step_name = 'representation'
         if self.db.is_step_done(step_name):
             print("Representation step already done, skipping")
@@ -105,17 +112,14 @@ class MatchStore:
         for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
             structure = structures_dict[sid]
             match = StructureMatch.from_db(self.db, cid, structure)
-            RepresentationAssigner.set_representations(match, word_renderer)
+            RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
             inserts.append(match)
 
             if len(inserts) > num_inserts:
-                for match in inserts:
-                    for component_id, text in match.representations.items():
-                        self.db.execute("""
-                            INSERT INTO Representations (colocation_id, component_id, text)
-                            VALUES (?,?,?)""", (match.match_id, component_id, text))
+                self.add_inserts(inserts)
                 inserts = []
 
+        self.add_inserts(inserts)
         self.db.step_is_done(step_name)
 
     def has_colocation_id_enough_frequency(self, colocation_id):
diff --git a/src/postprocessor.py b/src/postprocessor.py
new file mode 100644
index 0000000..375cb3f
--- /dev/null
+++ b/src/postprocessor.py
@@ -0,0 +1,38 @@
+
+class Postprocessor:
+    def __init__(self, fix_one_letter_words=True):
+        self.fix_one_letter_words = fix_one_letter_words
+
+    @staticmethod
+    def fix_sz(next_word):
+        if next_word[0] in ['c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't']:
+            return 's'
+        return 'z'
+
+    @staticmethod
+    def fix_kh(next_word):
+        if next_word[0] in ['g', 'k']:
+            return 'h'
+        return 'k'
+
+    def process(self, match, collocation_id):
+        # normalize the single-letter prepositions s/z and k/h inside a
+        # matched collocation: the correct variant is decided by the first
+        # letter of the following word, and both the collocation key and
+        # the matched word text are updated accordingly
+        if len(collocation_id) > 2:
+            # skip the structure id at index 0 and the last component:
+            # each single-letter word is checked against the component
+            # that follows it, and the last one has no successor
+            # (collocation_id entries are [component_id, lemma] pairs)
+            for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
+                if word in ['s', 'z']:
+                    correct_letter = self.fix_sz(collocation_id[idx + 2][1])
+                    collocation_id[idx + 1][1] = correct_letter
+                    match[col_id].text = correct_letter
+                elif word in ['k', 'h']:
+                    correct_letter = self.fix_kh(collocation_id[idx + 2][1])
+                    collocation_id[idx + 1][1] = correct_letter
+                    match[col_id].text = correct_letter
+        collocation_id = [collocation_id[0]] + [tuple(line) for line in collocation_id[1:]]
+        return match, collocation_id
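
A hedged sketch of how Postprocessor.process is meant to be driven, run from src/ (SimpleWord is a hypothetical stand-in: only a mutable .text attribute is required of the match values, and 'szk1' is a made-up structure id). With the lemma 'grad' following, 's' is corrected to 'z' because 'g' is voiced:

    from postprocessor import Postprocessor

    class SimpleWord:
        def __init__(self, text):
            self.text = text

    match = {'1': SimpleWord('s'), '2': SimpleWord('gradom')}
    colocation_id = ['szk1', ['1', 's'], ['2', 'grad']]
    match, colocation_id = Postprocessor().process(match, colocation_id)
    assert colocation_id == ['szk1', ('1', 'z'), ('2', 'grad')]
    assert match['1'].text == 'z'
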
diff --git a/src/representation.py b/src/representation.py
index 8c21aa2..70d0e66 100644
--- a/src/representation.py
+++ b/src/representation.py
@@ -4,6 +4,9 @@ from collections import Counter
 from codes_tagset import TAGSET, CODES
 from word import WordMsdOnly
 
+from word import WordDummy
+
+
 class ComponentRepresentation:
     def __init__(self, data, word_renderer):
         self.data = data
@@ -19,23 +22,23 @@ class ComponentRepresentation:
     def add_word(self, word):
         self.words.append(word)
 
-    def render(self):
+    def render(self, sloleks_db=None):
         if self.rendition_text is None:
-            self.rendition_text = self._render()
+            self.rendition_text = self._render(sloleks_db=sloleks_db)
 
-    def _render(self):
+    def _render(self, sloleks_db=None):
         raise NotImplementedError("Not implemented for class: {}".format(type(self)))
 
 class LemmaCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
         return self.words[0].lemma if len(self.words) > 0 else None
 
 class LexisCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
         return self.data['lexis']
 
 class WordFormAllCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
         if len(self.words) == 0:
             return None
         else:
@@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
             return "/".join(set(forms))
 
 class WordFormAnyCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
         text_forms = {}
         msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
         for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
@@ -59,7 +62,24 @@
         for word_msd, word_lemma in sorted_words:
             # check if agreements match
             agreements_matched = [agr.match(word_msd) for agr in self.agreement]
-
+
+            # if an agreement fails, try to fetch a fitting word form from Sloleks
+            if not all(agreements_matched):
+                if sloleks_db is None:
+                    raise Exception('sloleks_db not properly setup!')
+                for agr in self.agreement:
+                    if not agr.match(word_msd):
+                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
+                        if msd is not None:
+                            # overwrite the stored msd and append a dummy word
+                            # so that the agreement check is re-evaluated below
+                            agr.msds[0] = msd
+                            agr.words.append(WordDummy(msd, lemma, text))
+                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                        else:
+                            break
+
+
             # if we are at the last "backup word", then confirm matches
             # that worked for this one and return
             if word_lemma is None:
@@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
         if self.check_msd(word.msd):
             super().add_word(word)
 
-    def _render(self):
+    def _render(self, sloleks_db=None):
+        if len(self.words) == 0:
+            if sloleks_db is None:
+                raise Exception('sloleks_db not properly setup!')
+            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
+            if msd is not None:
+                self.words.append(WordDummy(msd, lemma, text))
         self.words.append(WordMsdOnly(self._common_msd()))
-        return super()._render()
+        return super()._render(sloleks_db)
 
     def _common_msd(self):
         msds = sorted(self.msds, key=len)
@@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):
 
         return True
 
-    def render(self):
+    def render(self, sloleks_db=None):
         pass
diff --git a/src/representation_assigner.py b/src/representation_assigner.py
index ac4708d..3c8ca52 100644
--- a/src/representation_assigner.py
+++ b/src/representation_assigner.py
@@ -39,7 +39,7 @@ class RepresentationAssigner:
         return self.representation_factory(self.more, word_renderer)
 
     @staticmethod
-    def set_representations(match, word_renderer):
+    def set_representations(match, word_renderer, sloleks_db=None):
         representations = {}
         for c in match.structure.components:
             representations[c.idx] = []
@@ -70,7 +70,7 @@ class RepresentationAssigner:
 
         for cid, reps in representations.items():
             for rep in reps:
-                rep.render()
+                rep.render(sloleks_db=sloleks_db)
 
         for cid, reps in representations.items():
             reps = [rep.rendition_text for rep in reps]
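
The sloleks_db argument threaded through set_representations and render above is duck-typed: anything exposing get_word_form(lemma, msd, data, align_msd=False) and returning an (msd, lemma, text) triple satisfies the contract. A hypothetical stub along these lines could stand in for the real database in tests:

    class StubSloleks:
        def __init__(self, forms):
            self.forms = forms  # {(lemma, msd): (msd, lemma, text)}

        def get_word_form(self, lemma, msd, data, align_msd=False):
            return self.forms.get((lemma, msd), (None, None, None))
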
diff --git a/src/sloleks_db.py b/src/sloleks_db.py
new file mode 100644
index 0000000..953048d
--- /dev/null
+++ b/src/sloleks_db.py
@@ -0,0 +1,144 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import Session, aliased
+from sqlalchemy import create_engine
+
+from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
+
+
+class SloleksDatabase:
+    def __init__(self, db):
+        # the ORM classes are built at runtime from the reflected tables,
+        # so they are published as module-level globals for the queries below
+        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+        [db_user, db_password, db_database, db_host] = db.split(':')
+
+        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
+                               pool_recycle=14400)
+        Base = declarative_base()
+        Base.metadata.reflect(engine)
+
+        class Lexeme(Base):
+            __table__ = Base.metadata.tables['jedro_lexeme']
+
+        class LexemeFeature(Base):
+            __table__ = Base.metadata.tables['jedro_lexeme_feature']
+
+        class SyntacticStructure(Base):
+            __table__ = Base.metadata.tables['jedro_syntacticstructure']
+
+        class StructureComponent(Base):
+            __table__ = Base.metadata.tables['jedro_structurecomponent']
+
+        class Feature(Base):
+            __table__ = Base.metadata.tables['jedro_feature']
+
+        class LexicalUnitLexeme(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
+
+        class LexicalUnit(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunit']
+
+        class LexicalUnitType(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunittype']
+
+        class Category(Base):
+            __table__ = Base.metadata.tables['jedro_category']
+
+        class Sense(Base):
+            __table__ = Base.metadata.tables['jedro_sense']
+
+        class Measure(Base):
+            __table__ = Base.metadata.tables['jedro_measure']
+
+        class LexicalUnitMeasure(Base):
+            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
+
+        class Corpus(Base):
+            __table__ = Base.metadata.tables['jedro_corpus']
+
+        class Definition(Base):
+            __table__ = Base.metadata.tables['jedro_definition']
+
+        class WordForm(Base):
+            __table__ = Base.metadata.tables['jedro_wordform']
+
+        class WordFormFeature(Base):
+            __table__ = Base.metadata.tables['jedro_wordform_feature']
+
+        class FormRepresentation(Base):
+            __table__ = Base.metadata.tables['jedro_formrepresentation']
+
+        self.session = Session(engine)
+
+    def close(self):
+        self.session.close()
+
+    def decypher_msd(self, msd):
+        # translate the positional MSD attributes needed for a word-form
+        # lookup into the feature values stored in Sloleks; gender is
+        # skipped for nouns and verbs, and verb person is fixed to third
+        t = msd[0]
+        decypher = []
+        if t == 'N':
+            number = CODES_TRANSLATION[t][3][msd[3]]
+            case = CODES_TRANSLATION[t][4][msd[4]]
+            decypher = [number, case]
+        elif t == 'V':
+            vform = CODES_TRANSLATION[t][3][msd[3]]
+            number = CODES_TRANSLATION[t][5][msd[5]]
+            person = 'third'
+            decypher = [vform, number, person]
+        elif t == 'A':
+            gender = CODES_TRANSLATION[t][3][msd[3]]
+            number = CODES_TRANSLATION[t][4][msd[4]]
+            case = CODES_TRANSLATION[t][5][msd[5]]
+            decypher = [gender, number, case]
+
+        return decypher
+
+    def get_word_form(self, lemma, msd, data, align_msd=False):
+        # modify the msd as required by the structure definition
+        msd = list(msd)
+
+        if not align_msd and 'msd' in data:
+            for key, value in data['msd'].items():
+                t = msd[0]
+                v = TAGSET[t].index(key.lower())
+                msd[v + 1] = CODES[value]
+
+        elif 'agreement' in data:
+            align_msd = list(align_msd)
+            t_align_msd = align_msd[0]
+            t = msd[0]
+
+            for att in data['agreement']:
+                v_align_msd = TAGSET[t_align_msd].index(att.lower())
+                v = TAGSET[t].index(att.lower())
+                # fix for verbs with short msds
+                if v >= len(msd):
+                    return None, None, None
+                msd[v + 1] = align_msd[v_align_msd + 1]
+
+        decypher_msd = self.decypher_msd(msd)
+
+        if not decypher_msd:
+            return None, None, None
+
+        # one aliased WordFormFeature join per required feature
+        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
+        query_preposition = self.session.query(FormRepresentation.form) \
+            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
+            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
+
+        for wf in wfs:
+            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
+
+        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
+
+        for wf, msd_el in zip(wfs, decypher_msd):
+            query_preposition = query_preposition.filter(wf.value == msd_el)
+
+        pattern_translation_hws = query_preposition.all()
+        if len(pattern_translation_hws) > 0:
+            return ''.join(msd), lemma, pattern_translation_hws[0][0]
+        return None, None, None
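
A usage sketch for SloleksDatabase, assuming reachable credentials in the user:password:db_name:db_host form expected by __init__ (the values below are placeholders, and reflection connects to PostgreSQL immediately, so this only runs against a live database). decypher_msd itself is a pure dictionary lookup:

    from sloleks_db import SloleksDatabase

    db = SloleksDatabase('user:password:sloleks:localhost')
    print(db.decypher_msd('Ncmsn'))   # ['singular', 'nominative']
    print(db.decypher_msd('Xf'))      # [] -> get_word_form returns (None, None, None)
    # look up the feminine singular genitive form of the noun 'hiša'
    msd, lemma, form = db.get_word_form('hiša', 'Ncfsg', data={})
    db.close()
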
diff --git a/src/syntactic_structure.py b/src/syntactic_structure.py
index 6cf4d28..222fe35 100644
--- a/src/syntactic_structure.py
+++ b/src/syntactic_structure.py
@@ -14,7 +14,7 @@ class SyntacticStructure:
     @staticmethod
     def from_xml(xml):
         st = SyntacticStructure()
-        st.id = xml.get('id')
+        st.id = xml.get('id_nsss')
         st.lbs = xml.get('LBS')
 
         assert len(list(xml)) == 1
diff --git a/src/wani.py b/src/wani.py
index d285c62..9eeb9cf 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -11,6 +11,7 @@ import concurrent.futures
 import tempfile
 
 from progress_bar import progress
+from sloleks_db import SloleksDatabase
 from word import Word
 from syntactic_structure import build_structures
 from match_store import MatchStore
@@ -20,16 +21,20 @@ from loader import load_files
 from database import Database
 from time_info import TimeInfo
 
+from postprocessor import Postprocessor
-
-def match_file(words, structures):
+
+
+def match_file(words, structures, postprocessor):
    matches = {s: [] for s in structures}
 
     for s in progress(structures, "matching"):
         for w in words:
             mhere = s.match(w)
             for match in mhere:
-                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
+                # build with lists (not tuples) so s/z and k/h can be fixed in place
+                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                 colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
+                match, colocation_id = postprocessor.process(match, colocation_id)
                 colocation_id = tuple(colocation_id)
 
                 matches[s].append((match, colocation_id))
@@ -38,6 +43,7 @@
 
 
 def main(args):
+    sloleks_db = SloleksDatabase(args.sloleks_db)
     structures, lemma_msds, max_num_components = build_structures(args)
     timeinfo = TimeInfo(len(args.input))
 
@@ -51,7 +57,11 @@ def main(args):
             continue
 
         start_time = time.time()
-        matches = match_file(words, structures)
+        postprocessor = Postprocessor()
+        matches = match_file(words, structures, postprocessor)
+
+        # single-letter prepositions (s/z and k/h) are already normalized
+        # by the postprocessor inside match_file
         match_store.add_matches(matches)
         word_stats.add_words(words)
         database.commit()
@@ -74,7 +84,7 @@ def main(args):
 
     # figure out representations!
     if args.out or args.out_no_stat:
-        match_store.set_representations(word_stats, structures)
+        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
 
     Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
         structures, match_store)
@@ -85,6 +95,10 @@ def main(args):
         Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
             structures, match_store)
 
+    # release the database connection held by the Sloleks session
+    sloleks_db.close()
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description='Extract structures from a parsed corpus.')
@@ -92,6 +106,7 @@ if __name__ == '__main__':
                         help='Structures definitions in xml file')
     parser.add_argument('input',
                         help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
+    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials in user:password:db_name:db_host form')
     parser.add_argument('--out',
                         help='Classic output file')
     parser.add_argument('--out-no-stat',
@@ -100,7 +115,7 @@ if __name__ == '__main__':
                         help='Additional output file, writes more data')
     parser.add_argument('--stats',
                         help='Output file for statistics')
-
+# parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
     parser.add_argument('--no-msd-translate',
                         help='MSDs are translated from slovene to english by default', action='store_true')
     parser.add_argument('--skip-id-check',
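
With the new flag wired in, a run that renders representations against Sloleks might look like this (file names and credentials are hypothetical):

    python wani.py structures.xml corpus.xml \
        --out collocations.csv \
        --sloleks_db user:password:sloleks:localhost
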
diff --git a/src/word.py b/src/word.py
index bbf3889..f30522c 100644
--- a/src/word.py
+++ b/src/word.py
@@ -14,6 +14,16 @@
         return None
 
+
+class WordDummy:
+    def __init__(self, msd, lemma, text):
+        self.msd = msd
+        self.lemma = lemma
+        self.text = text
+
+    def most_frequent_text(self, word_renderer):
+        return word_renderer.render(self.lemma, self.msd)
+
 
 class Word:
     def __init__(self, lemma, msd, wid, text, do_msd_translate):
         self.lemma = lemma
@@ -29,7 +39,7 @@ class Word:
         self.int_id = int(last_num)
 
         assert None not in (self.id, self.lemma, self.msd)
-    
+
     @staticmethod
     def from_xml(xml, do_msd_translate):
         lemma = xml.get('lemma')
@@ -41,10 +51,10 @@ class Word:
     @staticmethod
     def get_msd(comp):
         d = dict(comp.items())
-        if 'msd' in d:
-            return d['msd']
-        elif 'ana' in d:
+        if 'ana' in d:
             return d['ana'][4:]
+        elif 'msd' in d:
+            return d['msd']
         else:
             logging.error(d)
             raise NotImplementedError("MSD?")
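
A small check of the reordered Word.get_msd precedence, run from src/: 'ana' now wins over 'msd' when both attributes are present, and the [4:] slice assumes a fixed four-character prefix such as 'mte:' (an assumption based on the slicing, not stated in the patch):

    import xml.etree.ElementTree as ET
    from word import Word

    w = ET.fromstring('<w ana="mte:Ncfsn" msd="Ncfsn" lemma="hiša">hiša</w>')
    print(Word.get_msd(w))   # -> 'Ncfsn', taken from ana with the prefix stripped
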