luscenje_struktur/src/word_stats.py

from collections import defaultdict, Counter

from progress_bar import progress


class WordStats:
    """Word-frequency statistics kept in a SQL database.

    Two tables are created on construction:

    * ``UniqWords`` -- one row per distinct (lemma, msd, text) triple
      together with its occurrence count.
    * ``WordCount`` -- per (lemma, first msd character) totals, filled in
      by :meth:`generate_renders`.

    The ``db`` object must provide ``init``, ``execute`` and ``commit``.
    ``execute`` is expected to return a DB-API style cursor (iterable over
    rows, with ``rowcount`` and ``fetchone``).
    """

    def __init__(self, lemma_features, db):
        """Store collaborators and create the tables and indexes.

        Args:
            lemma_features: mapping keyed by the first character of an msd.
                Values are msd-like strings in which ``-`` marks a position
                to be taken from the word's own msd (see
                :meth:`common_lemma_msd`).
            db: database wrapper used for all queries.
        """
        self.lemma_features = lemma_features
        self.db = db

        # Running total of words passed to add_words().
        self.all_words = 0
        # Not used by any method of this class.
        self.memoized_msd_merges = {}

        self.db.init("""CREATE TABLE UniqWords (
            uw_id INTEGER PRIMARY KEY, 
            lemma varchar(64), 
            msd varchar(16), 
            text varchar(64), 
            frequency int
            )""")
        self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")

        self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
        self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
        self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")

    def add_words(self, words):
        """Count every word in ``words`` into ``UniqWords`` and commit.

        Args:
            words: sized collection of objects exposing ``lemma``, ``msd``
                and ``text`` attributes (``len(words)`` is taken at the end).
        """
        for w in progress(words, "adding-words"):
            params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
            # Try to bump an existing row first ...
            res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
                WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)

            # ... and create it with frequency 1 when nothing was updated.
            if res.rowcount == 0:
                self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency) 
                    VALUES (:lemma, :msd, :text, 1)""", params)

        self.db.commit()
        self.all_words += len(words)

    def num_all_words(self):
        """Return the total number of words passed to :meth:`add_words`."""
        return self.all_words

    def generate_renders(self):
        """Fill ``WordCount`` with per (lemma, first msd character) totals.

        For every distinct lemma in ``UniqWords`` the frequencies of its rows
        are summed per ``msd[0]`` and one ``WordCount`` row is inserted for
        each such group. Changes are committed once at the end.
        """
        # Materialise the lemma list before issuing further queries.
        lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
        for lemma in progress(lemmas, 'word-count'):
            num_words = defaultdict(int)
            for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
                num_words[msd[0]] += freq

            for msd0, freq in num_words.items():
                self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",
                    (lemma, msd0, freq))
        self.db.commit()

    def common_lemma_msd(self, lemma, msd):
        """Merge ``msd`` with the feature template for its first character.

        When ``msd[0]`` is a key of ``lemma_features``, the template and
        ``msd`` are walked in parallel: template characters are kept, except
        ``-`` which is replaced by the character of ``msd`` at that position.
        The result is as long as the shorter of the two strings. Without a
        template, ``msd`` is returned unchanged.

        Args:
            lemma: unused; kept for interface compatibility.
            msd: non-empty msd string.
        """
        lf = self.lemma_features
        if msd[0] in lf:
            return "".join(
                l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)
            )
        else:
            return msd

    def render(self, lemma, msd):
        """Return the first column of the most frequent matching row.

        Looks up ``UniqWords`` rows with the given ``lemma`` and ``msd``,
        ordered by descending frequency.

        Returns:
            The ``msd`` column of the top row, or ``None`` when no row
            matches.
        """
        # NOTE(review): the selected column is ``msd``, i.e. the value that
        # was passed in -- confirm whether ``text`` was intended instead.
        statement = """SELECT msd, frequency FROM UniqWords WHERE 
        lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""

        cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})
        # Do not consult ``cur.rowcount`` here: for SELECT statements it does
        # not report the number of result rows (sqlite3 always gives -1), so
        # the old ``rowcount > 0`` guard never let a result through.
        row = cur.fetchone()
        if row is None:
            return None
        return row[0]

    def available_words(self, lemma, existing_texts):
        """Yield ``(msd, text)`` candidates for ``lemma``.

        First every distinct pair from ``existing_texts`` is yielded, most
        common first. Then the ``UniqWords`` rows for ``lemma`` follow in
        order of descending frequency, skipping pairs already yielded.

        Args:
            lemma: lemma to look up in ``UniqWords``.
            existing_texts: iterable of hashable ``(msd, text)`` pairs.
        """
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        statement = """SELECT msd, text, frequency FROM UniqWords WHERE 
        lemma=:lemma ORDER BY frequency DESC"""
        for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):
            if (msd, text) not in counted_texts:
                yield (msd, text)

    def num_words(self, lemma, msd0):
        """Return the ``WordCount`` frequency for ``(lemma, msd0)``.

        Returns:
            The stored frequency, or ``0`` when there is no row for the
            pair (previously this case raised ``TypeError``).
        """
        statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"
        cur = self.db.execute(statement, (lemma, msd0))
        row = cur.fetchone()
        # No row means nothing was counted for this pair.
        return row[0] if row is not None else 0

    def get_lemma_msd(self, lemma, word_msd):
        """Return the msd to use for ``lemma`` given one of its word msds.

        Reads the ``msd`` stored in table ``CommonMsd`` for
        ``(lemma, word_msd[0])``. That table is not created by this class.
        If the stored value starts with ``-``, the ``lemma_features`` entry
        for ``word_msd[0]`` is returned instead, or ``'-'`` when there is no
        such entry.

        Raises:
            TypeError: if ``CommonMsd`` has no row for the pair (the result
                of ``fetchone()`` is subscripted unconditionally).
        """
        # should be here, since we collect every lemmas
        msd0 = word_msd[0]
        lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?", 
                                    (lemma, msd0)).fetchone()[0]

        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`from collections import defaultdict, Counter`

New progress bar 2019-06-17 15:30:51 +00:00			`from progress_bar import progress`


HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`class WordStats:`
adding separate database class 2019-06-27 10:37:23 +00:00			`def __init__(self, lemma_features, db):`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`self.lemma_features = lemma_features`
adding separate database class 2019-06-27 10:37:23 +00:00			`self.db = db`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00
			`self.all_words = 0`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`self.memoized_msd_merges = {}`

adding separate database class 2019-06-27 10:37:23 +00:00			`self.db.init("""CREATE TABLE UniqWords (`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`uw_id INTEGER PRIMARY KEY,`
			`lemma varchar(64),`
			`msd varchar(16),`
			`text varchar(64),`
			`frequency int`
			`)""")`
adding separate database class 2019-06-27 10:37:23 +00:00			`self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00
adding separate database class 2019-06-27 10:37:23 +00:00			`self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")`
			`self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")`
			`self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`def add_words(self, words):`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`for w in progress(words, "adding-words"):`
			`params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}`
			`res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1`
			`WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)`

			`if res.rowcount == 0:`
			`self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)`
			`VALUES (:lemma, :msd, :text, 1)""", params)`

			`self.db.commit()`
word stats now collected more memory-efficient 2019-06-15 20:20:20 +00:00			`self.all_words += len(words)`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def num_all_words(self):`
word stats now collected more memory-efficient 2019-06-15 20:20:20 +00:00			`return self.all_words`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def generate_renders(self):`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]`
			`for lemma in progress(lemmas, 'word-count'):`
			`num_words = defaultdict(int)`
			`for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):`
			`num_words[msd[0]] += freq`

			`for msd0, freq in num_words.items():`
			`self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)",`
			`(lemma, msd0, freq))`
			`self.db.commit()`

			`def common_lemma_msd(self, lemma, msd):`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`lf = self.lemma_features`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`if msd[0] in lf:`
			`return "".join(`
			`l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd)`
			`)`
			`else:`
			`return msd`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def render(self, lemma, msd):`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`statement = """SELECT msd, frequency FROM UniqWords WHERE`
			`lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""`

			`cur = self.db.execute(statement, {"lemma": lemma, "msd": msd})`
			`if cur.rowcount > 0:`
			`return cur.fetchone()[0]`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def available_words(self, lemma, existing_texts):`
			`counted_texts = Counter(existing_texts)`
			`for (msd, text), _n in counted_texts.most_common():`
			`yield (msd, text)`

word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`statement = """SELECT msd, text, frequency FROM UniqWords WHERE`
			`lemma=:lemma ORDER BY frequency DESC"""`
			`for msd, text, _f in self.db.execute(statement, {'lemma': lemma}):`
Skipping already used abailable words, stupid refactoring bug 2019-06-26 22:57:46 +00:00			`if (msd, text) not in counted_texts:`
			`yield (msd, text)`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00
			`def num_words(self, lemma, msd0):`
			`statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1"`
			`cur = self.db.execute(statement, (lemma, msd0))`
			`result = cur.fetchone()[0]`
			`return result`

HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def get_lemma_msd(self, lemma, word_msd):`
			`# should be here, since we collect every lemmas`
common msd now based on (lemma,msd0) not only lemma #757-127 2019-06-28 20:00:38 +00:00			`msd0 = word_msd[0]`
			`lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",`
			`(lemma, msd0)).fetchone()[0]`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`if lemma_msd[0] == '-':`
			`if word_msd[0] in self.lemma_features:`
			`return self.lemma_features[word_msd[0]]`
			`else:`
			`return '-'`
			`else:`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`return lemma_msd`