Merge branch 'sqlite'
This commit is contained in:
		
						commit
						fa8a5e55f8
					
				| @ -119,7 +119,7 @@ class StatsFormatter(Formatter): | ||||
|                 freq = 0 | ||||
|             else: | ||||
|                 word = match.matches[0][cid] | ||||
|                 freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] | ||||
|                 freq = self.word_renderer.num_words(word.lemma, word.msd[0]) | ||||
| 
 | ||||
|             self.stats["freq"][cid] = freq | ||||
| 
 | ||||
|  | ||||
| @ -52,7 +52,8 @@ class WordFormAnyCR(ComponentRepresentation): | ||||
|         words_counter = [] | ||||
|         for word in self.words: | ||||
|             words_counter.append((word.msd, word.lemma)) | ||||
|         sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x)) | ||||
|         sorted_words = sorted( | ||||
|             set(words_counter), key=lambda x: -words_counter.count(x) + (sum(ord(l) for l in x[1]) / 1e5 if x[1] is not None else .5)) | ||||
| 
 | ||||
|         for word_msd, word_lemma in sorted_words: | ||||
|             for agr in self.agreement: | ||||
|  | ||||
| @ -1,65 +1,81 @@ | ||||
| from collections import defaultdict, Counter | ||||
| 
 | ||||
| from progress_bar import progress | ||||
| import sqlite3 | ||||
| 
 | ||||
| 
 | ||||
| class WordStats: | ||||
|     def __init__(self, lemma_features): | ||||
|         self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) | ||||
|         self.all_words = 0 | ||||
| 
 | ||||
|         self.rendered_words = {} | ||||
|         self.frequent_words = {} | ||||
|         self.num_words = {} | ||||
|         self.lemma_msd = {} | ||||
|         self.lemma_features = lemma_features | ||||
| 
 | ||||
|         self.all_words = 0 | ||||
|         self.memoized_msd_merges = {} | ||||
| 
 | ||||
|         with open("sqlite.db", 'w') as fp: | ||||
|             fp.write("") | ||||
| 
 | ||||
|         self.db = sqlite3.connect('sqlite.db') | ||||
|         self.db.execute("""CREATE TABLE UniqWords ( | ||||
|             uw_id INTEGER PRIMARY KEY,  | ||||
|             lemma varchar(64),  | ||||
|             msd varchar(16),  | ||||
|             text varchar(64),  | ||||
|             frequency int | ||||
|             )""") | ||||
|         self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") | ||||
|         self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") | ||||
| 
 | ||||
|         self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") | ||||
|         self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") | ||||
|         self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") | ||||
|         self.db.execute("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") | ||||
| 
 | ||||
|     def add_words(self, words): | ||||
|         for w in words: | ||||
|             self.raw_data[w.lemma][w.msd][w.text] += 1 | ||||
|         for w in progress(words, "adding-words"): | ||||
|             params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text} | ||||
|             res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1 | ||||
|                 WHERE lemma=:lemma AND msd=:msd AND text=:text""", params) | ||||
| 
 | ||||
|             if res.rowcount == 0: | ||||
|                 self.db.execute("""INSERT INTO UniqWords (lemma, msd, text, frequency)  | ||||
|                     VALUES (:lemma, :msd, :text, 1)""", params) | ||||
| 
 | ||||
|         self.db.commit() | ||||
|         self.all_words += len(words) | ||||
| 
 | ||||
|     def num_all_words(self): | ||||
|         return self.all_words | ||||
| 
 | ||||
|     def generate_renders(self): | ||||
|         num_words = defaultdict(int) | ||||
|         for lemma, ld in progress(self.raw_data.items(), "lemma-render"): | ||||
|             self.rendered_words[lemma] = {} | ||||
|             freq_words = defaultdict(int) | ||||
|         lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] | ||||
| 
 | ||||
|         for lemma in progress(lemmas, 'common-msd'): | ||||
|             common_msd = "*" * 10 | ||||
|             for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): | ||||
|                 common_msd = self.merge_msd(common_msd, msd[0]) | ||||
|             common_msd = self.common_lemma_msd(lemma, common_msd) | ||||
| 
 | ||||
|             for msd, text_counters in ld.items(): | ||||
|                 num = sum(text_counters.values()) | ||||
|             self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd)) | ||||
|         self.db.commit() | ||||
| 
 | ||||
|                 # TODO: this should be out of generate_renders... | ||||
|                 num_words[(lemma, msd[0])] += num | ||||
|         for lemma in progress(lemmas, 'word-count'): | ||||
|             num_words = defaultdict(int) | ||||
|             for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)): | ||||
|                 num_words[msd[0]] += freq | ||||
|                  | ||||
|             for msd0, freq in num_words.items(): | ||||
|                 self.db.execute("INSERT INTO WordCount (lemma, msd0, frequency) VALUES (?,?,?)", | ||||
|                     (lemma, msd0, freq)) | ||||
|         self.db.commit() | ||||
| 
 | ||||
|                 rep = max(text_counters, key=text_counters.get) | ||||
|                 self.rendered_words[lemma][msd] = (rep, num) | ||||
| 
 | ||||
|                 for txt, n in text_counters.items(): | ||||
|                     freq_words[(msd, txt)] += n | ||||
| 
 | ||||
|                 common_msd = self.merge_msd(common_msd, msd) | ||||
| 
 | ||||
|             self.lemma_msd[lemma] = common_msd | ||||
| 
 | ||||
|             self.frequent_words[lemma] = [] | ||||
|             for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]): | ||||
|                 self.frequent_words[lemma].append((msd, txt, n)) | ||||
| 
 | ||||
|         self.num_words = dict(num_words) | ||||
| 
 | ||||
|         #next, determine lemma's default msds | ||||
|     def common_lemma_msd(self, lemma, msd): | ||||
|         lf = self.lemma_features | ||||
|         for lemma in self.lemma_msd: | ||||
|             cmsd = self.lemma_msd[lemma] | ||||
|             if cmsd[0] in lf: | ||||
|                 self.lemma_msd[lemma] = "".join( | ||||
|                     l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd) | ||||
|                 ) | ||||
|         if msd[0] in lf: | ||||
|             return "".join( | ||||
|                 l1 if l1 != "-" else l2 for l1, l2 in zip(lf[msd[0]], msd) | ||||
|             ) | ||||
|         else: | ||||
|             return msd | ||||
| 
 | ||||
|     def merge_msd(self, common_msd, new_msd): | ||||
|         key = (common_msd, new_msd) | ||||
| @ -79,23 +95,34 @@ class WordStats: | ||||
|         return value | ||||
| 
 | ||||
|     def render(self, lemma, msd): | ||||
|         if lemma in self.rendered_words: | ||||
|             if msd in self.rendered_words[lemma]: | ||||
|                 return self.rendered_words[lemma][msd][0] | ||||
|         statement = """SELECT msd, frequency FROM UniqWords WHERE  | ||||
|         lemma=:lemma AND msd=:msd ORDER BY frequency DESC""" | ||||
| 
 | ||||
|         cur = self.db.execute(statement, {"lemma": lemma, "msd": msd}) | ||||
|         if cur.rowcount > 0: | ||||
|             return cur.fetchone()[0] | ||||
| 
 | ||||
|     def available_words(self, lemma, existing_texts): | ||||
|         counted_texts = Counter(existing_texts) | ||||
|         for (msd, text), _n in counted_texts.most_common(): | ||||
|             yield (msd, text) | ||||
| 
 | ||||
|         if lemma in self.frequent_words: | ||||
|             for msd, text, _ in self.frequent_words[lemma]: | ||||
|                 if (msd, text) not in counted_texts: | ||||
|                     yield (msd, text) | ||||
|         statement = """SELECT msd, text, frequency FROM UniqWords WHERE  | ||||
|         lemma=:lemma ORDER BY frequency DESC""" | ||||
|         for msd, text, _f in self.db.execute(statement, {'lemma': lemma}): | ||||
|             if (msd, text) not in counted_texts: | ||||
|                 yield (msd, text) | ||||
|      | ||||
|     def num_words(self, lemma, msd0): | ||||
|         statement = "SELECT frequency FROM WordCount WHERE lemma=? AND msd0=? LIMIT 1" | ||||
|         cur = self.db.execute(statement, (lemma, msd0)) | ||||
|         result = cur.fetchone()[0] | ||||
|         return result | ||||
| 
 | ||||
| 
 | ||||
|     def get_lemma_msd(self, lemma, word_msd): | ||||
|         # should be here, since we collect every lemmas | ||||
|         lemma_msd = self.lemma_msd[lemma] | ||||
|         lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0] | ||||
| 
 | ||||
|         if lemma_msd[0] == '-': | ||||
|             if word_msd[0] in self.lemma_features: | ||||
| @ -103,4 +130,4 @@ class WordStats: | ||||
|             else: | ||||
|                 return '-' | ||||
|         else: | ||||
|             return lemma_msd | ||||
|             return lemma_msd | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user