Common MSD is now based on (lemma, msd0), not only on lemma #757-127
This commit is contained in:
		
							parent
							
								
									8c20295adf
								
							
						
					
					
						commit
						47340fe80c
					
				| @ -18,12 +18,12 @@ class WordStats: | |||||||
|             text varchar(64),  |             text varchar(64),  | ||||||
|             frequency int |             frequency int | ||||||
|             )""") |             )""") | ||||||
|         self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") |         self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))") | ||||||
|         self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") |         self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") | ||||||
| 
 | 
 | ||||||
|         self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") |         self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") | ||||||
|         self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") |         self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") | ||||||
|         self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") |         self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)") | ||||||
|         self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") |         self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") | ||||||
| 
 | 
 | ||||||
|     def add_words(self, words): |     def add_words(self, words): | ||||||
| @ -46,12 +46,20 @@ class WordStats: | |||||||
|         lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] |         lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] | ||||||
| 
 | 
 | ||||||
|         for lemma in progress(lemmas, 'common-msd'): |         for lemma in progress(lemmas, 'common-msd'): | ||||||
|             common_msd = "*" * 10 |             common_msds = defaultdict(lambda: "*" * 10) | ||||||
|             for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): | 
 | ||||||
|                 common_msd = self.merge_msd(common_msd, msd[0]) |             for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): | ||||||
|             common_msd = self.common_lemma_msd(lemma, common_msd) |                 msd = msd[0] | ||||||
|  |                 current_msd = common_msds[msd[0]] | ||||||
|  |                 new_msd = self.merge_msd(current_msd, msd) | ||||||
|  |                 common_msds[msd[0]] = new_msd | ||||||
|  | 
 | ||||||
|  |             for msd0, common_msd in common_msds.items(): | ||||||
|  |                 common_msd = self.common_lemma_msd(lemma, common_msd) | ||||||
|  |                 self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",  | ||||||
|  |                                 (lemma, msd0, common_msd)) | ||||||
|  |              | ||||||
| 
 | 
 | ||||||
|             self.db.execute("INSERT INTO CommonMsd (lemma, msd) VALUES (?, ?)", (lemma, common_msd)) |  | ||||||
|         self.db.commit() |         self.db.commit() | ||||||
| 
 | 
 | ||||||
|         for lemma in progress(lemmas, 'word-count'): |         for lemma in progress(lemmas, 'word-count'): | ||||||
| @ -118,7 +126,9 @@ class WordStats: | |||||||
| 
 | 
 | ||||||
|     def get_lemma_msd(self, lemma, word_msd): |     def get_lemma_msd(self, lemma, word_msd): | ||||||
|         # should be here, since we collect every lemma |         # should be here, since we collect every lemma | ||||||
|         lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=?", (lemma,)).fetchone()[0] |         msd0 = word_msd[0] | ||||||
|  |         lemma_msd = self.db.execute("SELECT msd FROM CommonMsd WHERE lemma=? AND msd0=?",  | ||||||
|  |                                     (lemma, msd0)).fetchone()[0] | ||||||
| 
 | 
 | ||||||
|         if lemma_msd[0] == '-': |         if lemma_msd[0] == '-': | ||||||
|             if word_msd[0] in self.lemma_features: |             if word_msd[0] in self.lemma_features: | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user