Implement the remaining stats (Delta, LogDice_core, LogDice_all, Distinct_forms); results still need verification.
This commit is contained in:
		
							parent
							
								
									d7f97ba9b3
								
							
						
					
					
						commit
						9ccbd02603
					
				
							
								
								
									
										53
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										53
									
								
								wani.py
									
									
									
									
									
								
							| @ -10,6 +10,7 @@ import time | ||||
| import subprocess | ||||
| import concurrent.futures | ||||
| import tempfile | ||||
| from math import log2 | ||||
| 
 | ||||
| from msd_translate import MSD_TRANSLATE | ||||
| 
 | ||||
| @ -401,7 +402,7 @@ class ComponentRendition: | ||||
|         for cid, reps in representations.items(): | ||||
|             for rep in reps: | ||||
|                 rep.render() | ||||
| 
 | ||||
|          | ||||
|         for cid, reps in representations.items(): | ||||
|             reps = [rep.rendition_text for rep in reps] | ||||
|             if reps == []: | ||||
| @ -1126,19 +1127,19 @@ class StatsFormatter(Formatter): | ||||
|         jppb_forms = set() | ||||
|         self.stats = {"freq": {}} | ||||
| 
 | ||||
|         for words in match.matches: | ||||
|             cw1 = words[self.jppb[0]] | ||||
|             cw2 = words[self.jppb[1]] | ||||
|             jppb_forms.add((cw1.text, cw2.text)) | ||||
|         for cid in self.corew: | ||||
|             if cid not in match.matches[0]: | ||||
|                 freq = 0 | ||||
|             else: | ||||
|                 word = match.matches[0][cid] | ||||
|                 freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] | ||||
| 
 | ||||
|         for cid, word in match.matches[0].items(): | ||||
|             if cid in self.corew: | ||||
|                 self.stats["freq"][cid] = self.word_renderer.num_words[( | ||||
|                     word.text, word.msd[0])] | ||||
|             self.stats["freq"][cid] = freq | ||||
| 
 | ||||
|         self.stats['fc'] = match.distinct_forms() | ||||
|         self.stats['fc'] = len(jppb_forms) | ||||
|         self.stats['n'] = self.word_renderer.num_all_words() | ||||
|         self.stats['df'] = match.distinct_forms() | ||||
|         self.stats['fcxy'] = len(match) | ||||
|         self.stats['n'] = len(jppb_forms) | ||||
|         self.stats['N'] = self.word_renderer.num_all_words() | ||||
| 
 | ||||
|     def header_repeat(self): | ||||
|         return ["Distribution", "Delta"] | ||||
| @ -1147,18 +1148,36 @@ class StatsFormatter(Formatter): | ||||
|         return ["LogDice_core", "LogDice_all", "Distinct_forms"] | ||||
|      | ||||
|     def content_repeat(self, words, representations, idx, sidx): | ||||
|         # not a core word | ||||
|         if idx not in self.corew: | ||||
|             return [""] * self.length() | ||||
| 
 | ||||
|         word = words[idx] | ||||
|         key = (sidx, idx, word.lemma) | ||||
|         distribution = self.colocation_ids.dispersions[key] | ||||
| 
 | ||||
|         # TODO... | ||||
|         delta = "?" | ||||
|         delta = "" | ||||
|         if idx in self.jppb: | ||||
|             idx2 = self.jppb[0] if self.jppb[0] != idx else self.jppb[1] | ||||
|             fx = self.stats['freq'][idx] | ||||
|             fy = self.stats['freq'][idx2] | ||||
|             fxy = self.stats['fcxy'] | ||||
|             N = self.stats['N'] | ||||
|             delta = fxy / fx - (fy - fxy) / (N - fx) | ||||
| 
 | ||||
|         return [str(distribution), delta] | ||||
|         return [str(distribution), str(delta)] | ||||
|      | ||||
|     def content_right(self, freq): | ||||
|         # TODO... | ||||
|         return ["?"] * 3 | ||||
|         fx = self.stats['freq'][self.jppb[0]] | ||||
|         fy = self.stats['freq'][self.jppb[1]] | ||||
|         fxy = self.stats['fcxy'] | ||||
|         logdice_core = 14 + log2(2 * fxy / (fx + fy)) | ||||
| 
 | ||||
|         sum_fi = sum(self.stats['freq'][idx] for idx in self.corew) | ||||
|         fc = fxy | ||||
|         logdice_all = 14 + log2(len(self.corew) * fc / sum_fi) | ||||
| 
 | ||||
|         return [str(logdice_core), str(logdice_all), str(self.stats['df'])] | ||||
|      | ||||
|     def group(self): | ||||
|         return True | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user