Implementing the rest of stats. Maybe ok?

2019-06-10 00:25:36 +02:00 · 2019-06-10 00:25:36 +02:00 · 9ccbd02603
commit 9ccbd02603
parent d7f97ba9b3
1 changed files with 36 additions and 17 deletions
--- a/wani.py
+++ b/wani.py
@ -10,6 +10,7 @@ import time
 import subprocess
 import concurrent.futures
 import tempfile
+from math import log2

 from msd_translate import MSD_TRANSLATE

@ -401,7 +402,7 @@ class ComponentRendition:
        for cid, reps in representations.items():
            for rep in reps:
                rep.render()
-
+        
        for cid, reps in representations.items():
            reps = [rep.rendition_text for rep in reps]
            if reps == []:
@ -1126,19 +1127,19 @@ class StatsFormatter(Formatter):
        jppb_forms = set()
        self.stats = {"freq": {}}

-        for words in match.matches:
-            cw1 = words[self.jppb[0]]
-            cw2 = words[self.jppb[1]]
-            jppb_forms.add((cw1.text, cw2.text))
+        for cid in self.corew:
+            if cid not in match.matches[0]:
+                freq = 0
+            else:
+                word = match.matches[0][cid]
+                freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]

-        for cid, word in match.matches[0].items():
-            if cid in self.corew:
-                self.stats["freq"][cid] = self.word_renderer.num_words[(
-                    word.text, word.msd[0])]
+            self.stats["freq"][cid] = freq

-        self.stats['fc'] = match.distinct_forms()
-        self.stats['fc'] = len(jppb_forms)
-        self.stats['n'] = self.word_renderer.num_all_words()
+        self.stats['df'] = match.distinct_forms()
+        self.stats['fcxy'] = len(match)
+        self.stats['n'] = len(jppb_forms)
+        self.stats['N'] = self.word_renderer.num_all_words()

    def header_repeat(self):
        return ["Distribution", "Delta"]
@ -1147,18 +1148,36 @@ class StatsFormatter(Formatter):
        return ["LogDice_core", "LogDice_all", "Distinct_forms"]
    
    def content_repeat(self, words, representations, idx, sidx):
+        """Return the per-component statistics cells for component ``idx``.
+
+        ``words`` is indexed by component id, ``sidx`` is the structure index
+        used in the dispersion key; ``representations`` is not read here.
+
+        Returns ``[""] * self.length()`` when ``idx`` is not a core word,
+        otherwise ``[distribution, delta]`` as strings.  ``delta`` is left
+        empty when ``idx`` is not one of the two ``self.jppb`` components or
+        when the measure is undefined (zero denominator).
+        """
+        # not a core word
+        if idx not in self.corew:
+            return [""] * self.length()
+
        word = words[idx]
        key = (sidx, idx, word.lemma)
        distribution = self.colocation_ids.dispersions[key]

-        # TODO...
-        delta = "?"
+        delta = ""
+        if idx in self.jppb:
+            # the other member of the jppb pair
+            idx2 = self.jppb[0] if self.jppb[0] != idx else self.jppb[1]
+            fx = self.stats['freq'][idx]
+            fy = self.stats['freq'][idx2]
+            fxy = self.stats['fcxy']
+            N = self.stats['N']
+            # delta = P(y|x) - P(y|not x).  A frequency is stored as 0 when
+            # the core word is missing from the first match, so both
+            # denominators can be zero; keep the cell empty instead of
+            # raising ZeroDivisionError.
+            if fx > 0 and N > fx:
+                delta = fxy / fx - (fy - fxy) / (N - fx)

-        return [str(distribution), delta]
+        return [str(distribution), str(delta)]
    
    def content_right(self, freq):
-        # TODO...
-        return ["?"] * 3
+        """Return the per-collocation cells: LogDice_core, LogDice_all, Distinct_forms.
+
+        All inputs come from ``self.stats``; the ``freq`` argument is not read.
+        A logDice cell is left empty when it is undefined, i.e. when the
+        co-occurrence count or the summed word frequencies are zero.
+        """
+        fx = self.stats['freq'][self.jppb[0]]
+        fy = self.stats['freq'][self.jppb[1]]
+        fxy = self.stats['fcxy']
+        sum_fi = sum(self.stats['freq'][idx] for idx in self.corew)
+
+        # log2() raises ValueError for 0 and the ratio raises
+        # ZeroDivisionError for a zero frequency sum (frequencies are stored
+        # as 0 for core words missing from the first match), so only compute
+        # the score when both counts are positive.
+        logdice_core = ""
+        if fxy > 0 and fx + fy > 0:
+            logdice_core = 14 + log2(2 * fxy / (fx + fy))
+
+        # sum_fi > 0 implies self.corew is non-empty, so the argument is > 0
+        logdice_all = ""
+        if fxy > 0 and sum_fi > 0:
+            logdice_all = 14 + log2(len(self.corew) * fxy / sum_fi)
+
+        return [str(logdice_core), str(logdice_all), str(self.stats['df'])]
    
    def group(self):
        return True