From 9ccbd0260392eae54adcc8a3f7c308ffd50fc689 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Mon, 10 Jun 2019 00:25:36 +0200
Subject: [PATCH] Implementing the rest of stats. Maybe ok?

---
 wani.py | 53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/wani.py b/wani.py
index 13ffe2f..d3a5d2e 100644
--- a/wani.py
+++ b/wani.py
@@ -10,6 +10,7 @@ import time
 import subprocess
 import concurrent.futures
 import tempfile
+from math import log2
 
 from msd_translate import MSD_TRANSLATE
 
@@ -401,7 +402,7 @@ class ComponentRendition:
         for cid, reps in representations.items():
             for rep in reps:
                 rep.render()
-        
+
         for cid, reps in representations.items():
             reps = [rep.rendition_text for rep in reps]
             if reps == []:
@@ -1126,19 +1127,19 @@ class StatsFormatter(Formatter):
         jppb_forms = set()
         self.stats = {"freq": {}}
 
-        for words in match.matches:
-            cw1 = words[self.jppb[0]]
-            cw2 = words[self.jppb[1]]
-            jppb_forms.add((cw1.text, cw2.text))
+        for cid in self.corew:
+            if cid not in match.matches[0]:
+                freq = 0
+            else:
+                word = match.matches[0][cid]
+                freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]
 
-        for cid, word in match.matches[0].items():
-            if cid in self.corew:
-                self.stats["freq"][cid] = self.word_renderer.num_words[(
-                    word.text, word.msd[0])]
+            self.stats["freq"][cid] = freq
 
-        self.stats['fc'] = match.distinct_forms()
-        self.stats['fc'] = len(jppb_forms)
-        self.stats['n'] = self.word_renderer.num_all_words()
+        self.stats['df'] = match.distinct_forms()
+        self.stats['fcxy'] = len(match)
+        self.stats['n'] = len(jppb_forms)
+        self.stats['N'] = self.word_renderer.num_all_words()
 
     def header_repeat(self):
         return ["Distribution", "Delta"]
@@ -1147,18 +1148,36 @@ class StatsFormatter(Formatter):
         return ["LogDice_core", "LogDice_all", "Distinct_forms"]
 
     def content_repeat(self, words, representations, idx, sidx):
+        # not a core word
+        if idx not in self.corew:
+            return [""] * self.length()
+
         word = words[idx]
         key = (sidx, idx, word.lemma)
         distribution = self.colocation_ids.dispersions[key]
 
-        # TODO...
-        delta = "?"
+        delta = ""
+        if idx in self.jppb:
+            idx2 = self.jppb[0] if self.jppb[0] != idx else self.jppb[1]
+            fx = self.stats['freq'][idx]
+            fy = self.stats['freq'][idx2]
+            fxy = self.stats['fcxy']
+            N = self.stats['N']
+            delta = fxy / fx - (fy - fxy) / (N - fx)
 
-        return [str(distribution), delta]
+        return [str(distribution), str(delta)]
 
     def content_right(self, freq):
-        # TODO...
-        return ["?"] * 3
+        fx = self.stats['freq'][self.jppb[0]]
+        fy = self.stats['freq'][self.jppb[1]]
+        fxy = self.stats['fcxy']
+        logdice_core = 14 + log2(2 * fxy / (fx + fy))
+
+        sum_fi = sum(self.stats['freq'][idx] for idx in self.corew)
+        fc = fxy
+        logdice_all = 14 + log2(len(self.corew) * fc / sum_fi)
+
+        return [str(logdice_core), str(logdice_all), str(self.stats['df'])]
 
     def group(self):
         return True