Implement the remaining stats (delta and logDice values); results still need to be verified.

This commit is contained in:
Ozbolt Menegatti 2019-06-10 00:25:36 +02:00
parent d7f97ba9b3
commit 9ccbd02603

53
wani.py
View File

@ -10,6 +10,7 @@ import time
import subprocess import subprocess
import concurrent.futures import concurrent.futures
import tempfile import tempfile
from math import log2
from msd_translate import MSD_TRANSLATE from msd_translate import MSD_TRANSLATE
@ -401,7 +402,7 @@ class ComponentRendition:
for cid, reps in representations.items(): for cid, reps in representations.items():
for rep in reps: for rep in reps:
rep.render() rep.render()
for cid, reps in representations.items(): for cid, reps in representations.items():
reps = [rep.rendition_text for rep in reps] reps = [rep.rendition_text for rep in reps]
if reps == []: if reps == []:
@ -1126,19 +1127,19 @@ class StatsFormatter(Formatter):
jppb_forms = set() jppb_forms = set()
self.stats = {"freq": {}} self.stats = {"freq": {}}
for words in match.matches: for cid in self.corew:
cw1 = words[self.jppb[0]] if cid not in match.matches[0]:
cw2 = words[self.jppb[1]] freq = 0
jppb_forms.add((cw1.text, cw2.text)) else:
word = match.matches[0][cid]
freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]
for cid, word in match.matches[0].items(): self.stats["freq"][cid] = freq
if cid in self.corew:
self.stats["freq"][cid] = self.word_renderer.num_words[(
word.text, word.msd[0])]
self.stats['fc'] = match.distinct_forms() self.stats['df'] = match.distinct_forms()
self.stats['fc'] = len(jppb_forms) self.stats['fcxy'] = len(match)
self.stats['n'] = self.word_renderer.num_all_words() self.stats['n'] = len(jppb_forms)
self.stats['N'] = self.word_renderer.num_all_words()
def header_repeat(self):
    """Return the two column titles, "Distribution" and "Delta"."""
    titles = ["Distribution", "Delta"]
    return titles
@ -1147,18 +1148,36 @@ class StatsFormatter(Formatter):
return ["LogDice_core", "LogDice_all", "Distinct_forms"] return ["LogDice_core", "LogDice_all", "Distinct_forms"]
def content_repeat(self, words, representations, idx, sidx):
    """Return the [distribution, delta] cells for component ``idx``.

    Args:
        words: mapping from component id to the matched word; only
            ``words[idx].lemma`` is read.
        representations: not used by this formatter.
        idx: id of the component whose cells are produced.
        sidx: first element of the dispersion lookup key.

    Returns:
        A list of strings. Components outside ``self.corew`` get
        ``self.length()`` empty cells. Core components get the dispersion
        stored under ``(sidx, idx, lemma)`` followed by the delta value,
        which is left empty when ``idx`` is not in ``self.jppb`` or when
        the value is undefined (a zero denominator).
    """
    # not a core word
    if idx not in self.corew:
        return [""] * self.length()

    lemma = words[idx].lemma
    distribution = self.colocation_ids.dispersions[(sidx, idx, lemma)]

    delta = ""
    if idx in self.jppb:
        first, second = self.jppb[0], self.jppb[1]
        other = first if first != idx else second

        frequencies = self.stats['freq']
        fx = frequencies[idx]
        fy = frequencies[other]
        fxy = self.stats['fcxy']
        total = self.stats['N']

        # Both quotients need a non-zero denominator. A stored frequency
        # can be 0 (the stats setup records 0 for core ids missing from
        # the first match), so leave the cell blank instead of raising
        # ZeroDivisionError.
        if fx != 0 and total != fx:
            delta = fxy / fx - (fy - fxy) / (total - fx)

    return [str(distribution), str(delta)]
def content_right(self, freq):
    """Return the [LogDice_core, LogDice_all, Distinct_forms] cells.

    Args:
        freq: not used; every value is read from ``self.stats``.

    Returns:
        A list of three strings. ``LogDice_core`` is
        ``14 + log2(2 * fxy / (fx + fy))`` over the two ``self.jppb``
        components. ``LogDice_all`` is
        ``14 + log2(len(corew) * fxy / sum(f_i))`` over every component in
        ``self.corew``. The last cell is ``self.stats['df']``. A logDice
        cell is left empty when its ratio is not positive, because the
        logarithm is then undefined.
    """
    def logdice(numerator, denominator):
        # A zero denominator would raise ZeroDivisionError and a zero
        # ratio would make log2 raise ValueError. Frequencies can be 0
        # (the stats setup records 0 for core ids missing from the first
        # match), so report a blank cell rather than abort the output.
        if numerator <= 0 or denominator <= 0:
            return ""
        return str(14 + log2(numerator / denominator))

    frequencies = self.stats['freq']
    fxy = self.stats['fcxy']

    fx = frequencies[self.jppb[0]]
    fy = frequencies[self.jppb[1]]
    logdice_core = logdice(2 * fxy, fx + fy)

    sum_fi = sum(frequencies[idx] for idx in self.corew)
    logdice_all = logdice(len(self.corew) * fxy, sum_fi)

    return [logdice_core, logdice_all, str(self.stats['df'])]
def group(self):
    """Always return True.

    NOTE(review): presumably this tells the caller that the formatter
    works on grouped matches -- confirm against the caller.
    """
    grouped = True
    return grouped