Implement the remaining statistics in StatsFormatter (Delta, LogDice_core, LogDice_all, Distinct_forms). Not yet verified.

pull/1/head
Ozbolt Menegatti 5 years ago
parent d7f97ba9b3
commit 9ccbd02603

@ -10,6 +10,7 @@ import time
import subprocess
import concurrent.futures
import tempfile
from math import log2
from msd_translate import MSD_TRANSLATE
@ -401,7 +402,7 @@ class ComponentRendition:
for cid, reps in representations.items():
for rep in reps:
rep.render()
for cid, reps in representations.items():
reps = [rep.rendition_text for rep in reps]
if reps == []:
@ -1126,19 +1127,19 @@ class StatsFormatter(Formatter):
jppb_forms = set()
self.stats = {"freq": {}}
for words in match.matches:
cw1 = words[self.jppb[0]]
cw2 = words[self.jppb[1]]
jppb_forms.add((cw1.text, cw2.text))
for cid in self.corew:
if cid not in match.matches[0]:
freq = 0
else:
word = match.matches[0][cid]
freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]
for cid, word in match.matches[0].items():
if cid in self.corew:
self.stats["freq"][cid] = self.word_renderer.num_words[(
word.text, word.msd[0])]
self.stats["freq"][cid] = freq
self.stats['fc'] = match.distinct_forms()
self.stats['fc'] = len(jppb_forms)
self.stats['n'] = self.word_renderer.num_all_words()
self.stats['df'] = match.distinct_forms()
self.stats['fcxy'] = len(match)
self.stats['n'] = len(jppb_forms)
self.stats['N'] = self.word_renderer.num_all_words()
def header_repeat(self):
    """Return the two column titles that pair with the values from ``content_repeat``."""
    titles = ["Distribution", "Delta"]
    return titles
@ -1147,18 +1148,36 @@ class StatsFormatter(Formatter):
return ["LogDice_core", "LogDice_all", "Distinct_forms"]
def content_repeat(self, words, representations, idx, sidx):
    """Return the ``[distribution, delta]`` cells for the word at ``idx``.

    Args:
        words: mapping from component id to the matched word; only
            ``words[idx].lemma`` is read here.
        representations: unused by this formatter.
        idx: component id of the word being rendered.
        sidx: structure id; first element of the dispersion lookup key.

    Returns:
        A list of strings. For a non-core word it is ``self.length()``
        empty strings. Otherwise it is the stringified dispersion followed
        by the delta value; delta is ``""`` when ``idx`` is not one of the
        two ``self.jppb`` components or when the value is undefined.
    """
    # Not a core word: emit blank cells only.
    if idx not in self.corew:
        return [""] * self.length()

    word = words[idx]
    distribution = self.colocation_ids.dispersions[(sidx, idx, word.lemma)]

    delta = ""
    if idx in self.jppb:
        # The partner is whichever of the two jppb components is not idx.
        other = self.jppb[0] if self.jppb[0] != idx else self.jppb[1]
        fx = self.stats['freq'][idx]
        fy = self.stats['freq'][other]
        fxy = self.stats['fcxy']
        total = self.stats['N']

        # fxy/fx - (fy - fxy)/(N - fx); this has the form of the Delta P
        # association measure, P(y|x) - P(y|not x).
        # A frequency of 0 is stored for core words missing from the match,
        # so guard both denominators and leave the cell blank rather than
        # raising ZeroDivisionError.
        if fx != 0 and total != fx:
            delta = fxy / fx - (fy - fxy) / (total - fx)

    # Always stringify so every returned cell has the same type.
    return [str(distribution), str(delta)]
def content_right(self, freq):
    """Return the ``[LogDice_core, LogDice_all, Distinct_forms]`` cells.

    Args:
        freq: unused; every input is read from ``self.stats``.

    Returns:
        Three strings:
        * ``14 + log2(2 * fxy / (fx + fy))`` over the two ``self.jppb``
          components,
        * ``14 + log2(len(corew) * fxy / sum of core-word frequencies)``,
        * ``self.stats['df']``.
        A logDice cell is ``""`` when its ratio is not positive, because
        the logarithm is undefined there.
    """
    def log_dice(numerator, denominator):
        # log2 raises ValueError on 0 and the division raises on an empty
        # denominator; emit a blank cell instead of crashing the export.
        if numerator <= 0 or denominator <= 0:
            return ""
        return str(14 + log2(numerator / denominator))

    frequencies = self.stats['freq']
    fx = frequencies[self.jppb[0]]
    fy = frequencies[self.jppb[1]]
    fxy = self.stats['fcxy']
    logdice_core = log_dice(2 * fxy, fx + fy)

    # Same score generalised from the jppb pair to every core word.
    sum_fi = sum(frequencies[idx] for idx in self.corew)
    logdice_all = log_dice(len(self.corew) * fxy, sum_fi)

    return [logdice_core, logdice_all, str(self.stats['df'])]
def group(self):
    """Always return ``True``.

    NOTE(review): presumably signals to the caller that this formatter's
    output is grouped; confirm against the ``Formatter`` base class.
    """
    return True

Loading…
Cancel
Save