|
|
|
@ -10,6 +10,7 @@ import time
|
|
|
|
|
import subprocess
|
|
|
|
|
import concurrent.futures
|
|
|
|
|
import tempfile
|
|
|
|
|
from math import log2
|
|
|
|
|
|
|
|
|
|
from msd_translate import MSD_TRANSLATE
|
|
|
|
|
|
|
|
|
@ -401,7 +402,7 @@ class ComponentRendition:
|
|
|
|
|
for cid, reps in representations.items():
|
|
|
|
|
for rep in reps:
|
|
|
|
|
rep.render()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for cid, reps in representations.items():
|
|
|
|
|
reps = [rep.rendition_text for rep in reps]
|
|
|
|
|
if reps == []:
|
|
|
|
@ -1126,19 +1127,19 @@ class StatsFormatter(Formatter):
|
|
|
|
|
jppb_forms = set()
|
|
|
|
|
self.stats = {"freq": {}}
|
|
|
|
|
|
|
|
|
|
for words in match.matches:
|
|
|
|
|
cw1 = words[self.jppb[0]]
|
|
|
|
|
cw2 = words[self.jppb[1]]
|
|
|
|
|
jppb_forms.add((cw1.text, cw2.text))
|
|
|
|
|
for cid in self.corew:
|
|
|
|
|
if cid not in match.matches[0]:
|
|
|
|
|
freq = 0
|
|
|
|
|
else:
|
|
|
|
|
word = match.matches[0][cid]
|
|
|
|
|
freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]
|
|
|
|
|
|
|
|
|
|
for cid, word in match.matches[0].items():
|
|
|
|
|
if cid in self.corew:
|
|
|
|
|
self.stats["freq"][cid] = self.word_renderer.num_words[(
|
|
|
|
|
word.text, word.msd[0])]
|
|
|
|
|
self.stats["freq"][cid] = freq
|
|
|
|
|
|
|
|
|
|
self.stats['fc'] = match.distinct_forms()
|
|
|
|
|
self.stats['fc'] = len(jppb_forms)
|
|
|
|
|
self.stats['n'] = self.word_renderer.num_all_words()
|
|
|
|
|
self.stats['df'] = match.distinct_forms()
|
|
|
|
|
self.stats['fcxy'] = len(match)
|
|
|
|
|
self.stats['n'] = len(jppb_forms)
|
|
|
|
|
self.stats['N'] = self.word_renderer.num_all_words()
|
|
|
|
|
|
|
|
|
|
def header_repeat(self):
    """Return the two column titles that pair with content_repeat().

    content_repeat() yields a distribution cell followed by a delta cell,
    so the titles are listed in that same order.
    """
    return ["Distribution", "Delta"]
|
|
|
|
@ -1147,18 +1148,36 @@ class StatsFormatter(Formatter):
|
|
|
|
|
return ["LogDice_core", "LogDice_all", "Distinct_forms"]
|
|
|
|
|
|
|
|
|
|
def content_repeat(self, words, representations, idx, sidx):
    """Return the [distribution, delta] cells for the component ``idx``.

    Args:
        words: mapping from component id to a word object exposing
            ``lemma``.
        representations: unused here; kept for the formatter interface.
        idx: component id whose cells are being produced.
        sidx: structure index, used as part of the dispersion lookup key.

    Returns:
        A list of strings. For a component that is not in ``self.corew``
        this is ``self.length()`` empty strings. Otherwise it is the
        stringified dispersion for ``(sidx, idx, lemma)`` followed by the
        stringified delta; delta is ``""`` unless ``idx`` is one of the
        two ids in ``self.jppb``.

    Raises:
        KeyError: if the dispersion key or a required stats entry is
            missing.
        ZeroDivisionError: if ``fx`` is 0 or ``N == fx``.
    """
    # not a core word
    if idx not in self.corew:
        return [""] * self.length()

    word = words[idx]
    key = (sidx, idx, word.lemma)
    distribution = self.colocation_ids.dispersions[key]

    # Delta is only defined for the two components named in self.jppb;
    # every other core word gets an empty cell.
    delta = ""
    if idx in self.jppb:
        # The partner is whichever of the two jppb ids is not idx.
        idx2 = self.jppb[0] if self.jppb[0] != idx else self.jppb[1]
        fx = self.stats['freq'][idx]
        fy = self.stats['freq'][idx2]
        fxy = self.stats['fcxy']
        N = self.stats['N']
        # NOTE(review): this has the shape of the Delta P association
        # measure, P(y|x) - P(y|not x) -- confirm against the spec.
        delta = fxy / fx - (fy - fxy) / (N - fx)

    # Both cells are strings so the row can be written out uniformly.
    return [str(distribution), str(delta)]
|
|
|
|
|
|
|
|
|
|
def content_right(self, freq):
    """Return the [LogDice_core, LogDice_all, Distinct_forms] cells.

    Args:
        freq: unused here; kept for the formatter interface.

    Returns:
        A list of three strings: the logDice score over the two
        components named in ``self.jppb``, the logDice score generalised
        over every component in ``self.corew``, and ``self.stats['df']``.

    Raises:
        KeyError: if a required stats entry is missing.
        ZeroDivisionError: if the summed frequencies are 0.
        ValueError: if the co-occurrence count is 0 (log2 of zero).
    """
    fx = self.stats['freq'][self.jppb[0]]
    fy = self.stats['freq'][self.jppb[1]]
    fxy = self.stats['fcxy']
    # Pairwise logDice: 14 + log2(2 * f(x, y) / (f(x) + f(y))).
    logdice_core = 14 + log2(2 * fxy / (fx + fy))

    # Same formula extended to all core words: the factor 2 becomes the
    # number of core components and the denominator sums all of their
    # frequencies.
    sum_fi = sum(self.stats['freq'][idx] for idx in self.corew)
    fc = fxy
    logdice_all = 14 + log2(len(self.corew) * fc / sum_fi)

    return [str(logdice_core), str(logdice_all), str(self.stats['df'])]
|
|
|
|
|
|
|
|
|
|
def group(self):
    """Return True unconditionally.

    NOTE(review): presumably this tells the caller that the formatter
    works on grouped matches -- confirm against the Formatter base class.
    """
    return True
|
|
|
|
|