# luscenje_struktur/src/formatter.py

from math import log2
import re

from component import ComponentType


class Formatter:
    """Base class for output formatters.

    A formatter exposes two groups of columns: "repeat" columns
    (``header_repeat`` / ``content_repeat``) and "right" columns
    (``header_right`` / ``content_right``).  Subclasses must override those
    four methods and ``group``; the remaining hooks default to no-ops.
    """

    def __init__(self, colocation_ids, word_renderer):
        """Keep the two collaborators and run the subclass init hook."""
        self.colocation_ids = colocation_ids
        self.word_renderer = word_renderer
        self.additional_init()

    # ---- optional hooks (no-ops unless overridden) ----

    def additional_init(self):
        """Hook invoked at the end of ``__init__``."""
        return None

    def set_structure(self, structure):
        """Hook receiving a structure; ignored by default."""
        return None

    def new_match(self, match):
        """Hook receiving a match; ignored by default."""
        return None

    def length(self):
        """Return how many repeat columns this formatter has."""
        repeat_columns = self.header_repeat()
        return len(repeat_columns)

    # ---- methods every concrete formatter has to supply ----

    def header_repeat(self):
        """Return the header names of the repeat columns."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the header names of the right columns."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the repeat-column values for the word at ``idx``."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq):
        """Return the right-column values."""
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return the formatter's grouping flag."""
        raise NotImplementedError("Group for formatter not implemented")


class OutNoStatFormatter(Formatter):
    """Formatter producing lemma and representative-form columns.

    Each ``content_repeat`` call that finds a representation entry appends
    the chosen form to an internal buffer; ``content_right`` emits the
    buffered forms as one whitespace-normalised string and clears the buffer.
    """

    def __str__(self):
        return "out-no-stat"

    def additional_init(self):
        """Start with an empty joint-representation buffer."""
        self.representation = ""

    def group(self):
        return True

    def header_repeat(self):
        return ["Lemma", "Representative_form", "RF_scenario"]

    def header_right(self):
        return ["Joint_representative_form", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return ``[lemma, representative_form, scenario]`` for ``words[idx]``.

        When ``idx`` has no entry in ``representations`` the last two cells
        are empty and the buffer is left untouched.  A ``None`` entry falls
        back to the lemma and is flagged ``"lemma_fallback"``; any other
        entry is used as-is and flagged ``"ok"``.
        """
        lemma = words[idx].lemma
        if idx not in representations:
            return [lemma, "", ""]

        chosen = representations[idx]
        if chosen is None:
            form, scenario = lemma, "lemma_fallback"
        else:
            form, scenario = chosen, "ok"

        # Forms are buffered space-separated; content_right() tidies them up.
        self.representation += " " + form
        return [lemma, form, scenario]

    def content_right(self, freq):
        """Return ``[joint_form, str(freq)]`` and reset the buffer."""
        row = [re.sub(' +', ' ', self.representation.strip()), str(freq)]
        self.representation = ""
        return row

class AllFormatter(Formatter):
    """Formatter listing the id, text, lemma and msd of each word.

    It has no right-hand columns and reports ``group()`` as ``False``.
    """

    def __str__(self):
        return "all"

    def group(self):
        return False

    def header_repeat(self):
        return ["Token_ID", "Word_form", "Lemma", "Msd"]

    def header_right(self):
        return []

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return ``[id, text, lemma, msd]`` read from ``words[idx]``."""
        token = words[idx]
        return [getattr(token, field) for field in ("id", "text", "lemma", "msd")]

    def content_right(self, _freq):
        return []

class StatsFormatter(Formatter):
    """Formatter emitting per-match association statistics.

    Call order matters: ``set_structure`` caches component indices that
    ``new_match`` reads, and ``new_match`` caches the statistics that
    ``content_right`` reads.
    """

    def additional_init(self):
        # Statistics of the current match; populated by new_match().
        self.stats = None
        # Indices of the two Core2w components; populated by set_structure().
        self.jppb = None
        # Indices of every component whose type is not Other; populated by
        # set_structure().
        self.corew = None

    @staticmethod
    def stat_str(num):
        """Format a statistic: floats with five decimals, others via ``str``."""
        if isinstance(num, float):
            return "{:.5f}".format(num)
        return str(num)

    def set_structure(self, structure):
        """Cache the component indices of ``structure`` used by the statistics.

        Raises:
            AssertionError: if the structure does not contain exactly two
                ``Core2w`` components.  Nothing is cached in that case.
        """
        core2w_ids = []
        core_ids = []

        for component in structure.components:
            if component.type == ComponentType.Core2w:
                core2w_ids.append(component.idx)
            if component.type != ComponentType.Other:
                core_ids.append(component.idx)

        # Raised explicitly instead of via `assert`, which is removed under
        # `python -O`; AssertionError is kept so the exception type callers
        # see does not change.
        if len(core2w_ids) != 2:
            raise AssertionError(
                "expected exactly 2 Core2w components, found {}".format(
                    len(core2w_ids)))

        self.jppb = tuple(core2w_ids)
        self.corew = tuple(core_ids)

    def new_match(self, match):
        """Compute and cache the statistics for ``match``.

        The frequency of each core component is looked up in
        ``word_renderer.num_words`` under ``(lemma, msd[0])`` of the word found
        in ``match.matches[0]``; a component absent from ``match.matches[0]``
        gets frequency 0.

        NOTE(review): a zero frequency for either Core2w component (or a
        frequency equal to the total word count) makes the divisions below
        raise ZeroDivisionError -- confirm callers never hit that case.
        """
        self.stats = {"freq": {}}

        first_match = match.matches[0]
        for cid in self.corew:
            if cid not in first_match:
                word_freq = 0
            else:
                word = first_match[cid]
                word_freq = self.word_renderer.num_words[(word.lemma, word.msd[0])]

            self.stats["freq"][cid] = word_freq

        fx = self.stats["freq"][self.jppb[0]]
        fy = self.stats["freq"][self.jppb[1]]
        freq = len(match)
        N = self.word_renderer.num_all_words()

        # Values reported under the Delta_p12 / Delta_p21 headers.
        self.stats['d12'] = freq / fx - (fy - freq) / (N - fx)
        self.stats['d21'] = freq / fy - (fx - freq) / (N - fy)

        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = freq

    def header_repeat(self):
        return ["Distribution"]

    def header_right(self):
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the dispersion cell for a core word, or blank cells otherwise.

        The value is read from ``colocation_ids.dispersions`` under the key
        ``(sidx, idx, lemma)``.
        """
        # not a core word
        if idx not in self.corew:
            return [""] * self.length()

        word = words[idx]
        key = (sidx, idx, word.lemma)
        distribution = self.colocation_ids.dispersions[key]
        return [self.stat_str(distribution)]

    def content_right(self, freq):
        """Return the formatted right-hand statistics of the current match.

        The ``freq`` argument is not used: the frequency cached by
        ``new_match`` under ``stats['freq_all']`` is used instead.
        """
        fx = self.stats["freq"][self.jppb[0]]
        fy = self.stats["freq"][self.jppb[1]]
        match_freq = self.stats['freq_all']
        logdice_core = 14 + log2(2 * match_freq / (fx + fy))

        # Only core words with a non-zero frequency enter LogDice_all.
        nonzero = [f for f in (self.stats["freq"][idx] for idx in self.corew) if f > 0]
        logdice_all = 14 + log2(len(nonzero) * match_freq / sum(nonzero))

        return [self.stat_str(x) for x in (
            self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df']
        )]

    def group(self):
        return True

    def __str__(self):
        return "stat"

class OutFormatter(Formatter):
    """Composite formatter: OutNoStatFormatter columns, then StatsFormatter columns.

    Structure and match notifications are forwarded to the statistics
    formatter only.
    """

    def additional_init(self):
        """Create the two wrapped formatters with the same collaborators."""
        shared_args = (self.colocation_ids, self.word_renderer)
        self.f1 = OutNoStatFormatter(*shared_args)
        self.f2 = StatsFormatter(*shared_args)

    def __str__(self):
        return "out"

    def set_structure(self, structure):
        self.f2.set_structure(structure)

    def new_match(self, match):
        self.f2.new_match(match)

    def group(self):
        # Same short-circuit semantics as `f1.group() and f2.group()`.
        first = self.f1.group()
        return self.f2.group() if first else first

    def header_repeat(self):
        left = self.f1.header_repeat()
        right = self.f2.header_repeat()
        return left + right

    def header_right(self):
        left = self.f1.header_right()
        right = self.f2.header_right()
        return left + right

    def content_repeat(self, words, representations, idx, sidx):
        """Concatenate the repeat cells of both formatters (f1 first)."""
        call_args = (words, representations, idx, sidx)
        plain_cells = self.f1.content_repeat(*call_args)
        stat_cells = self.f2.content_repeat(*call_args)
        return plain_cells + stat_cells

    def content_right(self, freq):
        """Concatenate the right cells of both formatters (f1 first)."""
        plain_cells = self.f1.content_right(freq)
        stat_cells = self.f2.content_right(freq)
        return plain_cells + stat_cells