# luscenje_struktur/src/formatter.py

from math import log2
import re

from component import ComponentType


class Formatter:
    """Common base for the output formatters defined in this module.

    Concrete formatters must override the header, content and group methods;
    the remaining methods are optional hooks that do nothing by default.
    """

    def __init__(self, colocation_ids, word_renderer):
        """Keep both collaborators on the instance, then run the init hook.

        ``additional_init`` is called last, so an override can already rely on
        ``self.colocation_ids`` and ``self.word_renderer`` being set.
        """
        self.colocation_ids = colocation_ids
        self.word_renderer = word_renderer
        self.additional_init()

    # ---- methods a concrete formatter has to provide ----

    def header_repeat(self):
        """Return the list of column names for the repeated part of a row."""
        raise NotImplementedError("Header repeat formatter not implemented")

    def header_right(self):
        """Return the list of column names for the right-hand part of a row."""
        raise NotImplementedError("Header right formatter not implemented")

    def content_repeat(self, words, representations, idx, sidx):
        """Return the cells of the repeated part for the word at ``idx``."""
        raise NotImplementedError("Content repeat formatter not implemented")

    def content_right(self, freq):
        """Return the cells of the right-hand part of a row."""
        raise NotImplementedError("Content right formatter not implemented")

    def group(self):
        """Return the grouping flag of this formatter."""
        raise NotImplementedError("Group for formatter not implemented")

    # ---- optional hooks and helpers ----

    def additional_init(self):
        """Hook invoked at the end of ``__init__``; no-op by default."""
        return None

    def length(self):
        """Return how many columns ``header_repeat`` declares."""
        columns = self.header_repeat()
        return len(columns)

    def set_structure(self, structure):
        """Hook receiving a structure; no-op by default."""
        return None

    def new_match(self, match):
        """Hook receiving a match; no-op by default."""
        return None


class OutNoStatFormatter(Formatter):
    """Formatter emitting lemma and representative-form columns, no statistics.

    While the per-word cells are produced by ``content_repeat``, the chosen
    forms are collected in ``self.representation``. ``content_right`` then
    emits the joined form together with the frequency and clears the
    accumulator for the next row.
    """

    def additional_init(self):
        """Start with an empty joint-representation accumulator."""
        self.representation = ""

    def header_repeat(self):
        """Return the four per-word column names."""
        return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]

    def header_right(self):
        """Return the two right-hand column names."""
        return ["Joint_representative_form", "Frequency"]

    def content_repeat(self, words, representations, idx, _sidx):
        """Return the four per-word cells for the word stored under ``idx``.

        Args:
            words: mapping/sequence indexed by ``idx`` whose items expose
                ``lemma`` and ``msd``.
            representations: container checked with ``in`` and indexed by
                ``idx``; a ``None`` entry triggers the lemma fallback.
            idx: key of the word to render.
            _sidx: unused by this formatter.

        Returns:
            A list with exactly one cell per ``header_repeat`` column.
        """
        word = words[idx]
        if idx not in representations:
            # Emit one cell per header_repeat() column; a shorter row would
            # shift every following cell one column to the left.
            return [word.lemma, "", "", ""]

        rep = representations[idx]
        if rep is None:
            # No representative form available: fall back to the lemma.
            self.representation += " " + word.lemma
            return [word.lemma, word.lemma, "", "lemma_fallback"]

        self.representation += " " + rep
        return [word.lemma, rep, word.msd, "ok"]

    def content_right(self, freq):
        """Return ``[joint form, str(freq)]`` and reset the accumulator."""
        # Collapse runs of spaces left behind by empty pieces.
        rep = re.sub(' +', ' ', self.representation.strip())
        result = [rep, str(freq)]
        self.representation = ""
        return result

    def group(self):
        """This formatter reports ``True`` for grouping."""
        return True

    def __str__(self):
        return "out-no-stat"

class AllFormatter(Formatter):
    """Formatter that writes the raw token fields of each word, ungrouped."""

    # Column names of the per-word part; copied on every header_repeat() call
    # so callers always receive a fresh list.
    _COLUMNS = ("Token_ID", "Word_form", "Lemma", "Msd")

    def header_repeat(self):
        """Return the per-word column names."""
        return list(self._COLUMNS)

    def header_right(self):
        """This formatter has no right-hand columns."""
        return list()

    def content_repeat(self, words, _representations, idx, _sidx):
        """Return ``[id, text, lemma, msd]`` of the word stored under ``idx``."""
        token = words[idx]
        cells = [token.id, token.text, token.lemma, token.msd]
        return cells

    def content_right(self, _freq):
        """This formatter has no right-hand cells."""
        return list()

    def group(self):
        """This formatter reports ``False`` for grouping."""
        return False

    def __str__(self):
        return "all"

class StatsFormatter(Formatter):
    """Formatter producing the statistics columns (delta-p, logDice, ...).

    State is filled in two steps: ``set_structure`` records which component
    indices are relevant, ``new_match`` computes the per-match numbers that
    ``content_right`` later renders.
    """

    def additional_init(self):
        """Initialise all state to ``None`` until a structure/match arrives."""
        self.stats = self.jppb = self.corew = None

    @staticmethod
    def stat_str(num):
        """Render floats with five decimals, anything else via ``str``."""
        if isinstance(num, float):
            return format(num, ".5f")
        return str(num)

    def set_structure(self, structure):
        """Record the component indices of ``structure`` used for statistics.

        ``self.jppb`` receives the indices of the ``Core2w`` components (there
        must be exactly two); ``self.corew`` receives the indices of every
        component whose type is not ``Other``.
        """
        pair_ids, core_ids = [], []

        for comp in structure.components:
            kind = comp.type
            if kind == ComponentType.Core2w:
                pair_ids.append(comp.idx)
            if kind != ComponentType.Other:
                core_ids.append(comp.idx)

        assert len(pair_ids) == 2
        self.jppb = tuple(pair_ids)
        self.corew = tuple(core_ids)

    def new_match(self, match):
        """Compute and store the statistics for ``match`` in ``self.stats``."""
        counts = {}
        self.stats = {"freq": counts}

        for cid in self.corew:
            if cid in match.matches[0]:
                word = match.matches[0][cid]
                counts[cid] = self.word_renderer.num_words(word.lemma, word.msd[0])
            else:
                # Component absent from the first match: count it as zero.
                counts[cid] = 0

        fx = counts[self.jppb[0]]
        fy = counts[self.jppb[1]]
        joint = len(match)
        total = self.word_renderer.num_all_words()

        self.stats['d12'] = joint / fx - (fy - joint) / (total - fx)
        self.stats['d21'] = joint / fy - (fx - joint) / (total - fy)

        self.stats['df'] = match.distinct_forms()
        self.stats['freq_all'] = joint

    def header_repeat(self):
        """Return the single per-word column name."""
        return ["Distribution"]

    def header_right(self):
        """Return the names of the five right-hand statistics columns."""
        return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]

    def content_repeat(self, words, representations, idx, sidx):
        """Return the dispersion cell for a core word, blanks otherwise."""
        if idx in self.corew:
            lemma = words[idx].lemma
            dispersion = self.colocation_ids.dispersions[(sidx, idx, lemma)]
            return [self.stat_str(dispersion)]

        # not a core word
        return [""] * self.length()

    def content_right(self, freq):
        """Return the formatted statistics cells for the current match.

        The ``freq`` argument is not used; the frequency stored by
        ``new_match`` under ``'freq_all'`` is used instead.
        """
        counts = self.stats["freq"]
        fx = counts[self.jppb[0]]
        fy = counts[self.jppb[1]]
        joint = self.stats['freq_all']
        logdice_core = 14 + log2(2 * joint / (fx + fy))

        # Only components with a positive count take part in LogDice_all.
        core_counts = [counts[cid] for cid in self.corew]
        present = [count for count in core_counts if count > 0]
        logdice_all = 14 + log2(len(present) * joint / sum(present))

        values = (
            self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df']
        )
        return [self.stat_str(value) for value in values]

    def group(self):
        """This formatter reports ``True`` for grouping."""
        return True

    def __str__(self):
        return "stat"

class OutFormatter(Formatter):
    """Composite formatter: ``OutNoStatFormatter`` columns followed by
    ``StatsFormatter`` columns.
    """

    def additional_init(self):
        """Create the two wrapped formatters with the same collaborators."""
        ids, renderer = self.colocation_ids, self.word_renderer
        self.f1 = OutNoStatFormatter(ids, renderer)
        self.f2 = StatsFormatter(ids, renderer)

    def header_repeat(self):
        """Return the per-word column names of both formatters, f1 first."""
        left = self.f1.header_repeat()
        right = self.f2.header_repeat()
        return left + right

    def header_right(self):
        """Return the right-hand column names of both formatters, f1 first."""
        left = self.f1.header_right()
        right = self.f2.header_right()
        return left + right

    def content_repeat(self, words, representations, idx, sidx):
        """Return the per-word cells of both formatters, f1 first."""
        args = (words, representations, idx, sidx)
        return self.f1.content_repeat(*args) + self.f2.content_repeat(*args)

    def content_right(self, freq):
        """Return the right-hand cells of both formatters, f1 first."""
        left = self.f1.content_right(freq)
        right = self.f2.content_right(freq)
        return left + right

    def group(self):
        """Return f1's grouping flag if it is falsy, otherwise f2's."""
        first = self.f1.group()
        if not first:
            return first
        return self.f2.group()

    def set_structure(self, structure):
        """Forward the structure to the statistics formatter only."""
        self.f2.set_structure(structure)

    def new_match(self, match):
        """Forward the match to the statistics formatter only."""
        self.f2.new_match(match)

    def __str__(self):
        return "out"
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`from math import log2`
			`import re`

			`from component import ComponentType`


			`class Formatter:`
			`def __init__(self, colocation_ids, word_renderer):`
			`self.colocation_ids = colocation_ids`
			`self.word_renderer = word_renderer`
			`self.additional_init()`

			`def header_repeat(self):`
			`raise NotImplementedError("Header repeat formatter not implemented")`
			`def header_right(self):`
			`raise NotImplementedError("Header right formatter not implemented")`
			`def content_repeat(self, words, representations, idx, sidx):`
			`raise NotImplementedError("Content repeat formatter not implemented")`
			`def content_right(self, freq):`
			`raise NotImplementedError("Content right formatter not implemented")`
			`def group(self):`
			`raise NotImplementedError("Group for formatter not implemented")`

			`def additional_init(self):`
			`pass`
			`def length(self):`
			`return len(self.header_repeat())`

			`def set_structure(self, structure):`
			`pass`
			`def new_match(self, match):`
			`pass`


			`class OutNoStatFormatter(Formatter):`
			`def additional_init(self):`
			`self.representation = ""`

			`def header_repeat(self):`
Adding msd to out formatter 2019-07-01 15:18:25 +00:00			`return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def header_right(self):`
			`return ["Joint_representative_form", "Frequency"]`

			`def content_repeat(self, words, representations, idx, _sidx):`
			`word = words[idx]`
			`if idx not in representations:`
			`return [word.lemma, "", ""]`

			`rep = representations[idx]`
			`if rep is None:`
			`self.representation += " " + word.lemma`
Adding msd to out formatter 2019-07-01 15:18:25 +00:00			`return [word.lemma, word.lemma, "", "lemma_fallback"]`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`else:`
			`self.representation += " " + rep`
Adding msd to out formatter 2019-07-01 15:18:25 +00:00			`return [word.lemma, rep, word.msd, "ok"]`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`def content_right(self, freq):`
			`rep = re.sub(' +', ' ', self.representation.strip())`
			`result = [rep, str(freq)]`
			`self.representation = ""`
			`return result`

			`def group(self):`
			`return True`
New progress bar 2019-06-17 15:30:51 +00:00
			`def __str__(self):`
			`return "out-no-stat"`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`class AllFormatter(Formatter):`
			`def header_repeat(self):`
			`return ["Token_ID", "Word_form", "Lemma", "Msd"]`

			`def header_right(self):`
			`return []`

			`def content_repeat(self, words, _representations, idx, _sidx):`
			`word = words[idx]`
			`return [word.id, word.text, word.lemma, word.msd]`

			`def content_right(self, _freq):`
			`return []`

			`def group(self):`
			`return False`

New progress bar 2019-06-17 15:30:51 +00:00			`def __str__(self):`
			`return "all"`

HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00			`class StatsFormatter(Formatter):`
			`def additional_init(self):`
			`self.stats = None`
			`self.jppb = None`
			`self.corew = None`

			`@staticmethod`
			`def stat_str(num):`
			`return "{:.5f}".format(num) if isinstance(num, float) else str(num)`

			`def set_structure(self, structure):`
			`jppb = []`
			`corew = []`

			`for component in structure.components:`
			`if component.type == ComponentType.Core2w:`
			`jppb.append(component.idx)`
			`if component.type != ComponentType.Other:`
			`corew.append(component.idx)`

			`assert(len(jppb) == 2)`
			`self.jppb = tuple(jppb)`
			`self.corew = tuple(corew)`

			`def new_match(self, match):`
			`self.stats = {"freq": {}}`

			`for cid in self.corew:`
			`if cid not in match.matches[0]:`
			`freq = 0`
			`else:`
			`word = match.matches[0][cid]`
word stats on sqlite now, not yet really working. 2019-06-26 22:37:47 +00:00			`freq = self.word_renderer.num_words(word.lemma, word.msd[0])`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`self.stats["freq"][cid] = freq`

			`fx = self.stats["freq"][self.jppb[0]]`
			`fy = self.stats["freq"][self.jppb[1]]`
			`freq = len(match)`
			`N = self.word_renderer.num_all_words()`

			`self.stats['d12'] = freq / fx - (fy - freq) / (N - fx)`
			`self.stats['d21'] = freq / fy - (fx - freq) / (N - fy)`

			`self.stats['df'] = match.distinct_forms()`
			`self.stats['freq_all'] = freq`

			`def header_repeat(self):`
			`return ["Distribution"]`

			`def header_right(self):`
			`return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"]`

			`def content_repeat(self, words, representations, idx, sidx):`
			`# not a core word`
			`if idx not in self.corew:`
			`return [""] * self.length()`

			`word = words[idx]`
			`key = (sidx, idx, word.lemma)`
			`distribution = self.colocation_ids.dispersions[key]`
			`return [self.stat_str(distribution)]`

			`def content_right(self, freq):`
			`fx = self.stats["freq"][self.jppb[0]]`
			`fy = self.stats["freq"][self.jppb[1]]`
			`freq = self.stats['freq_all']`
			`logdice_core = 14 + log2(2 * freq / (fx + fy))`

			`fi = [self.stats["freq"][idx] for idx in self.corew]`
			`fi = [f for f in fi if f > 0]`
			`logdice_all = 14 + log2(len(fi) * freq / sum(fi))`

			`return [self.stat_str(x) for x in (`
			`self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df']`
			`)]`

			`def group(self):`
			`return True`
New progress bar 2019-06-17 15:30:51 +00:00
			`def __str__(self):`
			`return "stat"`
HUGE refactor, creating lots of modules, no code changes though! 2019-06-15 16:55:35 +00:00
			`class OutFormatter(Formatter):`
			`def additional_init(self):`
			`self.f1 = OutNoStatFormatter(self.colocation_ids, self.word_renderer)`
			`self.f2 = StatsFormatter(self.colocation_ids, self.word_renderer)`

			`def header_repeat(self):`
			`return self.f1.header_repeat() + self.f2.header_repeat()`

			`def header_right(self):`
			`return self.f1.header_right() + self.f2.header_right()`

			`def content_repeat(self, words, representations, idx, sidx):`
			`cr1 = self.f1.content_repeat(words, representations, idx, sidx)`
			`cr2 = self.f2.content_repeat(words, representations, idx, sidx)`
			`return cr1 + cr2`

			`def content_right(self, freq):`
			`return self.f1.content_right(freq) + self.f2.content_right(freq)`

			`def group(self):`
			`return self.f1.group() and self.f2.group()`

			`def set_structure(self, structure):`
			`self.f2.set_structure(structure)`

			`def new_match(self, match):`
New progress bar 2019-06-17 15:30:51 +00:00			`self.f2.new_match(match)`

			`def __str__(self):`
			`return "out"`