luscenje_struktur/scripts/recalculate_statistics.py



import argparse
import csv
import logging
import os
import sys
import time
from math import log2

# POS-tag prefixes: load_word_order() treats a structure component as "core"
# when its POS tag starts with one of these prefixes.
CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
# Core prefixes plus the extra prefixes used for the "all" position list.
ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
# Header names of the per-component lemma columns looked up in the input
# header row (component 1 through 5).
LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']


def load_word_order(word_order_file):
    """Map every structure id in the word-order file to its restrictions.

    Each line is split on ``|``. Field 6 is used as the structure id and
    field 2 as a ``-``-separated list of POS tags. Lines whose id is empty,
    equal to ``'NSSS'`` or already stored are skipped, so the first
    occurrence of an id wins.

    Args:
        word_order_file: Path of the ``|``-separated word-order file.

    Returns:
        Dict mapping structure id to ``[core_rest, all_rest]``. Both are
        sorted lists of 1-based component positions (as strings) whose POS
        tag starts with one of ``CORE_RESTRICTIONS`` / ``ALL_RESTRICTIONS``.

    Raises:
        AssertionError: If a structure does not have exactly two core
            positions.
    """
    word_order = {}
    with open(word_order_file, 'r') as f:
        for line in f:
            fields = line.split('|')
            structure_id = fields[6]
            # Test membership against the stored ids themselves. Comparing
            # against the first character of every stored id would wrongly
            # skip id "1" once "10" is stored and would never detect
            # duplicates of multi-character ids.
            if structure_id in ('', 'NSSS') or structure_id in word_order:
                continue

            pos_tags = fields[2].split('-')
            core_rest = sorted(
                str(pt_i + 1)
                for cr in CORE_RESTRICTIONS
                for pt_i, pt in enumerate(pos_tags)
                if pt[:len(cr)] == cr
            )
            assert len(core_rest) == 2, 'Core restrictions are incorrect!'
            all_rest = sorted(
                str(pt_i + 1)
                for cr in ALL_RESTRICTIONS
                for pt_i, pt in enumerate(pos_tags)
                if pt[:len(cr)] == cr
            )
            word_order[structure_id] = [core_rest, all_rest]
    return word_order


def add_word(stats, pos, word, freq):
    """Accumulate ``freq`` for ``word`` in the counter of position ``pos``.

    Empty words are ignored. ``freq`` is converted with ``int`` before it is
    added to ``stats['words'][pos][word]``.
    """
    if word == '':
        return

    counter = stats['words'][pos]
    counter[word] = counter.get(word, 0) + int(freq)


def get_new_stats(f):
    """Read a comma-separated file and collect per-position lemma counts.

    The first line is the header; it supplies the indices of the
    ``Frequency`` column and of the five lemma columns. For every following
    row the row frequency is added to the counter of each non-empty lemma
    and to the running total.

    Args:
        f: Open text file object.

    Returns:
        Tuple ``(lines, stats)``. ``lines`` holds every line split on ``,``
        (header included, line terminator still attached to the last
        field). ``stats['words']`` maps ``'1'``..``'5'`` to
        ``{lemma: summed frequency}`` and ``'total'`` to the sum of all row
        frequencies.
    """
    slots = [str(number) for number in range(1, 6)]
    stats = {'words': {slot: {} for slot in slots}}
    stats['words']['total'] = 0

    rows = [raw.split(',') for raw in f.readlines()]
    if not rows:
        return rows, stats

    header = rows[0]
    freq_col = header.index('Frequency')
    lemma_cols = {
        str(number + 1): header.index(column)
        for number, column in enumerate(LEMMA_COLUMNS)
    }

    for row in rows[1:]:
        row_freq = row[freq_col]
        for slot in slots:
            add_word(stats, slot, row[lemma_cols[slot]], row_freq)
        stats['words']['total'] += int(row_freq)

    return rows, stats


def logDice_new(stats, positions, line, rest):
    """Return ``14 + log2(2 * f_xy / sum(f_i))`` for one row.

    ``f_xy`` is the row frequency (``line[positions['freq']]``) and each
    ``f_i`` is the structure-level frequency of the lemma found at a
    position listed in ``rest``.
    """
    word_counts = stats['words']
    marginal_sum = 0
    for slot in rest:
        marginal_sum += int(word_counts[slot][line[positions[slot]]])
    joint = int(line[positions['freq']])
    return 14 + log2(2 * joint / marginal_sum)


def deltaP_new(stats, positions, line, rest, delta21=True):
    """Return the structure-level delta P of one row.

    Computes ``freq / fx - (fy - freq) / (N - fx)``, where ``freq`` is the
    row frequency, ``N`` is ``stats['words']['total']`` and ``fx`` / ``fy``
    are the structure-level frequencies of the lemmas at the two positions
    in ``rest``.

    Args:
        stats: Statistics dict as produced by ``get_new_stats``.
        positions: Column indices keyed by ``'freq'`` and position name.
        line: The row (list of strings) the indices refer to.
        rest: The two positions to use; ``rest[0]`` supplies ``fx`` when
            ``delta21`` is true, otherwise ``rest[1]`` does.
        delta21: Selects which of the two positions acts as ``x``.

    Returns:
        The delta P value as a float.
    """
    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
    fx, fy = (fi[0], fi[1]) if delta21 else (fi[1], fi[0])
    freq = int(line[positions['freq']])
    N = int(stats['words']['total'])

    rows_without_x = N - fx
    if rows_without_x == 0:
        # Every row of the structure contains x (always the case for a file
        # with a single data row), so P(y | not x) is undefined; treat it
        # as 0 instead of failing with ZeroDivisionError.
        return freq / fx
    return (freq / fx) - ((fy - freq) / rows_without_x)


def write_new_stats(wf, original_text, stats, file_name, word_order):
    """Write the rows of one file extended with structure-level statistics.

    The header row and every data row get the same additional columns:

    * ``C<i>_lemma_structure_frequency`` inserted at index ``6 + 6 * i`` for
      ``i`` in 0..4 (data rows read the lemma from index ``1 + 5 * i`` of
      the not-yet-extended row);
    * ``structure_frequency``, ``logDice_core``, ``logDice_all``,
      ``weighted_logDice_frequency``, ``deltaP12_structure``,
      ``deltaP21_structure`` and ``deltaP_structure`` appended at the end;
    * ``weighted_logDice_frequency_corpus`` inserted right after the
      original ``LogDice_all`` column and ``deltaP`` inserted at index
      ``positions['delta21'] + 1``.

    The original ``LogDice_core`` / ``LogDice_all`` header cells are renamed
    to ``logDice_core_corpus`` / ``logDice_all_corpus``.

    Args:
        wf: Writable text file object.
        original_text: Rows as lists of strings, header first. The last
            field of each row may still carry its line terminator; it is
            stripped in place.
        stats: Statistics dict as produced by ``get_new_stats``.
        file_name: Input file name; the part after the last ``.`` is used
            as the structure id.
        word_order: Dict mapping structure id to ``[core_rest, all_rest]``.

    Raises:
        KeyError: If the structure id is not present in ``word_order``.
    """
    structure_id = file_name.split('.')[-1]
    core_rest, all_rest = word_order[structure_id]

    first_line = True
    positions = {}
    for line in original_text:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally truncates the final value when the input file
        # does not end with a newline.
        line[-1] = line[-1].rstrip('\n')

        # handle header row
        if first_line:
            line += ['structure_frequency', 'logDice_core', 'logDice_all',
                     'weighted_logDice_frequency', 'deltaP12_structure',
                     'deltaP21_structure', 'deltaP_structure']

            for i in range(5):
                new_pos = 6 + i + i * 5
                line = line[:new_pos] + ['C' + str(i + 1) + '_lemma_structure_frequency'] + line[new_pos:]

            # Column indices are taken from the header AFTER the lemma
            # frequency columns were inserted, so they are valid for the
            # extended data rows built below.
            positions['freq'] = line.index('Frequency')
            for lci, lc in enumerate(LEMMA_COLUMNS):
                positions[str(lci + 1)] = line.index(lc)
            positions['delta12'] = line.index('Delta_p12')
            positions['delta21'] = line.index('Delta_p21')
            positions['logDice_core'] = line.index('LogDice_core')
            positions['logDice_all'] = line.index('LogDice_all')
            line[positions['logDice_core']] = 'logDice_core_corpus'
            line[positions['logDice_all']] = 'logDice_all_corpus'
            first_line = False

            after_log_dice = positions['logDice_all'] + 1
            line = line[:after_log_dice] + ['weighted_logDice_frequency_corpus'] + line[after_log_dice:]
            after_delta21 = positions['delta21'] + 1
            line = line[:after_delta21] + ['deltaP'] + line[after_delta21:]
            wf.write(','.join(line) + '\n')
            continue

        # Structure-level frequency of each component lemma ('0' when the
        # component is empty), read from the not-yet-extended row.
        lemma_struct_freq = []
        for i in range(5):
            new_pos = 1 + i * 5
            freq = str(stats['words'][str(i + 1)][line[new_pos]]) if line[new_pos] != '' else '0'
            lemma_struct_freq.append(freq)

        for i in range(5):
            new_pos = 6 + i + i * 5
            line = line[:new_pos] + [lemma_struct_freq[i]] + line[new_pos:]

        row_freq = int(line[positions['freq']])

        # add structure_frequency
        structure_frequency = int(stats['words']['total'])
        line.append("{:.5f}".format(structure_frequency))
        # add logDice_core_new
        logDice_core_new = logDice_new(stats, positions, line, core_rest)
        line.append("{:.5f}".format(logDice_core_new))
        # add logDice_all_new
        logDice_all_new = logDice_new(stats, positions, line, all_rest)
        line.append("{:.5f}".format(logDice_all_new))
        # The corpus variant is only inserted further below, next to the
        # corpus logDice columns.
        weighted_logDice_frequency_corpus = 0.3 * row_freq + 0.7 * float(
            line[positions['logDice_core']])
        weighted_logDice_frequency = 0.3 * row_freq + 0.7 * logDice_core_new
        line.append("{:.5f}".format(weighted_logDice_frequency))
        # add deltaP12_structure
        deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)
        line.append("{:.5f}".format(deltaP12_structure))
        # add deltaP21_structure
        deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)
        line.append("{:.5f}".format(deltaP21_structure))

        deltaP12 = float(line[positions['delta12']])
        deltaP21 = float(line[positions['delta21']])
        deltaP = abs(deltaP12 - deltaP21)

        deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
        line.append("{:.5f}".format(deltaP_structure))

        # Same two insertions, in the same order, as for the header row so
        # that header and data columns stay aligned.
        after_log_dice = positions['logDice_all'] + 1
        line = (line[:after_log_dice]
                + ["{:.5f}".format(weighted_logDice_frequency_corpus)]
                + line[after_log_dice:])
        after_delta21 = positions['delta21'] + 1
        line = line[:after_delta21] + ["{:.5f}".format(deltaP)] + line[after_delta21:]

        # TODO ADD OTHER COLUMNS AS IN #823 task
        wf.write(','.join(line) + '\n')


def main(args):
    """Recalculate statistics for every input file and/or format the output.

    Unless ``args.ignore_recalculation`` is set, every file in ``args.input``
    is read, optionally filtered by ``args.frequency_limit`` and sorted by
    descending frequency, and written with recalculated statistics to the
    file of the same name in ``args.output``.

    When ``args.format_output`` is set, every file in ``args.output`` is
    then cut down to its leading columns plus two selected statistics,
    filtered, sorted and written to ``args.formatted_output``.

    Args:
        args: Parsed command-line namespace (see the argument parser at the
            bottom of the file).
    """
    if not args.ignore_recalculation:
        word_order = load_word_order(args.word_order_file)
        for file_name in os.listdir(args.input):
            read_file_path = os.path.join(args.input, file_name)
            write_file_path = os.path.join(args.output, file_name)
            with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
                original_text, stats = get_new_stats(rf)
                header, rows = original_text[0], original_text[1:]
                freq_pos = header.index('Frequency')
                if args.frequency_limit > 1:
                    # Honour the requested limit instead of a hard-coded 10.
                    rows = [row for row in rows
                            if int(row[freq_pos]) >= args.frequency_limit]
                if args.sorted:
                    rows = sorted(rows, key=lambda row: -int(row[freq_pos]))
                write_new_stats(wf, [header] + rows, stats, file_name, word_order)

    if args.format_output:
        # Every file in the output folder is formatted; no early exit after
        # the first one.
        for file_name in os.listdir(args.output):
            read_file_path = os.path.join(args.output, file_name)
            write_file_path = os.path.join(args.formatted_output, file_name)
            # Write with the same encoding the file is read with.
            with open(read_file_path, 'r', encoding="utf-8") as rf, \
                    open(write_file_path, 'w', encoding="utf-8") as wf:
                header = None
                rows = []
                for raw in rf:
                    line = raw.rstrip('\n').split(',')
                    # Keep everything up to the last 14 columns and append
                    # columns -6 and -8 of the full row.
                    trimmed = line[:-14] + [line[-6], line[-8]]
                    if header is None:
                        header = trimmed
                    else:
                        rows.append(trimmed)

                # In the trimmed row, index -3 is column -15 of the full row
                # and index -5 is column -17. NOTE(review): presumably the
                # frequency and a lemma/form column -- confirm against the
                # #1808 specification. The threshold of 10 is fixed here.
                rows = [row for row in rows if int(row[-3]) >= 10]
                rows = sorted(rows, key=lambda row: (-int(row[-3]), row[-5]))

                formatted_output = ([header] if header is not None else []) + rows
                for row in formatted_output:
                    wf.write(','.join(row) + '\n')


# Command-line entry point: parse the arguments, run main() and log the
# elapsed wall-clock time.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Recalculate structure-level statistics of extracted structures and optionally format the output.')
    parser.add_argument('input',
                        help='Path to folder that contains all input files.')
    parser.add_argument('output',
                        help='Path to folder where the recalculated files are written.')
    parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
    parser.add_argument('--frequency_limit', type=int, default=1,
                        help='Frequency limit for filtering rows; filtering is enabled only when the value is greater than 1.')
    parser.add_argument('--sorted', action='store_true', help='Sort rows by descending frequency.')
    parser.add_argument('--format_output', action='store_true', help='Format and cut data as specified in #1808 on redmine.')
    parser.add_argument('--ignore_recalculation', action='store_true', help='Ignore recalculation.')
    parser.add_argument('--formatted_output', default=None, help='Destination of final results.')

    args = parser.parse_args()
    # The default level (WARNING) would silently drop the timing message
    # logged below.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: %s", time.time() - start)
Extended recalculate statistics to filtered output 2021-02-16 16:01:02 +00:00

Added script for file extension 2020-08-20 14:13:22 +00:00			`import argparse`
Extended recalculate statistics to filtered output 2021-02-16 16:01:02 +00:00			`import csv`
Added script for file extension 2020-08-20 14:13:22 +00:00			`import logging`
			`import os`
			`import sys`
			`import time`
			`from math import log2`

			`CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']`
			`ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']`
			`LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']`


			`def load_word_order(word_order_file):`
			`with open(word_order_file, 'r') as f:`
			`lines = {}`
			`for line in f:`
			`l = line.split('\|')`
			`if l[6] not in [e[0] for e in lines] and l[6] != '' and l[6] != 'NSSS':`
			`pos_tags = l[2].split('-')`
			`core_rest = sorted([str(pt_i + 1) for cr in CORE_RESTRICTIONS for pt_i, pt in enumerate(pos_tags) if pt[:len(cr)] == cr])`
			`assert len(core_rest) == 2, 'Core restrictions are incorrect!'`
			`all_rest = sorted([str(pt_i + 1) for cr in ALL_RESTRICTIONS for pt_i, pt in enumerate(pos_tags) if pt[:len(cr)] == cr])`
			`lines[l[6]] = [core_rest, all_rest]`
			`return lines`


			`def add_word(stats, pos, word, freq):`
			`if word == '':`
			`return`

			`if word not in stats['words'][pos]:`
			`stats['words'][pos][word] = int(freq)`
			`else:`
			`stats['words'][pos][word] += int(freq)`


			`def get_new_stats(f):`
			`lines = []`
			`stats = {}`
			`stats['words'] = {}`
			`stats['words']['1'] = {}`
			`stats['words']['2'] = {}`
			`stats['words']['3'] = {}`
			`stats['words']['4'] = {}`
			`stats['words']['5'] = {}`
			`stats['words']['total'] = 0`

			`first_line = True`
			`positions = {}`
			`for line in f.readlines():`
			`line = line.split(',')`
			`lines.append(line)`
			`if first_line:`
			`positions['freq'] = line.index('Frequency')`
			`for lci, lc in enumerate(LEMMA_COLUMNS):`
			`positions[str(lci + 1)] = line.index(lc)`
			`first_line = False`
			`continue`
			`for pos in range(1, 6):`
			`pos = str(pos)`
			`word = line[positions[pos]]`
			`add_word(stats, pos, word, line[positions['freq']])`
			`stats['words']['total'] += int(line[positions['freq']])`

			`return lines, stats`


			`def logDice_new(stats, positions, line, rest):`
			`fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]`
			`res = 14 + log2(2 * int(line[positions['freq']]) / sum(fi))`
			`return res`


			`def deltaP_new(stats, positions, line, rest, delta21=True):`
			`fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]`
			`fx = fi[0] if delta21 else fi[1]`
			`fy = fi[1] if delta21 else fi[0]`
			`freq = int(line[positions['freq']])`
			`N = int(stats['words']['total'])`
			`res = (freq / fx) - ((fy - freq) / (N - fx))`
			`return res`


			`def write_new_stats(wf, original_text, stats, file_name, word_order):`
			`structure_id = file_name.split('.')[-1]`
			`core_rest, all_rest = word_order[structure_id]`

			`first_line = True`
			`positions = {}`
			`for line in original_text:`
			`line[-1] = line[-1][:-1]`
			`# handle header file`
			`if first_line:`
			`line += ['structure_frequency', 'logDice_core', 'logDice_all',`
			`'weighted_logDice_frequency', 'deltaP12_structure',`
			`'deltaP21_structure', 'deltaP_structure']`

			`for i in range(5):`
			`new_pos = 6 + i + i * 5`
			`line = line[:new_pos] + ['C' + str(i + 1) + '_lemma_structure_frequency'] + line[new_pos:]`

			`positions['freq'] = line.index('Frequency')`
			`for lci, lc in enumerate(LEMMA_COLUMNS):`
			`positions[str(lci + 1)] = line.index(lc)`
			`positions['delta12'] = line.index('Delta_p12')`
			`positions['delta21'] = line.index('Delta_p21')`
			`positions['logDice_core'] = line.index('LogDice_core')`
			`positions['logDice_all'] = line.index('LogDice_all')`
			`line[positions['logDice_core']] = 'logDice_core_corpus'`
			`line[positions['logDice_all']] = 'logDice_all_corpus'`
			`first_line = False`
			`line = line[:positions['logDice_all'] + 1] + ['weighted_logDice_frequency_corpus'] + line[positions['logDice_all'] + 1:]`
			`line = line[:positions['delta21'] + 1] + ['deltaP'] + line[positions['delta21'] + 1:]`
			`# TODO INSERT 'deltaP', and weightedlogDice_frequency and , 'weighted_logDice_frequency_corpus'`
			`wf.write(','.join(line) + '\n')`
			`continue`

			`lemma_struct_freq = []`
			`for i in range(5):`
			`new_pos = 1 + i * 5`
			`freq = str(stats['words'][str(i + 1)][line[new_pos]]) if line[new_pos] != '' else '0'`
			`lemma_struct_freq.append(freq)`

			`for i in range(5):`
			`new_pos = 6 + i + i * 5`
			`line = line[:new_pos] + [lemma_struct_freq[i]] + line[new_pos:]`

			`# add structure_frequency`
			`structure_frequency = int(stats['words']['total'])`
			`line.append("{:.5f}".format(structure_frequency))`
			`# add logDice_core_new`
			`logDice_core_new = logDice_new(stats, positions, line, core_rest)`
			`line.append("{:.5f}".format(logDice_core_new))`
			`# add logDice_all_new`
			`logDice_all_new = logDice_new(stats, positions, line, all_rest)`
			`line.append("{:.5f}".format(logDice_all_new))`
			`weighted_logDice_frequency_corpus = 0.3 * int(line[positions['freq']]) + 0.7 * float(`
			`line[positions['logDice_core']])`
			`# line.append("{:.5f}".format(weighted_logDice_frequency_corpus))`
			`weighted_logDice_frequency = 0.3 * int(line[positions['freq']]) + 0.7 * logDice_core_new`
			`line.append("{:.5f}".format(weighted_logDice_frequency))`
			`# add deltaP12_structure`
			`deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)`
			`line.append("{:.5f}".format(deltaP12_structure))`
			`# add deltaP21_structure`
			`deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)`
			`line.append("{:.5f}".format(deltaP21_structure))`

			`deltaP12 = float(line[positions['delta12']])`
			`deltaP21 = float(line[positions['delta21']])`

			`deltaP = abs(deltaP12 - deltaP21)`
			`# line.append("{:.5f}".format(deltaP))`

			`deltaP_structure = abs(deltaP12_structure - deltaP21_structure)`
			`line.append("{:.5f}".format(deltaP_structure))`




			`line = line[:positions['logDice_all'] + 1] + ["{:.5f}".format(weighted_logDice_frequency_corpus)] + line[positions[`
			`'logDice_all'] + 1:]`
			`line = line[:positions['delta21'] + 1] + ["{:.5f}".format(deltaP)] + line[positions['delta21'] + 1:]`

			`# TODO ADD OTHER COLUMNS AS IN #823 task`
			`wf.write(','.join(line) + '\n')`


			`def main(args):`
Extended recalculate statistics to filtered output 2021-02-16 16:01:02 +00:00			`if not args.ignore_recalculation:`
			`word_order = load_word_order(args.word_order_file)`
			`for file_name in os.listdir(args.input):`
			`read_file_path = os.path.join(args.input, file_name)`
			`write_file_path = os.path.join(args.output, file_name)`
			`with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:`
			`original_text, stats = get_new_stats(rf)`
			`freq_pos = original_text[0].index('Frequency')`
			`if args.frequency_limit > 1:`
			`original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]`
			`if args.sorted:`
			`if len(original_text) > 1:`
			`original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))`
			`else:`
			`original_text = [original_text[0]]`
			`write_new_stats(wf, original_text, stats, file_name, word_order)`

			`if args.format_output:`
			`for file_name in os.listdir(args.output):`
			`read_file_path = os.path.join(args.output, file_name)`
			`write_file_path = os.path.join(args.formatted_output, file_name)`
			`with open(read_file_path, 'r', encoding="utf-8") as rf, open(write_file_path, 'w') as wf:`
			`first_line = True`
			`lines = []`
			`formatted_output = []`
			`for line in rf:`
			`line = line[:-1].split(',')`
			`if first_line:`
			`# sorting`
			`a = line[-17]`
			`b = line[-15]`
			`# post frequency`
			`c = line[-6]`
			`d = line[-8]`
			`formatted_output.append(line[:-14] + [line[-6], line[-8]])`

			`first_line = False`
			`continue`
			`lines.append(line[:-14] + [line[-6], line[-8]])`

			`lines = [line for line in lines if int(line[-3]) >= 10]`
			`lines = sorted(lines, key=lambda x: (-int(x[-3]), x[-5]))`
			`formatted_output += lines`
			`for line in formatted_output:`
			`wf.write(','.join(line) + '\n')`
			`break`

Added script for file extension 2020-08-20 14:13:22 +00:00
			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser(`
			`description='Extract structures from a parsed corpus.')`
			`parser.add_argument('input',`
			`help='Path to folder that contains all input files.')`
			`parser.add_argument('output',`
			`help='Path to folder that contains all input files.')`
			`parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')`
Modified readme.md + Removed obligatory sloleks_db + Added frequency_limit and sorted parameters in recalculate_statistics.py 2020-09-02 08:53:45 +00:00			`parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')`
			`parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')`
Extended recalculate statistics to filtered output 2021-02-16 16:01:02 +00:00			`parser.add_argument('--format_output', action='store_true', help='Format and cut data as specified in #1808 on redmine.')`
			`parser.add_argument('--ignore_recalculation', action='store_true', help='Ignore recalculation.')`
			`parser.add_argument('--formatted_output', default=None, help='Destination of final results.')`
Added script for file extension 2020-08-20 14:13:22 +00:00
			`args = parser.parse_args()`
			`logging.basicConfig(stream=sys.stderr)`

			`start = time.time()`
			`main(args)`
			`logging.info("TIME: {}".format(time.time() - start))`