From 39692e839fbc7f4d38eb145b6a8eaac9d6f4f494 Mon Sep 17 00:00:00 2001
From: Luka
Date: Tue, 16 Feb 2021 17:01:02 +0100
Subject: [PATCH] Extended recalculate statistics to filtered output

---
Review notes (this section is ignored by `git am`):
- Line structure and indentation were reconstructed from a
  whitespace-collapsed copy; apply with `git apply --ignore-whitespace`
  if the context lines do not match exactly.
- The frequency filter now uses --frequency_limit instead of a literal 10.
- The formatting pass no longer stops after the first file (stray `break`
  removed), no longer drops the last character of an unterminated final
  line, writes UTF-8 (the encoding it reads), and reports a missing
  --formatted_output explicitly.
- Help texts of --frequency_limit and --sorted were copy-pasted from
  --word_order_file; they now describe the options.
- The commit and blob ids in the headers refer to the original commit.

 scripts/recalculate_statistics.py | 73 +++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 18 deletions(-)

diff --git a/scripts/recalculate_statistics.py b/scripts/recalculate_statistics.py
index d565586..86c57cb 100644
--- a/scripts/recalculate_statistics.py
+++ b/scripts/recalculate_statistics.py
@@ -1,4 +1,7 @@
+
+
 import argparse
+import csv
 import logging
 import os
 import sys
@@ -166,23 +169,54 @@ def write_new_stats(wf, original_text, stats, file_name, word_order):
         wf.write(','.join(line) + '\n')
 
 
-
 def main(args):
-    word_order = load_word_order(args.word_order_file)
-    for file_name in os.listdir(args.input):
-        read_file_path = os.path.join(args.input, file_name)
-        write_file_path = os.path.join(args.output, file_name)
-        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
-            original_text, stats = get_new_stats(rf)
-            freq_pos = original_text[0].index('Frequency')
-            if args.frequency_limit > 1:
-                original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
-            if args.sorted:
-                if len(original_text) > 1:
-                    original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
-                else:
-                    original_text = [original_text[0]]
-            write_new_stats(wf, original_text, stats, file_name, word_order)
+    """Recalculate statistics and optionally write the formatted output.
+
+    Unless ``args.ignore_recalculation`` is set, every file in ``args.input``
+    is recalculated and written under the same name to ``args.output``.
+    With ``args.format_output``, every file in ``args.output`` is then cut,
+    filtered, sorted and written to ``args.formatted_output``.
+    """
+    if not args.ignore_recalculation:
+        word_order = load_word_order(args.word_order_file)
+        for file_name in os.listdir(args.input):
+            read_file_path = os.path.join(args.input, file_name)
+            write_file_path = os.path.join(args.output, file_name)
+            with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
+                original_text, stats = get_new_stats(rf)
+                freq_pos = original_text[0].index('Frequency')
+                if args.frequency_limit > 1:
+                    # Apply the requested limit rather than a hard-coded 10.
+                    original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= args.frequency_limit]
+                if args.sorted:
+                    if len(original_text) > 1:
+                        original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
+                    else:
+                        original_text = [original_text[0]]
+                write_new_stats(wf, original_text, stats, file_name, word_order)
+
+    if args.format_output:
+        if args.formatted_output is None:
+            raise ValueError('--formatted_output is required with --format_output')
+        for file_name in os.listdir(args.output):
+            read_file_path = os.path.join(args.output, file_name)
+            write_file_path = os.path.join(args.formatted_output, file_name)
+            with open(read_file_path, 'r', encoding="utf-8") as rf, \
+                    open(write_file_path, 'w', encoding="utf-8") as wf:
+                # Drop the last 14 columns and append original columns -6 and
+                # -8 (labelled "post frequency" by the original author).
+                rows = [row[:-14] + [row[-6], row[-8]]
+                        for row in (line.rstrip('\n').split(',') for line in rf)]
+                if not rows:
+                    continue
+                header, lines = rows[0], rows[1:]
+                # After the cut, [-3] and [-5] are original columns -15 and -17
+                # (labelled "sorting" by the original author).
+                lines = [line for line in lines if int(line[-3]) >= 10]
+                lines = sorted(lines, key=lambda x: (-int(x[-3]), x[-5]))
+                for line in [header] + lines:
+                    wf.write(','.join(line) + '\n')
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
@@ -194,6 +228,9 @@ if __name__ == '__main__':
     parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
-    parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
-    parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
+    parser.add_argument('--frequency_limit', type=int, default=1, help='Minimum frequency a line must have to be kept (applied only when greater than 1).')
+    parser.add_argument('--sorted', action='store_true', help='Sort lines by descending frequency.')
+    parser.add_argument('--format_output', action='store_true', help='Format and cut data as specified in #1808 on redmine.')
+    parser.add_argument('--ignore_recalculation', action='store_true', help='Ignore recalculation.')
+    parser.add_argument('--formatted_output', default=None, help='Destination of final results.')
     args = parser.parse_args()
 
     logging.basicConfig(stream=sys.stderr)