Extended recalculate statistics to filtered output
This commit is contained in:
parent
f1366548b6
commit
39692e839f
|
@ -1,4 +1,7 @@
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import csv
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
@ -166,8 +169,8 @@ def write_new_stats(wf, original_text, stats, file_name, word_order):
|
||||||
wf.write(','.join(line) + '\n')
|
wf.write(','.join(line) + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
if not args.ignore_recalculation:
|
||||||
word_order = load_word_order(args.word_order_file)
|
word_order = load_word_order(args.word_order_file)
|
||||||
for file_name in os.listdir(args.input):
|
for file_name in os.listdir(args.input):
|
||||||
read_file_path = os.path.join(args.input, file_name)
|
read_file_path = os.path.join(args.input, file_name)
|
||||||
|
@ -184,6 +187,37 @@ def main(args):
|
||||||
original_text = [original_text[0]]
|
original_text = [original_text[0]]
|
||||||
write_new_stats(wf, original_text, stats, file_name, word_order)
|
write_new_stats(wf, original_text, stats, file_name, word_order)
|
||||||
|
|
||||||
|
if args.format_output:
    # Post-process every file in args.output: cut each comma-separated row
    # down to the wanted columns, drop low-frequency rows, sort the rest and
    # write the result under the same file name into args.formatted_output.
    for file_name in os.listdir(args.output):
        read_file_path = os.path.join(args.output, file_name)
        write_file_path = os.path.join(args.formatted_output, file_name)
        # Write with the same encoding that is used for reading; the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(read_file_path, 'r', encoding="utf-8") as rf, \
                open(write_file_path, 'w', encoding="utf-8") as wf:
            header = None
            rows = []
            for raw_line in rf:
                # rstrip('\n') instead of [:-1]: a final line without a
                # trailing newline must not lose its last character.
                fields = raw_line.rstrip('\n').split(',')
                # Keep everything before the last 14 columns, then append
                # original columns -6 and -8 (the "post frequency" columns).
                trimmed = fields[:-14] + [fields[-6], fields[-8]]
                if header is None:
                    # The first row is passed through unfiltered and unsorted.
                    header = trimmed
                else:
                    rows.append(trimmed)

            # In a trimmed row, index -3 is original column -15 and index -5
            # is original column -17 (the two sorting columns).
            rows = [row for row in rows if int(row[-3]) >= 10]
            rows.sort(key=lambda row: (-int(row[-3]), row[-5]))

            if header is not None:
                wf.write(','.join(header) + '\n')
            wf.writelines(','.join(row) + '\n' for row in rows)
        # NOTE(review): this break stops after the first directory entry, and
        # os.listdir order is arbitrary. Kept as in the original; confirm
        # whether only one file is expected here or this is a debug leftover.
        break
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description='Extract structures from a parsed corpus.')
|
description='Extract structures from a parsed corpus.')
|
||||||
|
@ -194,6 +228,9 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
|
parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
|
||||||
parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
|
parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
|
||||||
parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
|
parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
|
||||||
|
parser.add_argument('--format_output', action='store_true', help='Format and cut data as specified in #1808 on redmine.')
|
||||||
|
parser.add_argument('--ignore_recalculation', action='store_true', help='Ignore recalculation.')
|
||||||
|
parser.add_argument('--formatted_output', default=None, help='Destination of final results.')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
logging.basicConfig(stream=sys.stderr)
|
logging.basicConfig(stream=sys.stderr)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user