"""Recompute per-structure collocation statistics for exported CSV files.

For every file in the input folder the script counts how often each lemma
occurs at each of the five component positions *within that file*, and
then writes a copy of the file (rows with ``Frequency >= 10`` only, sorted
by descending frequency) extended with structure-level frequency, logDice
and deltaP columns.
"""
import argparse
import logging
import os
import sys
import time
from math import log2

CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']


def _matching_positions(pos_tags, restrictions):
    """Return sorted 1-based positions (as strings) of tags starting with any restriction."""
    return sorted(
        str(index + 1)
        for restriction in restrictions
        for index, tag in enumerate(pos_tags)
        if tag.startswith(restriction)
    )


def load_word_order(word_order_file):
    """Read the word-order file and map structure ids to component positions.

    Each line is split on ``'|'``; field 6 is used as the structure id and
    field 2 as a ``'-'``-separated list of tags.  Lines whose id is empty
    or ``'NSSS'`` are ignored, and only the first line seen for an id is
    used.

    Returns:
        dict mapping structure id to ``[core_rest, all_rest]``, where each
        element is a sorted list of 1-based positions (as strings) whose
        tag starts with one of ``CORE_RESTRICTIONS`` / ``ALL_RESTRICTIONS``.

    Raises:
        AssertionError: if a structure does not have exactly two core
            positions.
    """
    lines = {}
    with open(word_order_file, 'r') as f:
        for line in f:
            fields = line.split('|')
            structure_id = fields[6]
            # Membership is tested against the dict keys themselves; the
            # previous check compared against the first character of each
            # key and therefore never detected real duplicates.
            if structure_id in lines or structure_id in ('', 'NSSS'):
                continue
            pos_tags = fields[2].split('-')
            core_rest = _matching_positions(pos_tags, CORE_RESTRICTIONS)
            assert len(core_rest) == 2, 'Core restrictions are incorrect!'
            all_rest = _matching_positions(pos_tags, ALL_RESTRICTIONS)
            lines[structure_id] = [core_rest, all_rest]
    return lines


def add_word(stats, pos, word, freq):
    """Add *freq* to the count of *word* at position *pos*; empty words are ignored."""
    if word == '':
        return
    counts = stats['words'][pos]
    counts[word] = counts.get(word, 0) + int(freq)


def get_new_stats(f):
    """Parse an input CSV and accumulate per-position lemma frequencies.

    The first line must be a header containing ``'Frequency'`` and all
    ``LEMMA_COLUMNS``.  Lines are split on ``','`` without any quoting
    support, and the trailing newline is left on the last field.

    Returns:
        ``(lines, stats)`` where ``lines`` is the list of split lines
        (header included) and ``stats['words']`` maps ``'1'``..``'5'`` to
        ``{lemma: summed frequency}`` and ``'total'`` to the sum of all
        row frequencies.
    """
    lines = []
    stats = {'words': {str(pos): {} for pos in range(1, 6)}}
    stats['words']['total'] = 0
    positions = None
    for raw_line in f:
        line = raw_line.split(',')
        lines.append(line)
        if positions is None:
            # Header row: remember where the columns of interest live.
            positions = {'freq': line.index('Frequency')}
            for lci, lc in enumerate(LEMMA_COLUMNS):
                positions[str(lci + 1)] = line.index(lc)
            continue
        freq = line[positions['freq']]
        for pos in range(1, 6):
            pos = str(pos)
            add_word(stats, pos, line[positions[pos]], freq)
        stats['words']['total'] += int(freq)
    return lines, stats


def logDice_new(stats, positions, line, rest):
    """Return ``14 + log2(2 * f / sum(f_i))`` for one row.

    ``f`` is the row frequency and ``f_i`` are the structure-level
    frequencies of the row's lemmas at the positions listed in *rest*.
    """
    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
    res = 14 + log2(2 * int(line[positions['freq']]) / sum(fi))
    return res


def deltaP_new(stats, positions, line, rest, delta21=True):
    """Return ``freq / fx - (fy - freq) / (N - fx)`` for one row.

    With ``delta21=True`` ``fx`` is the frequency of the lemma at
    ``rest[0]`` and ``fy`` the one at ``rest[1]``; with ``delta21=False``
    the two are swapped.  ``N`` is ``stats['words']['total']``.

    Raises:
        ZeroDivisionError: if ``fx`` is 0 or equal to ``N``.
    """
    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
    fx = fi[0] if delta21 else fi[1]
    fy = fi[1] if delta21 else fi[0]
    freq = int(line[positions['freq']])
    N = int(stats['words']['total'])
    res = (freq / fx) - ((fy - freq) / (N - fx))
    return res


def _insert_after(fields, index, value):
    """Return a copy of *fields* with *value* inserted right after *index*."""
    return fields[:index + 1] + [value] + fields[index + 1:]


def _insert_lemma_frequency_columns(fields, values):
    """Return *fields* with one value inserted after each 5-column component group.

    Insertion indices are 6, 12, 18, 24, 30: column 0 followed by five
    groups of five columns, each growing by one as values are inserted.
    """
    for i, value in enumerate(values):
        new_pos = 6 + i + i * 5
        fields = fields[:new_pos] + [value] + fields[new_pos:]
    return fields


def _prepare_header(header):
    """Build the output header and the column positions used for data rows."""
    header = header + ['structure_frequency', 'logDice_core', 'logDice_all',
                       'weighted_logDice_frequency', 'deltaP12_structure',
                       'deltaP21_structure', 'deltaP_structure']
    header = _insert_lemma_frequency_columns(
        header, ['C' + str(i + 1) + '_lemma_structure_frequency' for i in range(5)])

    # Positions are taken after the lemma-frequency columns were inserted,
    # i.e. they index rows that already carry those columns.
    positions = {'freq': header.index('Frequency')}
    for lci, lc in enumerate(LEMMA_COLUMNS):
        positions[str(lci + 1)] = header.index(lc)
    positions['delta12'] = header.index('Delta_p12')
    positions['delta21'] = header.index('Delta_p21')
    positions['logDice_core'] = header.index('LogDice_core')
    positions['logDice_all'] = header.index('LogDice_all')

    header[positions['logDice_core']] = 'logDice_core_corpus'
    header[positions['logDice_all']] = 'logDice_all_corpus'

    # The insertion after logDice_all is done first; this keeps the stored
    # delta21 index valid as long as Delta_p21 precedes LogDice_all.
    header = _insert_after(header, positions['logDice_all'], 'weighted_logDice_frequency_corpus')
    header = _insert_after(header, positions['delta21'], 'deltaP')
    # TODO INSERT 'deltaP', and weightedlogDice_frequency and , 'weighted_logDice_frequency_corpus'
    return header, positions


def _prepare_row(row, stats, positions, core_rest, all_rest):
    """Build one output data row with all structure-level columns added."""
    # Lemmas are read from the untouched row: column 1 + 5 * i for component i.
    lemma_struct_freq = []
    for i in range(5):
        lemma = row[1 + i * 5]
        lemma_struct_freq.append(str(stats['words'][str(i + 1)][lemma]) if lemma != '' else '0')
    row = _insert_lemma_frequency_columns(row, lemma_struct_freq)

    freq = int(row[positions['freq']])

    # add structure_frequency
    structure_frequency = int(stats['words']['total'])
    row.append("{:.5f}".format(structure_frequency))

    # add logDice_core_new
    logDice_core_new = logDice_new(stats, positions, row, core_rest)
    row.append("{:.5f}".format(logDice_core_new))

    # add logDice_all_new
    logDice_all_new = logDice_new(stats, positions, row, all_rest)
    row.append("{:.5f}".format(logDice_all_new))

    weighted_logDice_frequency_corpus = 0.3 * freq + 0.7 * float(row[positions['logDice_core']])
    weighted_logDice_frequency = 0.3 * freq + 0.7 * logDice_core_new
    row.append("{:.5f}".format(weighted_logDice_frequency))

    # add deltaP12_structure
    deltaP12_structure = deltaP_new(stats, positions, row, core_rest, delta21=False)
    row.append("{:.5f}".format(deltaP12_structure))

    # add deltaP21_structure
    deltaP21_structure = deltaP_new(stats, positions, row, core_rest, delta21=True)
    row.append("{:.5f}".format(deltaP21_structure))

    deltaP12 = float(row[positions['delta12']])
    deltaP21 = float(row[positions['delta21']])
    deltaP = abs(deltaP12 - deltaP21)

    deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
    row.append("{:.5f}".format(deltaP_structure))

    # Same insertion order as for the header so that columns line up.
    row = _insert_after(row, positions['logDice_all'],
                        "{:.5f}".format(weighted_logDice_frequency_corpus))
    row = _insert_after(row, positions['delta21'], "{:.5f}".format(deltaP))
    # TODO ADD OTHER COLUMNS AS IN #823 task
    return row


def write_new_stats(wf, original_text, stats, file_name, word_order):
    """Write the extended CSV for one structure file to *wf*.

    Args:
        wf: writable text file object.
        original_text: list of split lines; the first one is the header.
        stats: statistics as returned by ``get_new_stats``.
        file_name: input file name; the part after the last ``'.'`` is
            used as the structure id.
        word_order: mapping as returned by ``load_word_order``.

    Raises:
        KeyError: if the structure id is not present in *word_order*.
    """
    structure_id = file_name.split('.')[-1]
    core_rest, all_rest = word_order[structure_id]

    positions = None
    for original_line in original_text:
        # Work on a copy so the caller's lists are left untouched.
        line = list(original_line)
        # Strip only a real line terminator; a final row without a
        # trailing newline must keep the last character of its last field.
        line[-1] = line[-1].rstrip('\n')

        if positions is None:
            line, positions = _prepare_header(line)
        else:
            line = _prepare_row(line, stats, positions, core_rest, all_rest)
        wf.write(','.join(line) + '\n')


def main(args):
    """Process every file of ``args.input`` into ``args.output``.

    Statistics are computed over all rows of a file, but only rows with
    ``Frequency >= 10`` are written, ordered by descending frequency.
    """
    word_order = load_word_order(args.word_order_file)
    for file_name in os.listdir(args.input):
        read_file_path = os.path.join(args.input, file_name)
        write_file_path = os.path.join(args.output, file_name)
        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
            original_text, stats = get_new_stats(rf)
            if not original_text:
                # No header at all: nothing can be computed for this file.
                logging.warning('Skipping empty input file: %s', read_file_path)
                continue
            header = original_text[0]
            freq_pos = header.index('Frequency')
            rows = [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
            rows = sorted(rows, key=lambda x: -1 * int(x[freq_pos]))
            write_new_stats(wf, [header] + rows, stats, file_name, word_order)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('input',
                        help='Path to folder that contains all input files.')
    parser.add_argument('output',
                        help='Path to folder where all output files are written.')
    # main() always opens this file, so the option is mandatory.
    parser.add_argument('--word_order_file', type=str, required=True,
                        help='File that contains word order for DeltaP calculations.')
    args = parser.parse_args()
    # Level INFO is needed for the timing message below to be emitted.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))