diff --git a/README.md b/README.md
index 58ee3b1..86b56b3 100644
--- a/README.md
+++ b/README.md
@@ -17,4 +17,10 @@ Suggested running with saved mysql file in tmpfs. Instructions:
 ```bash
 sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
+```
+
+If running on big corpora (e.g. Gigafida), keep the database in RAM:
+```bash
+sudo mount -t tmpfs tmpfs /mnt/tmp
+sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
 ```
\ No newline at end of file
diff --git a/scripts/recalculate_statistics.py b/scripts/recalculate_statistics.py
new file mode 100644
index 0000000..4239ab7
--- /dev/null
+++ b/scripts/recalculate_statistics.py
@@ -0,0 +1,282 @@
+"""Recalculate per-file collocation statistics (logDice, deltaP).
+
+For every file in the input folder the word frequencies are recounted from
+that file alone, and statistic columns based on those counts are added to the
+existing columns.
+"""
+import argparse
+import logging
+import os
+import sys
+import time
+from math import log2
+
+CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
+ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
+LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']
+
+# Input row layout assumed by write_new_stats: one leading column followed by
+# a fixed-width group of columns per component, the lemma being the first
+# column of each group.
+FIRST_LEMMA_INDEX = 1
+COMPONENT_WIDTH = 5
+
+# Columns appended to the end of every output row, in this order.
+STRUCTURE_COLUMNS = ['structure_frequency', 'logDice_core', 'logDice_all',
+                     'weighted_logDice_frequency', 'deltaP12_structure',
+                     'deltaP21_structure', 'deltaP_structure']
+
+
+def _matching_positions(pos_tags, restrictions):
+    """Return sorted 1-based positions (as strings) of tags with a listed prefix."""
+    return sorted(str(tag_i + 1)
+                  for restriction in restrictions
+                  for tag_i, tag in enumerate(pos_tags)
+                  if tag.startswith(restriction))
+
+
+def load_word_order(word_order_file):
+    """Map each structure id to its core and full lists of word positions.
+
+    Every line of the file is split on '|'; field 6 is the structure id and
+    field 2 holds '-'-separated tags.  Empty ids, the id 'NSSS' and ids that
+    were already read are skipped, so the first occurrence of an id wins.
+
+    Returns:
+        dict: structure id -> [core_rest, all_rest], the sorted 1-based
+        positions (as strings) of tags starting with one of
+        CORE_RESTRICTIONS / ALL_RESTRICTIONS respectively.
+
+    Raises:
+        ValueError: if a structure does not have exactly two core positions.
+    """
+    word_order = {}
+    with open(word_order_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            fields = line.split('|')
+            structure_id = fields[6]
+            # Membership is tested against the dict keys themselves.
+            if structure_id in ('', 'NSSS') or structure_id in word_order:
+                continue
+            pos_tags = fields[2].split('-')
+            core_rest = _matching_positions(pos_tags, CORE_RESTRICTIONS)
+            if len(core_rest) != 2:
+                raise ValueError('Core restrictions are incorrect!')
+            all_rest = _matching_positions(pos_tags, ALL_RESTRICTIONS)
+            word_order[structure_id] = [core_rest, all_rest]
+    return word_order
+
+
+def add_word(stats, pos, word, freq):
+    """Add `freq` to the count of `word` at position `pos`; '' is ignored."""
+    if word == '':
+        return
+    counts = stats['words'][pos]
+    counts[word] = counts.get(word, 0) + int(freq)
+
+
+def get_new_stats(f):
+    """Read a comma-separated file and count lemma frequencies per position.
+
+    The first non-blank line is the header; it must contain 'Frequency' and
+    every name in LEMMA_COLUMNS.  Blank lines are skipped.
+
+    Returns:
+        tuple: (lines, stats) where `lines` holds every row (header included)
+        split on ',' with the line ending still attached to the last field,
+        and `stats['words']` maps '1'..'5' to {lemma: summed frequency} and
+        'total' to the sum of all row frequencies.
+    """
+    lines = []
+    stats = {'words': {str(i + 1): {} for i in range(len(LEMMA_COLUMNS))}}
+    stats['words']['total'] = 0
+
+    positions = {}
+    for raw_line in f:
+        if not raw_line.strip():
+            continue
+        line = raw_line.split(',')
+        lines.append(line)
+        if not positions:
+            # Look names up without the line ending of the last header field.
+            header = line[:-1] + [line[-1].rstrip('\r\n')]
+            positions['freq'] = header.index('Frequency')
+            for lci, lc in enumerate(LEMMA_COLUMNS):
+                positions[str(lci + 1)] = header.index(lc)
+            continue
+        freq = int(line[positions['freq']])
+        for pos in map(str, range(1, len(LEMMA_COLUMNS) + 1)):
+            add_word(stats, pos, line[positions[pos]], freq)
+        stats['words']['total'] += freq
+
+    return lines, stats
+
+
+def logDice_new(stats, positions, line, rest):
+    """Return logDice of the row: 14 + log2(2 * f_row / sum of lemma counts).
+
+    `rest` lists the position keys whose per-file lemma counts go into the
+    denominator.
+    """
+    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
+    return 14 + log2(2 * int(line[positions['freq']]) / sum(fi))
+
+
+def deltaP_new(stats, positions, line, rest, delta21=True):
+    """Return freq / fx - (fy - freq) / (N - fx) for the row.
+
+    fx and fy are the per-file counts of the lemmas at the two positions in
+    `rest` (first/second when `delta21` is true, swapped otherwise) and N is
+    the total frequency of the file.  When the x lemma accounts for the whole
+    file (N == fx) the second term has no data behind it and is taken as 0
+    instead of dividing by zero.
+    """
+    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
+    fx, fy = (fi[0], fi[1]) if delta21 else (fi[1], fi[0])
+    freq = int(line[positions['freq']])
+    N = int(stats['words']['total'])
+    rest_of_file = N - fx
+    if rest_of_file == 0:
+        return freq / fx
+    return (freq / fx) - ((fy - freq) / rest_of_file)
+
+
+def _insert_corpus_columns(row, positions, weighted, delta_p):
+    """Insert `weighted` after logDice_all and `delta_p` after delta21, in place.
+
+    The insertion with the larger index goes first so that it cannot shift
+    the target of the other one.
+    """
+    inserts = [(positions['logDice_all'] + 1, weighted),
+               (positions['delta21'] + 1, delta_p)]
+    for index, value in sorted(inserts, reverse=True):
+        row.insert(index, value)
+
+
+def _build_header(line, positions):
+    """Return the output header for `line` and fill `positions` from it."""
+    line = line + STRUCTURE_COLUMNS
+    for i in range(len(LEMMA_COLUMNS)):
+        new_pos = FIRST_LEMMA_INDEX + (i + 1) * COMPONENT_WIDTH + i
+        line.insert(new_pos, 'C' + str(i + 1) + '_lemma_structure_frequency')
+
+    positions['freq'] = line.index('Frequency')
+    for lci, lc in enumerate(LEMMA_COLUMNS):
+        positions[str(lci + 1)] = line.index(lc)
+    positions['delta12'] = line.index('Delta_p12')
+    positions['delta21'] = line.index('Delta_p21')
+    positions['logDice_core'] = line.index('LogDice_core')
+    positions['logDice_all'] = line.index('LogDice_all')
+    # The original columns are relabelled as the corpus-level values.
+    line[positions['logDice_core']] = 'logDice_core_corpus'
+    line[positions['logDice_all']] = 'logDice_all_corpus'
+    _insert_corpus_columns(line, positions,
+                           'weighted_logDice_frequency_corpus', 'deltaP')
+    return line
+
+
+def _build_row(line, stats, positions, core_rest, all_rest):
+    """Return the output row for one data `line`."""
+    lemma_struct_freq = []
+    for i in range(len(LEMMA_COLUMNS)):
+        lemma = line[FIRST_LEMMA_INDEX + i * COMPONENT_WIDTH]
+        count = stats['words'][str(i + 1)][lemma] if lemma != '' else 0
+        lemma_struct_freq.append(str(count))
+    line = list(line)
+    for i, count in enumerate(lemma_struct_freq):
+        line.insert(FIRST_LEMMA_INDEX + (i + 1) * COMPONENT_WIDTH + i, count)
+
+    freq = int(line[positions['freq']])
+    logDice_core_new = logDice_new(stats, positions, line, core_rest)
+    logDice_all_new = logDice_new(stats, positions, line, all_rest)
+    weighted_logDice_frequency_corpus = (
+        0.3 * freq + 0.7 * float(line[positions['logDice_core']]))
+    weighted_logDice_frequency = 0.3 * freq + 0.7 * logDice_core_new
+    deltaP12_structure = deltaP_new(stats, positions, line, core_rest,
+                                    delta21=False)
+    deltaP21_structure = deltaP_new(stats, positions, line, core_rest,
+                                    delta21=True)
+    deltaP = abs(float(line[positions['delta12']])
+                 - float(line[positions['delta21']]))
+    deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
+
+    # Same order as STRUCTURE_COLUMNS.
+    new_values = [int(stats['words']['total']), logDice_core_new,
+                  logDice_all_new, weighted_logDice_frequency,
+                  deltaP12_structure, deltaP21_structure, deltaP_structure]
+    line.extend("{:.5f}".format(value) for value in new_values)
+    _insert_corpus_columns(
+        line, positions,
+        "{:.5f}".format(weighted_logDice_frequency_corpus),
+        "{:.5f}".format(deltaP))
+    return line
+
+
+def write_new_stats(wf, original_text, stats, file_name, word_order):
+    """Write the rows of `original_text` to `wf` with recalculated statistics.
+
+    Args:
+        wf: writable text file object.
+        original_text: rows as returned by get_new_stats (header first, each
+            row a list of fields); the rows are not modified.
+        stats: per-file word counts as returned by get_new_stats.
+        file_name: input file name; the part after its last '.' is the
+            structure id looked up in `word_order`.
+        word_order: mapping as returned by load_word_order.
+
+    Raises:
+        KeyError: if the structure id is missing from `word_order`.
+    """
+    structure_id = file_name.split('.')[-1]
+    core_rest, all_rest = word_order[structure_id]
+
+    positions = {}
+    for original_line in original_text:
+        line = list(original_line)
+        if line:
+            # Drop only a real line ending, never a data character.
+            line[-1] = line[-1].rstrip('\r\n')
+        if line in ([], ['']):
+            continue
+        if not positions:
+            line = _build_header(line, positions)
+        else:
+            line = _build_row(line, stats, positions, core_rest, all_rest)
+        # TODO ADD OTHER COLUMNS AS IN #823 task
+        wf.write(','.join(line) + '\n')
+
+
+def main(args):
+    """Recalculate statistics for every file in args.input into args.output."""
+    word_order = load_word_order(args.word_order_file)
+    os.makedirs(args.output, exist_ok=True)
+    for file_name in sorted(os.listdir(args.input)):
+        read_file_path = os.path.join(args.input, file_name)
+        if not os.path.isfile(read_file_path):
+            continue
+        write_file_path = os.path.join(args.output, file_name)
+        # Finish reading before the output is opened, so an output path that
+        # points at the input file cannot truncate it mid-read.
+        with open(read_file_path, 'r', encoding='utf-8') as rf:
+            original_text, stats = get_new_stats(rf)
+        with open(write_file_path, 'w', encoding='utf-8') as wf:
+            write_new_stats(wf, original_text, stats, file_name, word_order)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Recalculate logDice and deltaP statistics per input file.')
+    parser.add_argument('input',
+                        help='Path to folder that contains all input files.')
+    parser.add_argument('output',
+                        help='Path to folder where the output files are written.')
+    parser.add_argument('--word_order_file', type=str, required=True,
+                        help='File that contains word order for DeltaP calculations.')
+
+    args = parser.parse_args()
+    # INFO level is needed for the timing message below to be emitted.
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))