Added script for file extension

2020-08-20 16:13:22 +02:00 · 2020-08-20 16:13:22 +02:00 · edea80e6e0
commit edea80e6e0
parent e8fdbfdb6a
2 changed files with 199 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -18,3 +18,9 @@ Suggested running with saved mysql file in tmpfs. Instructions:
 sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
 ```
+
+If running on big corpora (e.g. Gigafida), keep the database in RAM:
+```bash
+sudo mount -t tmpfs tmpfs /mnt/tmp
+sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
+```
--- a/scripts/recalculate_statistics.py
+++ b/scripts/recalculate_statistics.py
@ -0,0 +1,193 @@
+import argparse
+import logging
+import os
+import sys
+import time
+from math import log2
+
+# Tag prefixes of the core components of a structure; load_word_order()
+# asserts that exactly two components of every structure match them.
+CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
+# Core prefixes plus the extra prefixes that are counted for the "all" variant.
+ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
+# Header names of the lemma columns, in component order (positions 1-5).
+LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']
+
+
+def load_word_order(word_order_file):
+    """Read the word-order file and map structure ids to component positions.
+
+    Every line is split on '|'. Field 2 (0-based) holds the '-'-separated
+    tags of the structure's components and field 6 holds the structure id.
+    Lines whose id is empty, equals 'NSSS', or was already seen are skipped,
+    so the first occurrence of an id wins.
+
+    :param word_order_file: path of the '|'-separated word-order file.
+    :return: dict ``{structure_id: [core_rest, all_rest]}``; both values are
+        sorted lists of 1-based component positions (as strings) whose tag
+        starts with one of CORE_RESTRICTIONS / ALL_RESTRICTIONS respectively.
+    :raises AssertionError: if a structure does not have exactly two core
+        components.
+    :raises IndexError: if a line has fewer than seven '|'-separated fields.
+    """
+    def matching_positions(pos_tags, restrictions):
+        # 1-based positions (as strings) of tags starting with a restriction.
+        return sorted(str(pt_i + 1)
+                      for cr in restrictions
+                      for pt_i, pt in enumerate(pos_tags)
+                      if pt.startswith(cr))
+
+    lines = {}
+    with open(word_order_file, 'r') as f:
+        for line in f:
+            l = line.split('|')
+            structure_id = l[6]
+            # Test the id against the dict keys themselves. Comparing with
+            # the first character of each key (as was done before) never
+            # detected duplicates and could drop one-character ids.
+            if structure_id in ('', 'NSSS') or structure_id in lines:
+                continue
+            pos_tags = l[2].split('-')
+            core_rest = matching_positions(pos_tags, CORE_RESTRICTIONS)
+            assert len(core_rest) == 2, 'Core restrictions are incorrect!'
+            all_rest = matching_positions(pos_tags, ALL_RESTRICTIONS)
+            lines[structure_id] = [core_rest, all_rest]
+    return lines
+
+
+def add_word(stats, pos, word, freq):
+    """Add *freq* to the running count of *word* at component position *pos*.
+
+    Empty words are ignored. *freq* is converted with ``int`` before it is
+    added to ``stats['words'][pos][word]``.
+    """
+    if word == '':
+        return
+
+    counts = stats['words'][pos]
+    counts[word] = counts.get(word, 0) + int(freq)
+
+
+def get_new_stats(f):
+    """Read a comma-separated statistics file and count lemma frequencies.
+
+    The first line is the header; the columns 'Frequency' and LEMMA_COLUMNS
+    are located in it by name. For every following row the row frequency is
+    added to the count of each non-empty lemma (per component position) and
+    to the overall total.
+
+    :param f: open text file to read from.
+    :return: ``(lines, stats)`` where ``lines`` is every line split on ','
+        (the last field keeps its line ending) and ``stats['words']`` maps
+        '1'..'5' to ``{lemma: frequency}`` and 'total' to the frequency sum.
+    """
+    word_counts = {str(number): {} for number in range(1, 6)}
+    word_counts['total'] = 0
+    stats = {'words': word_counts}
+
+    rows = [raw.split(',') for raw in f.readlines()]
+    if not rows:
+        return rows, stats
+
+    # Locate the needed columns by their header names.
+    header = rows[0]
+    positions = {'freq': header.index('Frequency')}
+    for number, column in enumerate(LEMMA_COLUMNS, start=1):
+        positions[str(number)] = header.index(column)
+
+    lemma_keys = [str(number) for number in range(1, 6)]
+    for row in rows[1:]:
+        for key in lemma_keys:
+            add_word(stats, key, row[positions[key]], row[positions['freq']])
+        word_counts['total'] += int(row[positions['freq']])
+
+    return rows, stats
+
+
+def logDice_new(stats, positions, line, rest):
+    """Return ``14 + log2(2 * f / sum(f_i))`` for one row.
+
+    ``f`` is the row's frequency and ``f_i`` are the counted frequencies of
+    the row's lemmas at the component positions listed in *rest*.
+    """
+    word_stats = stats['words']
+    lemma_total = 0
+    for r in rest:
+        lemma_total += int(word_stats[r][line[positions[r]]])
+    row_freq = int(line[positions['freq']])
+    return 14 + log2(2 * row_freq / lemma_total)
+
+
+def deltaP_new(stats, positions, line, rest, delta21=True):
+    """Return ``f / fx - (fy - f) / (N - fx)`` for one row.
+
+    ``f`` is the row's frequency, ``N`` is ``stats['words']['total']`` and
+    ``fx``/``fy`` are the counted frequencies of the row's lemmas at the first
+    two positions in *rest*. With ``delta21=True`` ``fx`` belongs to
+    ``rest[0]`` and ``fy`` to ``rest[1]``; with ``delta21=False`` the two are
+    swapped.
+
+    Raises ZeroDivisionError when ``fx`` is 0 or equals ``N``.
+    """
+    counts = [int(stats['words'][r][line[positions[r]]]) for r in rest]
+    if delta21:
+        fx, fy = counts[0], counts[1]
+    else:
+        fx, fy = counts[1], counts[0]
+    freq = int(line[positions['freq']])
+    N = int(stats['words']['total'])
+    return (freq / fx) - ((fy - freq) / (N - fx))
+
+
+def _insert_after(row, inserts):
+    """Return a copy of *row* with each ``(index, value)`` placed right after ``row[index]``."""
+    result = list(row)
+    # Highest index first, so an insertion never shifts a target that still
+    # has to be processed; the result does not depend on the column order.
+    for index, value in sorted(inserts, key=lambda item: item[0], reverse=True):
+        result.insert(index + 1, value)
+    return result
+
+
+def _insert_lemma_columns(line, values):
+    """Insert the five per-component values after each component's five columns (in place)."""
+    for i, value in enumerate(values):
+        line.insert(6 + i * 6, value)
+
+
+def _header_positions(line):
+    """Locate, by header name, the columns that the row calculations read."""
+    positions = {'freq': line.index('Frequency')}
+    for lci, lc in enumerate(LEMMA_COLUMNS):
+        positions[str(lci + 1)] = line.index(lc)
+    positions['delta12'] = line.index('Delta_p12')
+    positions['delta21'] = line.index('Delta_p21')
+    positions['logDice_core'] = line.index('LogDice_core')
+    positions['logDice_all'] = line.index('LogDice_all')
+    return positions
+
+
+def write_new_stats(wf, original_text, stats, file_name, word_order):
+    """Write the table of one structure file, extended with recalculated statistics.
+
+    The header gains a ``C<i>_lemma_structure_frequency`` column after each
+    component's columns and seven trailing columns with the values computed
+    from *stats*. The existing ``LogDice_core``/``LogDice_all`` columns are
+    renamed to ``logDice_core_corpus``/``logDice_all_corpus``;
+    ``weighted_logDice_frequency_corpus`` is inserted after ``LogDice_all``
+    and ``deltaP`` after ``Delta_p21``. Every data row is extended the same
+    way.
+
+    :param wf: writable text file that receives the comma-joined rows.
+    :param original_text: rows as lists of strings, header first; the last
+        field of a row may still carry its trailing newline. The rows are
+        not modified.
+    :param stats: frequency statistics as produced by get_new_stats().
+    :param file_name: name of the processed file; the part after its last
+        '.' is used as the structure id.
+    :param word_order: mapping ``{structure_id: [core_rest, all_rest]}``.
+    :raises KeyError: if the structure id is not present in *word_order*.
+    :raises ValueError: if an expected header column is missing.
+    """
+    structure_id = file_name.split('.')[-1]
+    core_rest, all_rest = word_order[structure_id]
+    fmt = "{:.5f}".format
+
+    positions = {}
+    for row_number, original_line in enumerate(original_text):
+        # Work on a copy so the caller's rows stay untouched, and strip only
+        # a real line ending: a final line without '\n' must not lose a
+        # character of data.
+        line = list(original_line)
+        line[-1] = line[-1].rstrip('\n')
+
+        # handle header line
+        if row_number == 0:
+            line += ['structure_frequency', 'logDice_core', 'logDice_all',
+                     'weighted_logDice_frequency', 'deltaP12_structure',
+                     'deltaP21_structure', 'deltaP_structure']
+            _insert_lemma_columns(
+                line, ['C' + str(i + 1) + '_lemma_structure_frequency' for i in range(5)])
+
+            positions = _header_positions(line)
+            line[positions['logDice_core']] = 'logDice_core_corpus'
+            line[positions['logDice_all']] = 'logDice_all_corpus'
+            line = _insert_after(line, [
+                (positions['logDice_all'], 'weighted_logDice_frequency_corpus'),
+                (positions['delta21'], 'deltaP')])
+            wf.write(','.join(line) + '\n')
+            continue
+
+        # Per-component lemma frequency; the lemma of component i sits at
+        # index 1 + i * 5 before the new columns are inserted.
+        lemma_struct_freq = []
+        for i in range(5):
+            lemma = line[1 + i * 5]
+            lemma_struct_freq.append(str(stats['words'][str(i + 1)][lemma]) if lemma != '' else '0')
+        _insert_lemma_columns(line, lemma_struct_freq)
+
+        freq = int(line[positions['freq']])
+
+        # Values recalculated from the statistics of this file.
+        structure_frequency = int(stats['words']['total'])
+        logDice_core_new = logDice_new(stats, positions, line, core_rest)
+        logDice_all_new = logDice_new(stats, positions, line, all_rest)
+        weighted_logDice_frequency = 0.3 * freq + 0.7 * logDice_core_new
+        deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)
+        deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)
+        deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
+
+        # Values derived from the columns that were already in the file.
+        weighted_logDice_frequency_corpus = 0.3 * freq + 0.7 * float(
+            line[positions['logDice_core']])
+        deltaP = abs(float(line[positions['delta12']]) - float(line[positions['delta21']]))
+
+        # Same order as the trailing header columns.
+        line += [fmt(structure_frequency), fmt(logDice_core_new), fmt(logDice_all_new),
+                 fmt(weighted_logDice_frequency), fmt(deltaP12_structure),
+                 fmt(deltaP21_structure), fmt(deltaP_structure)]
+
+        line = _insert_after(line, [
+            (positions['logDice_all'], fmt(weighted_logDice_frequency_corpus)),
+            (positions['delta21'], fmt(deltaP))])
+
+        # TODO ADD OTHER COLUMNS AS IN #823 task
+        wf.write(','.join(line) + '\n')
+
+
+
+def main(args):
+    """Recalculate the statistics of every file in ``args.input``.
+
+    For each regular file in the input folder the statistics are collected
+    with get_new_stats() and the extended table is written by
+    write_new_stats() to a file of the same name in ``args.output``.
+
+    :param args: parsed command-line arguments providing ``input``,
+        ``output`` and ``word_order_file``.
+    """
+    word_order = load_word_order(args.word_order_file)
+    for file_name in os.listdir(args.input):
+        read_file_path = os.path.join(args.input, file_name)
+        # os.listdir() also returns sub-directories, which cannot be opened.
+        if not os.path.isfile(read_file_path):
+            continue
+        write_file_path = os.path.join(args.output, file_name)
+        # Finish reading before the output is opened: opening for writing
+        # truncates the file, which would destroy the input whenever the
+        # input and output folders are the same.
+        with open(read_file_path, 'r') as rf:
+            original_text, stats = get_new_stats(rf)
+        with open(write_file_path, 'w') as wf:
+            write_new_stats(wf, original_text, stats, file_name, word_order)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Recalculate logDice and deltaP statistics for every file in the input folder.')
+    parser.add_argument('input',
+                        help='Path to folder that contains all input files.')
+    parser.add_argument('output',
+                        help='Path to folder where the recalculated files are written.')
+    # The word-order file is always opened, so a missing option is reported
+    # by argparse instead of failing later inside open().
+    parser.add_argument('--word_order_file', type=str, required=True,
+                        help='File that contains word order for DeltaP calculations.')
+
+    args = parser.parse_args()
+    # The default level is WARNING, which would silently drop the timing
+    # message logged below.
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))