Added script for file extension
This commit is contained in:
		
							parent
							
								
									e8fdbfdb6a
								
							
						
					
					
						commit
						edea80e6e0
					
				| @ -18,3 +18,9 @@ Suggested running with saved mysql file in tmpfs. Instructions: | ||||
| sudo mkdir /mnt/tmp | ||||
| sudo mount -t tmpfs tmpfs /mnt/tmp | ||||
| ``` | ||||
| 
 | ||||
| If running on big corpora (e.g. Gigafida), keep the database in RAM: | ||||
| ```bash | ||||
| sudo mount -t tmpfs tmpfs /mnt/tmp | ||||
| sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp | ||||
| ``` | ||||
							
								
								
									
										193
									
								
								scripts/recalculate_statistics.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										193
									
								
								scripts/recalculate_statistics.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,193 @@ | ||||
| import argparse | ||||
| import logging | ||||
| import os | ||||
| import sys | ||||
| import time | ||||
| from math import log2 | ||||
| 
 | ||||
CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']


def _restriction_positions(pos_tags, restrictions):
    """Return sorted 1-based positions (as strings) of tags starting with any prefix."""
    return sorted(
        str(index + 1)
        for prefix in restrictions
        for index, tag in enumerate(pos_tags)
        if tag.startswith(prefix)
    )


def load_word_order(word_order_file):
    """Read the word-order file and map each structure id to its restrictions.

    Every line is split on '|'. Field 6 is used as the structure id and
    field 2 as a '-'-separated list of tags. Lines whose id is empty or
    'NSSS' are skipped, and only the first line seen for an id is kept.

    Args:
        word_order_file: Path of the '|'-separated file to read.

    Returns:
        A dict ``{structure_id: [core_rest, all_rest]}`` where both values
        are sorted lists of 1-based tag positions (as strings) whose tag
        starts with one of CORE_RESTRICTIONS / ALL_RESTRICTIONS.

    Raises:
        AssertionError: If a structure does not have exactly two core
            positions.
    """
    word_order = {}
    with open(word_order_file, 'r') as f:
        for line in f:
            fields = line.split('|')
            if len(fields) < 7:
                # Blank or truncated line: there is no structure id column.
                continue

            structure_id = fields[6]
            # Membership is tested against the dict keys themselves; the
            # previous check compared the id with the first character of
            # every stored key, so duplicates were not detected reliably.
            if structure_id in ('', 'NSSS') or structure_id in word_order:
                continue

            pos_tags = fields[2].split('-')
            core_rest = _restriction_positions(pos_tags, CORE_RESTRICTIONS)
            assert len(core_rest) == 2, 'Core restrictions are incorrect!'
            all_rest = _restriction_positions(pos_tags, ALL_RESTRICTIONS)
            word_order[structure_id] = [core_rest, all_rest]
    return word_order
| 
 | ||||
| 
 | ||||
def add_word(stats, pos, word, freq):
    """Add ``freq`` to the running count of ``word`` at position ``pos``.

    Counts live in ``stats['words'][pos]``; ``freq`` is converted with
    ``int``. An empty ``word`` is ignored and leaves ``stats`` untouched.
    """
    if word != '':
        counts_at_position = stats['words'][pos]
        counts_at_position[word] = counts_at_position.get(word, 0) + int(freq)
| 
 | ||||
| 
 | ||||
def get_new_stats(f):
    """Read a comma-separated file and tally lemma frequencies per position.

    The first row is the header; it must contain 'Frequency' and every name
    in LEMMA_COLUMNS. Rows are split on ',' without further processing, so
    the last field of each row keeps its line terminator.

    Args:
        f: Open text file to read.

    Returns:
        ``(rows, stats)`` where ``rows`` is every split row including the
        header, and ``stats['words']`` holds one ``{lemma: frequency}`` dict
        per position '1'..'5' plus 'total', the sum of all row frequencies.
    """
    word_counts = {str(slot): {} for slot in range(1, 6)}
    word_counts['total'] = 0
    stats = {'words': word_counts}

    rows = [raw_line.split(',') for raw_line in f.readlines()]
    if not rows:
        return rows, stats

    # Column indexes are resolved once from the header row.
    header = rows[0]
    positions = {'freq': header.index('Frequency')}
    for slot, column_name in enumerate(LEMMA_COLUMNS, start=1):
        positions[str(slot)] = header.index(column_name)

    for row in rows[1:]:
        for slot in ('1', '2', '3', '4', '5'):
            add_word(stats, slot, row[positions[slot]], row[positions['freq']])
        word_counts['total'] += int(row[positions['freq']])

    return rows, stats
| 
 | ||||
| 
 | ||||
def logDice_new(stats, positions, line, rest):
    """Return ``14 + log2(2 * f / sum(f_i))`` for one row.

    ``f`` is the row's own frequency (``line[positions['freq']]``) and each
    ``f_i`` is the count stored in ``stats['words'][r]`` for the lemma the
    row has at position ``r``, for every ``r`` in ``rest``.
    """
    word_counts = stats['words']
    component_total = 0
    for slot in rest:
        lemma = line[positions[slot]]
        component_total += int(word_counts[slot][lemma])
    joint_frequency = int(line[positions['freq']])
    return 14 + log2(2 * joint_frequency / component_total)
| 
 | ||||
| 
 | ||||
def deltaP_new(stats, positions, line, rest, delta21=True):
    """Return ``freq / fx - (fy - freq) / (N - fx)`` for one row.

    The counts of the row's lemmas at the positions in ``rest`` are looked
    up in ``stats['words']``. With ``delta21=True`` the first position
    supplies ``fx`` and the second ``fy``; with ``delta21=False`` the roles
    are swapped. ``N`` is ``stats['words']['total']`` and ``freq`` is the
    row's own frequency.

    Raises:
        ZeroDivisionError: If ``fx`` is zero or equals ``N``.
    """
    counts = [int(stats['words'][slot][line[positions[slot]]]) for slot in rest]
    if delta21:
        fx, fy = counts[0], counts[1]
    else:
        fx, fy = counts[1], counts[0]

    joint_frequency = int(line[positions['freq']])
    total = int(stats['words']['total'])

    conditional_on_x = joint_frequency / fx
    conditional_on_not_x = (fy - joint_frequency) / (total - fx)
    return conditional_on_x - conditional_on_not_x
| 
 | ||||
| 
 | ||||
def _insert_corpus_columns(line, positions, weighted_logdice, delta_p):
    """Insert the two extra corpus-level cells after 'logDice_all' and 'delta21'.

    Header and data rows both go through this helper so that the cells are
    inserted at the same indexes and in the same order for every row.
    """
    line.insert(positions['logDice_all'] + 1, weighted_logdice)
    line.insert(positions['delta21'] + 1, delta_p)


def write_new_stats(wf, original_text, stats, file_name, word_order):
    """Write the rows of one file extended with per-structure statistics.

    For every row the output gains:
      * one 'C<i>_lemma_structure_frequency' cell per lemma position,
        inserted at indexes 6, 12, 18, 24 and 30;
      * 'weighted_logDice_frequency_corpus' after the existing LogDice_all
        column and 'deltaP' after the existing Delta_p21 column;
      * seven appended cells: structure_frequency, logDice_core,
        logDice_all, weighted_logDice_frequency, deltaP12_structure,
        deltaP21_structure and deltaP_structure.
    The existing 'LogDice_core' / 'LogDice_all' header cells are renamed to
    'logDice_core_corpus' / 'logDice_all_corpus'.

    Args:
        wf: Writable text file receiving the comma-joined rows.
        original_text: Rows as returned by ``get_new_stats`` (header first,
            last field of each row still carrying its line terminator).
            The rows are not modified.
        stats: Frequency statistics as returned by ``get_new_stats``.
        file_name: Name of the processed file; the text after its last '.'
            is used as the structure id.
        word_order: Mapping ``structure_id -> [core_rest, all_rest]``.

    Raises:
        KeyError: If the structure id is missing from ``word_order``.
    """
    structure_id = file_name.split('.')[-1]
    core_rest, all_rest = word_order[structure_id]

    positions = {}
    for row_index, source_row in enumerate(original_text):
        # Work on a copy so the caller's rows stay untouched.
        line = list(source_row)
        # Remove only the line terminator; the last row of a file may not
        # have one, and blindly dropping a character would corrupt it.
        line[-1] = line[-1].rstrip('\n')

        if row_index == 0:
            # Header row: add the new column names and record the indexes
            # that the data rows are addressed by afterwards.
            line += ['structure_frequency', 'logDice_core', 'logDice_all',
                     'weighted_logDice_frequency', 'deltaP12_structure',
                     'deltaP21_structure', 'deltaP_structure']

            for i in range(5):
                line.insert(6 + i + i * 5, 'C' + str(i + 1) + '_lemma_structure_frequency')

            positions['freq'] = line.index('Frequency')
            for lci, lc in enumerate(LEMMA_COLUMNS):
                positions[str(lci + 1)] = line.index(lc)
            positions['delta12'] = line.index('Delta_p12')
            positions['delta21'] = line.index('Delta_p21')
            positions['logDice_core'] = line.index('LogDice_core')
            positions['logDice_all'] = line.index('LogDice_all')
            line[positions['logDice_core']] = 'logDice_core_corpus'
            line[positions['logDice_all']] = 'logDice_all_corpus'

            _insert_corpus_columns(line, positions, 'weighted_logDice_frequency_corpus', 'deltaP')
            wf.write(','.join(line) + '\n')
            continue

        # Per-lemma structure frequencies are read before any insertion.
        # NOTE(review): assumes the lemma cells sit at indexes 1, 6, 11, 16
        # and 21 of the input row - confirm against the file producer.
        lemma_struct_freq = []
        for i in range(5):
            lemma = line[1 + i * 5]
            lemma_struct_freq.append(str(stats['words'][str(i + 1)][lemma]) if lemma != '' else '0')

        for i, lemma_freq in enumerate(lemma_struct_freq):
            line.insert(6 + i + i * 5, lemma_freq)

        # From here on `positions` (taken from the widened header) addresses
        # the row correctly; appending at the end does not shift them.
        frequency = int(line[positions['freq']])
        structure_frequency = int(stats['words']['total'])
        logDice_core_new = logDice_new(stats, positions, line, core_rest)
        logDice_all_new = logDice_new(stats, positions, line, all_rest)
        weighted_logDice_frequency_corpus = 0.3 * frequency + 0.7 * float(line[positions['logDice_core']])
        weighted_logDice_frequency = 0.3 * frequency + 0.7 * logDice_core_new
        deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)
        deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)
        deltaP = abs(float(line[positions['delta12']]) - float(line[positions['delta21']]))
        deltaP_structure = abs(deltaP12_structure - deltaP21_structure)

        # Appended in the same order as the header names added above.
        line += ["{:.5f}".format(value) for value in (
            structure_frequency,
            logDice_core_new,
            logDice_all_new,
            weighted_logDice_frequency,
            deltaP12_structure,
            deltaP21_structure,
            deltaP_structure,
        )]

        _insert_corpus_columns(line, positions,
                               "{:.5f}".format(weighted_logDice_frequency_corpus),
                               "{:.5f}".format(deltaP))

        # TODO ADD OTHER COLUMNS AS IN #823 task
        wf.write(','.join(line) + '\n')
| 
 | ||||
| 
 | ||||
| 
 | ||||
def main(args):
    """Recalculate the statistics of every file in ``args.input``.

    Each regular file in the input folder is read, its statistics are
    recomputed, and the widened table is written under the same file name
    into ``args.output`` (created if missing).

    Args:
        args: Parsed arguments with ``input``, ``output`` and
            ``word_order_file`` attributes.

    Raises:
        ValueError: If input and output point to the same folder.
    """
    # Opening the output truncates it before the input is read, so writing
    # into the input folder would destroy the data being processed.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        raise ValueError('Input and output folders must be different.')

    word_order = load_word_order(args.word_order_file)
    os.makedirs(args.output, exist_ok=True)

    # Sorted for a reproducible processing order across runs.
    for file_name in sorted(os.listdir(args.input)):
        read_file_path = os.path.join(args.input, file_name)
        if not os.path.isfile(read_file_path):
            # Sub-directories cannot be opened as data files.
            continue
        write_file_path = os.path.join(args.output, file_name)
        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
            original_text, stats = get_new_stats(rf)
            write_new_stats(wf, original_text, stats, file_name, word_order)
| 
 | ||||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('input',
                        help='Path to folder that contains all input files.')
    parser.add_argument('output',
                        help='Path to folder where the output files are written.')
    # The word-order file is always opened by main(), so it must be given.
    parser.add_argument('--word_order_file', type=str, required=True,
                        help='File that contains word order for DeltaP calculations.')

    args = parser.parse_args()
    # INFO level is needed for the timing message below to be emitted.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
		Loading…
	
		Reference in New Issue
	
	Block a user