2020-08-20 14:13:22 +00:00
|
|
|
import argparse
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
from math import log2
|
|
|
|
|
|
|
|
# Tag prefixes treated as "core" restrictions; load_word_order matches them
# against the start of each '-'-separated tag read from the word order file.
CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']

# The core prefixes followed by the extra prefixes used for the "all" variant.
ALL_RESTRICTIONS = [*CORE_RESTRICTIONS, 'vp', 'vd', 'd']

# Header names of the five lemma columns, in component order (C1 .. C5).
LEMMA_COLUMNS = ['C{}_Lemma'.format(component) for component in range(1, 6)]
|
|
|
|
|
|
|
|
|
|
|
|
def _matching_positions(pos_tags, restrictions):
    """Return sorted 1-based positions (as strings) of tags starting with any restriction."""
    return sorted([str(pt_i + 1)
                   for cr in restrictions
                   for pt_i, pt in enumerate(pos_tags)
                   if pt.startswith(cr)])


def load_word_order(word_order_file):
    """Read the word order file and collect restriction positions per structure.

    Every line is split on '|'. The field at index 6 is used as the structure
    id and the field at index 2 is split on '-' into a list of tags. Lines
    whose structure id is empty, equal to 'NSSS', or already seen are ignored
    (the first occurrence of an id wins). Blank lines are skipped.

    :param word_order_file: path of the '|'-separated word order file.
    :return: dict mapping structure id to ``[core_rest, all_rest]``, where each
        element is a sorted list of 1-based tag positions (as strings) whose
        tag starts with one of CORE_RESTRICTIONS / ALL_RESTRICTIONS.
    :raises AssertionError: when a structure does not have exactly two tags
        matching the core restrictions.
    """
    lines = {}
    with open(word_order_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            l = line.split('|')
            structure_id = l[6]
            # Membership is tested against the dict keys themselves; the
            # previous check compared against the first character of each
            # key, which never detected real duplicates.
            if structure_id in ('', 'NSSS') or structure_id in lines:
                continue

            pos_tags = l[2].split('-')
            core_rest = _matching_positions(pos_tags, CORE_RESTRICTIONS)
            assert len(core_rest) == 2, 'Core restrictions are incorrect!'
            all_rest = _matching_positions(pos_tags, ALL_RESTRICTIONS)
            lines[structure_id] = [core_rest, all_rest]
    return lines
|
|
|
|
|
|
|
|
|
|
|
|
def add_word(stats, pos, word, freq):
    """Accumulate ``freq`` for ``word`` in the counter of position ``pos``.

    :param stats: dict with a ``'words'`` entry that maps a position key to a
        ``{word: count}`` dict; it is updated in place.
    :param pos: position key inside ``stats['words']``.
    :param word: the word to count; an empty string is ignored.
    :param freq: frequency to add, anything accepted by ``int()``.
    """
    if word != '':
        counter = stats['words'][pos]
        counter[word] = counter.get(word, 0) + int(freq)
|
|
|
|
|
|
|
|
|
|
|
|
def get_new_stats(f):
    """Split a comma-separated file into rows and gather per-position word counts.

    The first row is the header; it supplies the column index of 'Frequency'
    and of each name in LEMMA_COLUMNS. For every following row the frequency
    is added to the counter of each of the five lemma positions (through
    add_word) and to the overall total.

    :param f: object providing ``readlines()``; rows are split on ',' without
        stripping, so the last field of a row keeps its line terminator.
    :return: tuple ``(lines, stats)`` where ``lines`` is the list of split rows
        (header included) and ``stats['words']`` holds the counters '1'..'5'
        plus the integer 'total'.
    """
    stats = {'words': {str(component): {} for component in range(1, 6)}}
    stats['words']['total'] = 0

    lines = []
    positions = {}
    for row_number, raw_row in enumerate(f.readlines()):
        row = raw_row.split(',')
        lines.append(row)

        if row_number == 0:
            # Header row: remember where the relevant columns are.
            positions['freq'] = row.index('Frequency')
            for component, column_name in enumerate(LEMMA_COLUMNS, start=1):
                positions[str(component)] = row.index(column_name)
            continue

        for pos in map(str, range(1, 6)):
            add_word(stats, pos, row[positions[pos]], row[positions['freq']])
        stats['words']['total'] += int(row[positions['freq']])

    return lines, stats
|
|
|
|
|
|
|
|
|
|
|
|
def logDice_new(stats, positions, line, rest):
    """Compute ``14 + log2(2 * f / sum(f_i))`` for one row.

    :param stats: dict whose ``stats['words'][r]`` maps a word to its count.
    :param positions: dict mapping each key in ``rest`` and ``'freq'`` to a
        column index of ``line``.
    :param line: list of field values of the row.
    :param rest: iterable of position keys whose word counts are summed.
    :return: the score as a float.
    """
    word_counts = stats['words']
    component_total = sum(int(word_counts[r][line[positions[r]]]) for r in rest)
    row_frequency = int(line[positions['freq']])
    return 14 + log2(2 * row_frequency / component_total)
|
|
|
|
|
|
|
|
|
|
|
|
def deltaP_new(stats, positions, line, rest, delta21=True):
    """Compute ``freq / fx - (fy - freq) / (N - fx)`` for one row.

    ``fx`` and ``fy`` are the counts of the words at the first two position
    keys in ``rest``; ``delta21`` selects which of them plays which role.
    ``N`` is ``stats['words']['total']``.

    :param stats: dict with per-position word counts and the 'total' count.
    :param positions: dict mapping position keys and ``'freq'`` to column
        indices of ``line``.
    :param line: list of field values of the row.
    :param rest: sequence of position keys (the first two are used).
    :param delta21: when True ``fx`` is the first count and ``fy`` the second,
        otherwise they are swapped.
    :return: the score as a float.
    """
    counts = [int(stats['words'][r][line[positions[r]]]) for r in rest]
    if delta21:
        fx = counts[0]
        fy = counts[1]
    else:
        fx = counts[1]
        fy = counts[0]

    freq = int(line[positions['freq']])
    N = int(stats['words']['total'])
    return (freq / fx) - ((fy - freq) / (N - fx))
|
|
|
|
|
|
|
|
|
|
|
|
def write_new_stats(wf, original_text, stats, file_name, word_order):
    """Write the rows of one structure file extended with structure-level statistics.

    The structure id is the part of ``file_name`` after its last '.'; it
    selects the core/all restriction positions from ``word_order``.

    The header row receives the names of the added columns; every other row
    receives, in this order: one per-lemma structure frequency after each
    lemma column group, the structure frequency, logDice (core and all),
    the weighted logDice/frequency, deltaP12/deltaP21 for the structure and
    their absolute difference. The corpus-level weighted logDice/frequency
    and deltaP are inserted right after the existing 'LogDice_all' and
    'Delta_p21' columns.

    Rows are copied before being modified, so ``original_text`` is left
    untouched.

    :param wf: writable text file object.
    :param original_text: list of rows (lists of strings), header first, as
        returned by get_new_stats.
    :param stats: statistics dict as returned by get_new_stats.
    :param file_name: name of the processed file.
    :param word_order: dict as returned by load_word_order.
    :raises KeyError: when the structure id is missing from ``word_order``.
    """
    structure_id = file_name.split('.')[-1]
    core_rest, all_rest = word_order[structure_id]

    first_line = True
    positions = {}
    for original_row in original_text:
        # Work on a copy so the caller's rows are not modified.
        line = list(original_row)
        # Remove only the line terminator; slicing off the last character
        # would damage the final field of a row that has no trailing newline.
        line[-1] = line[-1].rstrip('\n')

        # handle header file
        if first_line:
            line += ['structure_frequency', 'logDice_core', 'logDice_all',
                     'weighted_logDice_frequency', 'deltaP12_structure',
                     'deltaP21_structure', 'deltaP_structure']

            # One new column after each lemma column group.
            # NOTE(review): assumes five columns per component starting at
            # index 1 - confirm against the input format.
            for i in range(5):
                new_pos = 6 + i + i * 5
                line.insert(new_pos, 'C' + str(i + 1) + '_lemma_structure_frequency')

            # Column indices are taken AFTER the insertions above, so they are
            # valid for data rows once those got the same insertions.
            positions['freq'] = line.index('Frequency')
            for lci, lc in enumerate(LEMMA_COLUMNS):
                positions[str(lci + 1)] = line.index(lc)
            positions['delta12'] = line.index('Delta_p12')
            positions['delta21'] = line.index('Delta_p21')
            positions['logDice_core'] = line.index('LogDice_core')
            positions['logDice_all'] = line.index('LogDice_all')
            line[positions['logDice_core']] = 'logDice_core_corpus'
            line[positions['logDice_all']] = 'logDice_all_corpus'
            first_line = False

            # Same two insertions, in the same order, as for data rows below.
            line.insert(positions['logDice_all'] + 1, 'weighted_logDice_frequency_corpus')
            line.insert(positions['delta21'] + 1, 'deltaP')
            # TODO INSERT 'deltaP', and weightedlogDice_frequency and , 'weighted_logDice_frequency_corpus'
            wf.write(','.join(line) + '\n')
            continue

        # Look up the structure frequency of each lemma before any column is
        # inserted (indices refer to the original row layout).
        lemma_struct_freq = []
        for i in range(5):
            lemma = line[1 + i * 5]
            freq = str(stats['words'][str(i + 1)][lemma]) if lemma != '' else '0'
            lemma_struct_freq.append(freq)

        for i in range(5):
            new_pos = 6 + i + i * 5
            line.insert(new_pos, lemma_struct_freq[i])

        # add structure_frequency
        structure_frequency = int(stats['words']['total'])
        line.append("{:.5f}".format(structure_frequency))

        # add logDice_core_new
        logDice_core_new = logDice_new(stats, positions, line, core_rest)
        line.append("{:.5f}".format(logDice_core_new))

        # add logDice_all_new
        logDice_all_new = logDice_new(stats, positions, line, all_rest)
        line.append("{:.5f}".format(logDice_all_new))

        row_frequency = int(line[positions['freq']])
        weighted_logDice_frequency_corpus = 0.3 * row_frequency + 0.7 * float(
            line[positions['logDice_core']])
        weighted_logDice_frequency = 0.3 * row_frequency + 0.7 * logDice_core_new
        line.append("{:.5f}".format(weighted_logDice_frequency))

        # add deltaP12_structure
        deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)
        line.append("{:.5f}".format(deltaP12_structure))

        # add deltaP21_structure
        deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)
        line.append("{:.5f}".format(deltaP21_structure))

        deltaP12 = float(line[positions['delta12']])
        deltaP21 = float(line[positions['delta21']])
        deltaP = abs(deltaP12 - deltaP21)

        deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
        line.append("{:.5f}".format(deltaP_structure))

        # Corpus-level values go next to the existing corpus columns; the
        # order of these two insertions must match the header branch.
        line.insert(positions['logDice_all'] + 1, "{:.5f}".format(weighted_logDice_frequency_corpus))
        line.insert(positions['delta21'] + 1, "{:.5f}".format(deltaP))

        # TODO ADD OTHER COLUMNS AS IN #823 task
        wf.write(','.join(line) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(args):
    """Process every file in ``args.input`` and write the result to ``args.output``.

    For each input file the statistics are computed over all of its rows;
    afterwards the rows are optionally filtered by frequency and sorted, and
    the extended table is written to a file of the same name in the output
    folder.

    :param args: namespace with ``input``, ``output``, ``word_order_file``,
        ``frequency_limit`` (rows with a lower 'Frequency' are dropped when
        the limit is greater than 1) and ``sorted`` (sort rows by descending
        'Frequency').
    """
    word_order = load_word_order(args.word_order_file)
    for file_name in os.listdir(args.input):
        read_file_path = os.path.join(args.input, file_name)
        write_file_path = os.path.join(args.output, file_name)
        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
            # Statistics are gathered before filtering, i.e. over all rows.
            original_text, stats = get_new_stats(rf)
            freq_pos = original_text[0].index('Frequency')

            header, rows = original_text[0], original_text[1:]
            if args.frequency_limit > 1:
                # Use the requested limit (it used to be hard-coded to 10).
                rows = [row for row in rows if int(row[freq_pos]) >= args.frequency_limit]
            if args.sorted:
                rows = sorted(rows, key=lambda row: -int(row[freq_pos]))

            write_new_stats(wf, [header] + rows, stats, file_name, word_order)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse arguments, run main() and log the
    # elapsed time.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('input',
                        help='Path to folder that contains all input files.')
    parser.add_argument('output',
                        help='Path to folder where the output files are written.')
    # Required: main() opens this file unconditionally.
    parser.add_argument('--word_order_file', type=str, required=True,
                        help='File that contains word order for DeltaP calculations.')
    parser.add_argument('--frequency_limit', type=int, default=1,
                        help='Frequency threshold for filtering rows '
                             '(values above 1 enable filtering).')
    parser.add_argument('--sorted', action='store_true',
                        help='Sort the rows of each file by descending frequency.')

    args = parser.parse_args()
    # INFO level is needed, otherwise the timing message below is dropped.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|