Modified frequency threshold, commenting lines, added logDice, added temporary calc values
This commit is contained in:
parent
c6eaf12753
commit
9cd3758362
|
@ -375,7 +375,7 @@ def main():
|
||||||
# query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
|
# query_tree = [{"children": [{}, {}, {}, {}]}, {"children": [{"children": [{}]}, {}, {}]}, {"children": [{"children": [{}, {}]}, {}]}, {"children": [{"children": [{}]}, {"children": [{}]}]},
|
||||||
# {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
|
# {"children": [{"children": [{"children": [{}]}]}, {}]}, {"children": [{"children": [{"children": [{}]}, {}]}]}, {"children": [{"children": [{"children": [{}, {}]}]}]},
|
||||||
# {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}, {'children': [{'children': [{}, {}, {}]}]}]
|
# {"children": [{"children": [{"children": [{"children": [{}]}]}]}]}, {'children': [{'children': [{}, {}, {}]}]}]
|
||||||
tree_size_range = config.get('settings', 'tree_size').split('-')
|
tree_size_range = config.get('settings', 'tree_size', fallback='0').split('-')
|
||||||
tree_size_range = [int(r) for r in tree_size_range]
|
tree_size_range = [int(r) for r in tree_size_range]
|
||||||
|
|
||||||
if tree_size_range[0] > 1:
|
if tree_size_range[0] > 1:
|
||||||
|
@ -440,8 +440,8 @@ def main():
|
||||||
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
|
filters['complete_tree_type'] = config.get('settings', 'tree_type') == 'complete'
|
||||||
filters['association_measures'] = config.getboolean('settings', 'association_measures')
|
filters['association_measures'] = config.getboolean('settings', 'association_measures')
|
||||||
filters['nodes_number'] = config.getboolean('settings', 'nodes_number')
|
filters['nodes_number'] = config.getboolean('settings', 'nodes_number')
|
||||||
filters['frequency_threshold'] = config.getfloat('settings', 'frequency_threshold')
|
filters['frequency_threshold'] = config.getfloat('settings', 'frequency_threshold', fallback=0)
|
||||||
filters['lines_threshold'] = config.getint('settings', 'lines_threshold')
|
filters['lines_threshold'] = config.getint('settings', 'lines_threshold', fallback=0)
|
||||||
filters['print_root'] = config.getboolean('settings', 'print_root')
|
filters['print_root'] = config.getboolean('settings', 'print_root')
|
||||||
|
|
||||||
|
|
||||||
|
@ -571,7 +571,7 @@ def main():
|
||||||
if filters['print_root']:
|
if filters['print_root']:
|
||||||
header += ['Root node']
|
header += ['Root node']
|
||||||
if filters['association_measures']:
|
if filters['association_measures']:
|
||||||
header += ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
|
header += ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
|
||||||
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
|
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Absolute frequency']
|
||||||
writer.writerow(header)
|
writer.writerow(header)
|
||||||
|
|
||||||
|
@ -581,13 +581,13 @@ def main():
|
||||||
# body
|
# body
|
||||||
for k, v in sorted_list:
|
for k, v in sorted_list:
|
||||||
v['object'].get_array()
|
v['object'].get_array()
|
||||||
absolute_frequency = v['number'] * 1000000.0 / corpus_size
|
relative_frequency = v['number'] * 1000000.0 / corpus_size
|
||||||
if filters['frequency_threshold'] and filters['frequency_threshold'] > absolute_frequency:
|
if filters['frequency_threshold'] and filters['frequency_threshold'] > v['number']:
|
||||||
break
|
break
|
||||||
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
|
words_only = [word_att for word in v['object'].array for word_att in word] + ['' for i in range((tree_size_range[-1] - len(v['object'].array)) * len(v['object'].array[0]))]
|
||||||
# words_only = printable_answers(k)
|
# words_only = printable_answers(k)
|
||||||
row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])]
|
row = [v['object'].get_key()[1:-1]] + words_only + [str(v['number'])]
|
||||||
row += ['%.4f' % absolute_frequency]
|
row += ['%.4f' % relative_frequency]
|
||||||
if filters['node_order']:
|
if filters['node_order']:
|
||||||
row += [v['object'].order]
|
row += [v['object'].order]
|
||||||
row += [v['object'].get_key_sorted()[1:-1]]
|
row += [v['object'].get_key_sorted()[1:-1]]
|
||||||
|
|
|
@ -76,12 +76,11 @@ def get_collocabilities(ngram, unigrams_dict, corpus_size):
|
||||||
O = ngram['number']
|
O = ngram['number']
|
||||||
E = mul_fwi / pow(N, n-1)
|
E = mul_fwi / pow(N, n-1)
|
||||||
|
|
||||||
# ['MI', 'MI3', 'Dice', 't-score', 'simple-LL']
|
# ['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']
|
||||||
# mi = Math.log(O / E) / Math.log(2);
|
|
||||||
mi = math.log(O / E, 2)
|
mi = math.log(O / E, 2)
|
||||||
# Math.log(Math.pow(O, 3.0) / E) / Math.log(2);
|
|
||||||
mi3 = math.log(pow(O, 3) / E, 2)
|
mi3 = math.log(pow(O, 3) / E, 2)
|
||||||
dice = n * O / sum_fwi
|
dice = n * O / sum_fwi
|
||||||
|
logdice = 14 + math.log(dice, 2)
|
||||||
tscore = (O - E) / math.sqrt(O)
|
tscore = (O - E) / math.sqrt(O)
|
||||||
simplell = 2 * (O * math.log10(O / E) - (O - E))
|
simplell = 2 * (O * math.log10(O / E) - (O - E))
|
||||||
return ['%.4f' % mi, '%.4f' % mi3, '%.4f' % dice, '%.4f' % tscore, '%.4f' % simplell]
|
return [('%.4f; N=%.4f, n=%.4f; O=%.4f, E=%.4f, sum_fwi=%.4f, mul_fwi=%.4f' % (mi, N, n, O, E, sum_fwi, mul_fwi)), '%.4f' % mi3, '%.4f' % dice, '%.4f' % logdice, '%.4f' % tscore, '%.4f' % simplell]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user