STARK/generic.py

101 lines
3.3 KiB
Python

# Copyright 2019 CJVT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
def create_output_string_form(tree):
    """Return the value held by the ``form`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    form_field = tree.form
    return form_field.get_value()
def create_output_string_deprel(tree):
    """Return the value held by the ``deprel`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    deprel_field = tree.deprel
    return deprel_field.get_value()
def create_output_string_lemma(tree):
    """Return the value held by the ``lemma`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    lemma_field = tree.lemma
    return lemma_field.get_value()
def create_output_string_upos(tree):
    """Return the value held by the ``upos`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    upos_field = tree.upos
    return upos_field.get_value()
def create_output_string_xpos(tree):
    """Return the value held by the ``xpos`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    xpos_field = tree.xpos
    return xpos_field.get_value()
def create_output_string_feats(tree):
    """Return the value held by the ``feats`` attribute of *tree*.

    The attribute is expected to expose a ``get_value()`` method; whatever
    that method returns is passed back unchanged.
    """
    feats_field = tree.feats
    return feats_field.get_value()
def generate_key(node, create_output_strings, print_lemma=True):
    """Build the nested value array and the string key for a single node.

    Args:
        node: Object handed to every extractor in ``create_output_strings``.
        create_output_strings: Sequence of callables; each is called with
            ``node`` and contributes one component.
        print_lemma: When truthy and ``create_output_string_lemma`` is among
            the extractors, the lemma component of the key is prefixed
            with ``'L='``.

    Returns:
        A ``(array, key)`` pair. ``array`` is a one-element list wrapping the
        list of raw extracted values (never prefixed). ``key`` is the
        components joined with ``'&'`` when there is more than one, otherwise
        the single component itself.

    Raises:
        IndexError: If ``create_output_strings`` is empty.
    """
    values = [extract(node) for extract in create_output_strings]
    array = [values]

    labelled = create_output_string_lemma in create_output_strings and print_lemma
    if labelled:
        # The extractors are invoked a second time here; only the lemma
        # component receives the 'L=' marker.
        components = []
        for extract in create_output_strings:
            if extract == create_output_string_lemma:
                components.append('L=' + extract(node))
            else:
                components.append(extract(node))
    else:
        components = values

    if len(values) > 1:
        return array, '&'.join(components)
    # A single component is returned as-is, without going through join.
    return array, components[0]
def generate_name(node, create_output_strings, print_lemma=True):
    """Build the flat value array and the display name for a single node.

    Args:
        node: Object handed to every extractor in ``create_output_strings``.
        create_output_strings: Sequence of callables; each is called with
            ``node`` and contributes one component.
        print_lemma: When truthy and ``create_output_string_lemma`` is among
            the extractors, the lemma component of the name is prefixed
            with ``'L='``.

    Returns:
        A ``(array, name)`` pair. ``array`` is the flat list of raw extracted
        values (never prefixed). ``name`` is the components joined with
        ``'&'`` when there is more than one, otherwise the single component
        itself.

    Raises:
        IndexError: If ``create_output_strings`` is empty.
    """
    array = [extract(node) for extract in create_output_strings]

    labelled = create_output_string_lemma in create_output_strings and print_lemma
    if labelled:
        # The extractors are invoked a second time here; only the lemma
        # component receives the 'L=' marker.
        components = []
        for extract in create_output_strings:
            if extract == create_output_string_lemma:
                components.append('L=' + extract(node))
            else:
                components.append(extract(node))
    else:
        components = array

    if len(array) > 1:
        return array, '&'.join(components)
    # A single component is returned as-is, without going through join.
    return array, components[0]
def get_collocabilities(ngram, unigrams_dict, corpus_size):
    """Compute association scores for one n-gram.

    Args:
        ngram: Mapping with an ``'object'`` entry whose ``array`` attribute
            lists, per n-gram member, the components of that member's
            unigram key, and a ``'number'`` entry holding the observed
            n-gram frequency.
        unigrams_dict: Mapping from unigram key to its frequency. Components
            of a multi-part key are joined with ``'&'``.
        corpus_size: Number of all words, used to derive the expected
            frequency.

    Returns:
        Six strings formatted with ``'%.4f'``, in the order
        ``['MI', 'MI3', 'Dice', 'logDice', 't-score', 'simple-LL']``.

    Raises:
        KeyError: If a member's key is missing from ``unigrams_dict``.
    """
    members = ngram['object'].array

    freq_sum = 0.0
    freq_product = 1.0
    for parts in members:
        # Rebuild the unigram key: joined components, or the lone component.
        unigram_key = '&'.join(parts) if len(parts) > 1 else parts[0]
        unigram_freq = unigrams_dict[unigram_key]
        freq_sum += unigram_freq
        freq_product *= unigram_freq

    # NOTE(review): guard kept from the original; assumed to run once after
    # the loop (source indentation was lost) -- confirm placement.
    if freq_product < 0:
        freq_product = sys.maxsize

    size = len(members)           # n of the n-gram
    observed = ngram['number']    # O
    expected = freq_product / pow(corpus_size, size - 1)  # E

    ratio = observed / expected
    mi = math.log(ratio, 2)
    mi3 = math.log(pow(observed, 3) / expected, 2)
    dice = size * observed / freq_sum
    logdice = 14 + math.log(dice, 2)
    tscore = (observed - expected) / math.sqrt(observed)
    # NOTE(review): base-10 logarithm is what the original uses here --
    # confirm this is the intended simple-LL variant.
    simplell = 2 * (observed * math.log10(ratio) - (observed - expected))

    scores = (mi, mi3, dice, logdice, tscore, simplell)
    return ['%.4f' % score for score in scores]