Added functionality for other corpora
This commit is contained in:
parent
05a51673b3
commit
471b333df9
2
Tree.py
2
Tree.py
|
@ -32,6 +32,8 @@ class Tree(object):
|
||||||
feats_dict[feats] = Value(feats)
|
feats_dict[feats] = Value(feats)
|
||||||
self.feats = feats_dict[feats]
|
self.feats = feats_dict[feats]
|
||||||
for feat in feats_detailed.keys():
|
for feat in feats_detailed.keys():
|
||||||
|
if feat not in feats_detailed_dict:
|
||||||
|
feats_detailed_dict[feat] = {}
|
||||||
if next(iter(feats_detailed[feat])) not in feats_detailed_dict[feat]:
|
if next(iter(feats_detailed[feat])) not in feats_detailed_dict[feat]:
|
||||||
feats_detailed_dict[feat][next(iter(feats_detailed[feat]))] = Value(next(iter(feats_detailed[feat])))
|
feats_detailed_dict[feat][next(iter(feats_detailed[feat]))] = Value(next(iter(feats_detailed[feat])))
|
||||||
if not feat in self.feats_detailed:
|
if not feat in self.feats_detailed:
|
||||||
|
|
|
@ -15,24 +15,26 @@ import pyconll
|
||||||
from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
|
from Tree import Tree, create_output_string_form, create_output_string_deprel, create_output_string_lemma, create_output_string_upos, create_output_string_xpos, create_output_string_feats
|
||||||
|
|
||||||
# for separate searches of feats
|
# for separate searches of feats
|
||||||
feats_detailed_list = [
|
# feats_detailed_list = [
|
||||||
# lexical features
|
# # lexical features
|
||||||
'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
|
# 'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr',
|
||||||
|
#
|
||||||
|
# # Inflectional features (nominal)
|
||||||
|
# 'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
|
||||||
|
#
|
||||||
|
# # Inflectional features (verbal)
|
||||||
|
# 'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
|
||||||
|
#
|
||||||
|
# # Other
|
||||||
|
# 'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
|
||||||
|
# ]
|
||||||
|
|
||||||
# Inflectional features (nominal)
|
# feats_detailed_list = []
|
||||||
'Gender', 'Animacy', 'NounClass', 'Number', 'Case', 'Definite', 'Degree',
|
|
||||||
|
|
||||||
# Inflectional features (verbal)
|
# feats_detailed_dict = {key: {} for key in feats_detailed_list}
|
||||||
'VerbForm', 'Mood', 'Tense', 'Aspect', 'Voice', 'Evident', 'Polarity', 'Person', 'Polite', 'Clusivity',
|
|
||||||
|
|
||||||
# Other
|
|
||||||
'Variant', 'Number[psor]', 'Gender[psor]', 'NumForm'
|
|
||||||
]
|
|
||||||
|
|
||||||
feats_detailed_dict = {key: {} for key in feats_detailed_list}
|
|
||||||
|
|
||||||
|
|
||||||
def decode_query(orig_query, dependency_type):
|
def decode_query(orig_query, dependency_type, feats_detailed_list):
|
||||||
new_query = False
|
new_query = False
|
||||||
|
|
||||||
# if the command is wrapped in parentheses, remove them and treat the command as a new query
|
# if the command is wrapped in parentheses, remove them and treat the command as a new query
|
||||||
|
@ -129,6 +131,8 @@ def create_trees(config):
|
||||||
form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict = {}, {}, {}, {}, {}, {}
|
form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, feats_dict = {}, {}, {}, {}, {}, {}
|
||||||
|
|
||||||
all_trees = []
|
all_trees = []
|
||||||
|
corpus_size = 0
|
||||||
|
feats_detailed_dict = {}
|
||||||
|
|
||||||
for sentence in train:
|
for sentence in train:
|
||||||
root = None
|
root = None
|
||||||
|
@ -149,7 +153,8 @@ def create_trees(config):
|
||||||
token_nodes.append(node)
|
token_nodes.append(node)
|
||||||
if token.deprel == 'root':
|
if token.deprel == 'root':
|
||||||
root = node
|
root = node
|
||||||
root_id = int(token.id)
|
|
||||||
|
corpus_size += 1
|
||||||
|
|
||||||
for token_id, token in enumerate(token_nodes):
|
for token_id, token in enumerate(token_nodes):
|
||||||
if int(token.parent) == 0:
|
if int(token.parent) == 0:
|
||||||
|
@ -177,14 +182,14 @@ def create_trees(config):
|
||||||
|
|
||||||
|
|
||||||
with open(trees_read_outputfile, 'wb') as output:
|
with open(trees_read_outputfile, 'wb') as output:
|
||||||
pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict), output)
|
pickle.dump((all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict), output)
|
||||||
else:
|
else:
|
||||||
print('Reading trees:')
|
print('Reading trees:')
|
||||||
print('Completed')
|
print('Completed')
|
||||||
with open(trees_read_outputfile, 'rb') as pkl_file:
|
with open(trees_read_outputfile, 'rb') as pkl_file:
|
||||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = pickle.load(pkl_file)
|
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict) = pickle.load(pkl_file)
|
||||||
|
|
||||||
return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict
|
return all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size, feats_detailed_dict
|
||||||
|
|
||||||
|
|
||||||
# def order_independent_queries(query_tree):
|
# def order_independent_queries(query_tree):
|
||||||
|
@ -339,6 +344,14 @@ def main():
|
||||||
# create queries
|
# create queries
|
||||||
tree_size = 0
|
tree_size = 0
|
||||||
|
|
||||||
|
# 261 - 9 grams
|
||||||
|
# 647 - 10 grams
|
||||||
|
# 1622 - 11 grams
|
||||||
|
# 4126 - 12 grams
|
||||||
|
# 10598 - 13 grams
|
||||||
|
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict, corpus_size,
|
||||||
|
feats_detailed_list) = create_trees(config)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# if config.getint('settings', 'tree_size') == 2:
|
# if config.getint('settings', 'tree_size') == 2:
|
||||||
|
@ -366,16 +379,9 @@ def main():
|
||||||
for i in range(tree_size_range[0], tree_size_range[1] + 1):
|
for i in range(tree_size_range[0], tree_size_range[1] + 1):
|
||||||
query_tree.extend(create_ngrams_query_trees(i, [{}]))
|
query_tree.extend(create_ngrams_query_trees(i, [{}]))
|
||||||
else:
|
else:
|
||||||
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '')]
|
query_tree = [decode_query('(' + config.get('settings', 'query') + ')', '', feats_detailed_list)]
|
||||||
# order_independent_queries(query_tree)
|
# order_independent_queries(query_tree)
|
||||||
|
|
||||||
# 261 - 9 grams
|
|
||||||
# 647 - 10 grams
|
|
||||||
# 1622 - 11 grams
|
|
||||||
# 4126 - 12 grams
|
|
||||||
# 10598 - 13 grams
|
|
||||||
(all_trees, form_dict, lemma_dict, upos_dict, xpos_dict, deprel_dict) = create_trees(config)
|
|
||||||
|
|
||||||
|
|
||||||
# set filters
|
# set filters
|
||||||
assert config.get('settings', 'node_type') in ['deprel', 'lemma', 'upos', 'xpos', 'form', 'feats'], '"node_type" is not set up correctly'
|
assert config.get('settings', 'node_type') in ['deprel', 'lemma', 'upos', 'xpos', 'form', 'feats'], '"node_type" is not set up correctly'
|
||||||
|
@ -507,6 +513,10 @@ def main():
|
||||||
else:
|
else:
|
||||||
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
|
len_words = int(len(config.get('settings', 'query').split(" "))/2 + 1)
|
||||||
header = ["Structure"] + ["Word #" + str(i) for i in range(1, len_words + 1)] + ['Number of occurences']
|
header = ["Structure"] + ["Word #" + str(i) for i in range(1, len_words + 1)] + ['Number of occurences']
|
||||||
|
if config.get('settings', 'relative_number'):
|
||||||
|
header += ['Relative frequency']
|
||||||
|
if config.get('settings', 'nodes_number'):
|
||||||
|
header += ['Nodes number']
|
||||||
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences']
|
# header = [" ".join(words[i:i + span]) for i in range(0, len(words), span)] + ['Number of occurences']
|
||||||
writer.writerow(header)
|
writer.writerow(header)
|
||||||
|
|
||||||
|
@ -514,7 +524,13 @@ def main():
|
||||||
for k, v in sorted_list:
|
for k, v in sorted_list:
|
||||||
words_only = v['object'].array + ['' for i in range(tree_size_range[-1] - len(v['object'].array))]
|
words_only = v['object'].array + ['' for i in range(tree_size_range[-1] - len(v['object'].array))]
|
||||||
# words_only = printable_answers(k)
|
# words_only = printable_answers(k)
|
||||||
writer.writerow([k] + words_only + [str(v['number'])])
|
row = [k] + words_only + [str(v['number'])]
|
||||||
|
if config.get('settings', 'relative_number'):
|
||||||
|
row += ['%.4f' % (v['number'] * 1000000.0 / corpus_size)]
|
||||||
|
if config.get('settings', 'nodes_number'):
|
||||||
|
row += ['%d' % len(v['object'].array)]
|
||||||
|
|
||||||
|
writer.writerow(row)
|
||||||
|
|
||||||
return "Done"
|
return "Done"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user