Added creation of inputs per vowels

This commit is contained in:
lkrsnik 2017-06-23 11:49:21 +02:00
parent 2c16f10e13
commit dfe4b9a362
2 changed files with 236 additions and 52 deletions

View File

@ -2,7 +2,9 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment=""> <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/theano_tutorial/logistic_regression_loop.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" afterPath="$PROJECT_DIR$/character_based_ffnn/character_based_ffnn_keras.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" /> <option name="TRACKING_ENABLED" value="true" />
@ -17,10 +19,10 @@
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" /> <component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file leaf-file-name="test.py" pinned="false" current-in-tab="true"> <file leaf-file-name="test.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368"> <state relative-caret-position="1332">
<caret line="76" column="9" lean-forward="false" selection-start-line="76" selection-start-column="9" selection-end-line="76" selection-end-column="9" /> <caret line="76" column="9" lean-forward="false" selection-start-line="76" selection-start-column="9" selection-end-line="76" selection-end-column="9" />
<folding> <folding>
<element signature="e#0#18#0" expanded="true" /> <element signature="e#0#18#0" expanded="true" />
@ -29,6 +31,18 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="334">
<caret line="76" column="44" lean-forward="false" selection-start-line="76" selection-start-column="23" selection-end-line="76" selection-end-column="44" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="logistic_regression.py" pinned="false" current-in-tab="false"> <file leaf-file-name="logistic_regression.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
@ -111,6 +125,10 @@
<find>nearly_zeros</find> <find>nearly_zeros</find>
<find>rand</find> <find>rand</find>
<find>u</find> <find>u</find>
<find>shuffle_inputs</find>
<find>num_all_vowels</find>
<find>create_and_save_inputs</find>
<find>load_shuffle_vector</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -125,6 +143,7 @@
<option value="$PROJECT_DIR$/theano_tutorial/tutorial_loop.py" /> <option value="$PROJECT_DIR$/theano_tutorial/tutorial_loop.py" />
<option value="$PROJECT_DIR$/theano_tutorial/logistic_regression.py" /> <option value="$PROJECT_DIR$/theano_tutorial/logistic_regression.py" />
<option value="$PROJECT_DIR$/theano_tutorial/logistic_regression_loop.py" /> <option value="$PROJECT_DIR$/theano_tutorial/logistic_regression_loop.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
</list> </list>
</option> </option>
</component> </component>
@ -149,8 +168,53 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="ProjectPane">
<subPane>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="character_based_ffnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="character_based_ffnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="internal_representations" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
<pane id="Scratches" /> <pane id="Scratches" />
<pane id="ProjectPane" />
<pane id="Scope" /> <pane id="Scope" />
</panes> </panes>
</component> </component>
@ -414,9 +478,9 @@
</component> </component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="65" y="24" width="1855" height="1056" extended-state="6" /> <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
<editor active="false" /> <editor active="true" />
<layout> <layout>
<window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" /> <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" /> <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" /> <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" /> <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" />
@ -468,16 +532,6 @@
<watches-manager /> <watches-manager />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_loop.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#0#13#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_conditions.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_conditions.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18"> <state relative-caret-position="18">
@ -940,7 +994,7 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/theano_tutorial/test.py"> <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1368"> <state relative-caret-position="1332">
<caret line="76" column="9" lean-forward="false" selection-start-line="76" selection-start-column="9" selection-end-line="76" selection-end-column="9" /> <caret line="76" column="9" lean-forward="false" selection-start-line="76" selection-start-column="9" selection-end-line="76" selection-end-column="9" />
<folding> <folding>
<element signature="e#0#18#0" expanded="true" /> <element signature="e#0#18#0" expanded="true" />
@ -948,5 +1002,15 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="334">
<caret line="76" column="44" lean-forward="false" selection-start-line="76" selection-start-column="23" selection-end-line="76" selection-end-column="44" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component> </component>
</project> </project>

View File

@ -5,6 +5,7 @@ from __future__ import unicode_literals
import numpy as np import numpy as np
import h5py import h5py
import gc import gc
import StringIO
def save_inputs(file_name, X, y): def save_inputs(file_name, X, y):
h5f = h5py.File(file_name, 'w') h5f = h5py.File(file_name, 'w')
@ -13,14 +14,29 @@ def save_inputs(file_name, X, y):
h5f.create_dataset(k,data=v) h5f.create_dataset(k,data=v)
h5f.close() h5f.close()
def create_and_save_inputs(file_name): def create_and_save_inputs(file_name, part, X, y, X_pure):
X, y, X_pure = generate_full_vowel_matrix_inputs() # X, y, X_pure = generate_full_vowel_matrix_inputs()
h5f = h5py.File(file_name, 'w') h5f = h5py.File(file_name + part + '.h5', 'w')
adict=dict(X=X, y=y, X_pure=X_pure) adict=dict(X=X, y=y, X_pure=X_pure)
for k,v in adict.items(): for k,v in adict.items():
h5f.create_dataset(k,data=v) h5f.create_dataset(k,data=v)
h5f.close() h5f.close()
def create_and_save_shuffle_vector(file_name, shuffle_vector):
    """Write ``shuffle_vector`` to ``<file_name>_shuffle_vector.h5``.

    The vector is stored as a single HDF5 dataset named ``shuffle_vector``.
    The file is opened in ``'w'`` mode, so an existing file at that path is
    replaced.

    :param file_name: path prefix; ``'_shuffle_vector.h5'`` is appended to it.
    :param shuffle_vector: data handed to ``create_dataset`` unchanged.
    """
    # The context manager closes the file even when writing the dataset fails,
    # so no handle is left open on a half-written file.
    with h5py.File(file_name + '_shuffle_vector.h5', 'w') as h5f:
        h5f.create_dataset('shuffle_vector', data=shuffle_vector)
def load_shuffle_vector(file_name):
    """Load the complete ``shuffle_vector`` dataset from an HDF5 file.

    :param file_name: path of the HDF5 file to read.
    :return: the full contents of the ``shuffle_vector`` dataset, read into
        memory.
    """
    # Read the whole dataset: shuffle_full_vowel_inputs indexes the returned
    # vector once per input row, so a handful of fixed entries is not enough.
    # The context manager closes the file even if the read raises.
    with h5py.File(file_name, 'r') as h5f:
        return h5f['shuffle_vector'][:]
def load_inputs(file_name): def load_inputs(file_name):
h5f = h5py.File(file_name,'r') h5f = h5py.File(file_name,'r')
X = h5f['X'][:] X = h5f['X'][:]
@ -29,6 +45,15 @@ def load_inputs(file_name):
h5f.close() h5f.close()
return X, y return X, y
def load_extended_inputs(file_name):
    """Load the ``X``, ``y`` and ``X_pure`` datasets from an HDF5 file.

    :param file_name: path of the HDF5 file to read.
    :return: tuple ``(X, y, X_pure)``, each dataset read fully into memory.
    :raises KeyError: if one of the three datasets is missing from the file.
    """
    # The context manager guarantees the file is closed even when one of the
    # dataset lookups fails part-way through.
    with h5py.File(file_name, 'r') as h5f:
        X = h5f['X'][:]
        y = h5f['y'][:]
        X_pure = h5f['X_pure'][:]
    return X, y, X_pure
def save_model(model, file_name): def save_model(model, file_name):
h5f = h5py.File(file_name, 'w') h5f = h5py.File(file_name, 'w')
adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2']) adict=dict(W1=model['W1'], b1=model['b1'], W2=model['W2'], b2=model['b2'])
@ -49,7 +74,7 @@ def load_model(file_name):
def read_content(): def read_content():
print('READING CONTENT...') print('READING CONTENT...')
with open('../../data/SlovarIJS_BESEDE_utf8.lex') as f: with open('../data/SlovarIJS_BESEDE_utf8.lex') as f:
content = f.readlines() content = f.readlines()
print('CONTENT READ SUCCESSFULY') print('CONTENT READ SUCCESSFULY')
return [x.decode('utf8').split('\t') for x in content] return [x.decode('utf8').split('\t') for x in content]
@ -62,6 +87,11 @@ def is_vowel(word_list, position, vowels):
return True return True
return False return False
def is_accetuated_vowel(word_list, position, accetuated_vowels):
    """Tell whether the item at ``position`` is one of ``accetuated_vowels``.

    :param word_list: indexable sequence (the callers pass a list of characters).
    :param position: index into ``word_list``.
    :param accetuated_vowels: container tested with ``in``.
    :return: ``True`` if ``word_list[position]`` is in ``accetuated_vowels``,
        otherwise ``False``.
    :raises IndexError: if ``position`` is outside ``word_list``.
    """
    return word_list[position] in accetuated_vowels
def create_dict(): def create_dict():
content = read_content() content = read_content()
@ -271,67 +301,146 @@ def count_vowels(content, vowels):
return num_all_vowels return num_all_vowels
def generate_full_vowel_matrix_inputs(): def generate_full_vowel_matrix_inputs(name, split_number):
h5f = h5py.File(name + '.h5', 'w')
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
num_all_vowels = count_vowels(content, vowels)
data_X = h5f.create_dataset('X', (num_all_vowels, max_word, len(dictionary)),
maxshape=(num_all_vowels, max_word, len(dictionary)),
dtype=np.uint8)
data_y = h5f.create_dataset('y', (num_all_vowels,),
maxshape=(num_all_vowels,),
dtype=np.uint8)
data_X_pure = h5f.create_dataset('X_pure', (num_all_vowels,),
maxshape=(num_all_vowels,),
dtype=np.uint8)
gc.collect() gc.collect()
# print (2018553 * max_word * len(dictionary) / (2**30.0)) # print (2018553 * max_word * len(dictionary) / (2**30.0))
print('GENERATING X AND y...') print('GENERATING X AND y...')
# X = np.zeros((len(content), max_word*len(dictionary))) # X = np.zeros((len(content), max_word*len(dictionary)))
y = np.zeros((len(content), max_num_vowels * max_num_vowels )) # y = np.zeros((len(content), max_num_vowels * max_num_vowels))
# X = np.zeros((2018553, max_word, len(dictionary))) # X = np.zeros((2018553, max_word, len(dictionary)))
X_pure = [] X_pure = []
X = [] X = []
y = []
part_len = len(content)/float(split_number)
current_part_generation = 1
i = 0 i = 0
num_all_vowels = 0
old_num_all_vowels = 0
for el in content: for el in content:
j = 0 j = 0
# word = []
X_el = np.zeros((max_word, len(dictionary))) X_el = np.zeros((max_word, len(dictionary)))
for c in list(el[0]): for c in list(el[0]):
index = 0 index = 0
# character = np.zeros(len(dictionary))
for d in dictionary: for d in dictionary:
if c == d: if c == d:
X_el[j][index] = 1 X_el[j][index] = 1
# character[index] = 1
break break
index += 1 index += 1
# word.append(character)
j += 1 j += 1
# for c in list(el[0]):
vowel_i = 0 vowel_i = 0
for m in range(len(el[0])): for m in range(len(el[0])):
if is_vowel(list(el[0]), m, vowels): if is_vowel(list(el[0]), m, vowels):
X.append(X_el) X.append(X_el)
X_pure.append(vowel_i) X_pure.append(vowel_i)
vowel_i += 1 vowel_i += 1
if is_accetuated_vowel(list(el[3]), m, accetuated_vowels):
y.append(1)
else:
y.append(0)
if current_part_generation * part_len <= i:
print('Saving part '+ str(current_part_generation))
# create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
# adict = dict(X=np.array(X), y=np.zeros(len(X)), X_pure=np.array(X_pure))
# for k, v in adict.items():
# h5f.create_dataset(k, data=v)
# print (len(np.array(X)))
data_X[old_num_all_vowels:num_all_vowels + 1] = np.array(X)
data_y[old_num_all_vowels:num_all_vowels + 1] = np.array(y)
data_X_pure[old_num_all_vowels:num_all_vowels + 1] = np.array(X_pure)
old_num_all_vowels = num_all_vowels + 1
X_pure = []
X = []
y = []
current_part_generation += 1
num_all_vowels += 1
if i%10000 == 0:
print i
# text_file.write("Purchase Amount: %s" % TotalAmount)
j = 0 j = 0
# X.append(word) # X.append(word)
word_accetuations = [] # word_accetuations = []
num_vowels = 0 # num_vowels = 0
for c in list(el[3]): # for c in list(el[3]):
index = 0 # index = 0
if is_vowel(el[3], j, vowels): # if is_vowel(el[3], j, vowels):
num_vowels += 1 # num_vowels += 1
for d in accetuated_vowels: # for d in accetuated_vowels:
if c == d: # if c == d:
word_accetuations.append(num_vowels) # word_accetuations.append(num_vowels)
break # break
index += 1 # index += 1
j += 1 # j += 1
y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1 # y[i][generate_presentable_y(word_accetuations, list(el[3]), max_num_vowels)] = 1
i += 1 i += 1
# print(len(X))
# del X_pure
# del dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
X = np.array(X) print('Saving part ' + str(current_part_generation))
X_pure = np.array(X_pure) # create_and_save_inputs(name, str(current_part_generation), np.array(X), np.zeros(len(X)), np.array(X_pure))
print('GENERATION SUCCESSFUL!')
print('SHUFFELING INPUTS...') data_X[old_num_all_vowels:num_all_vowels] = np.array(X)
X, y, X_pure = shuffle_inputs(X, y, X_pure) data_y[old_num_all_vowels:num_all_vowels] = np.array(y)
print('INPUTS SHUFFELED!') data_X_pure[old_num_all_vowels:num_all_vowels] = np.array(X_pure)
return X, y, X_pure
# adict = dict(X=X, y=y, X_pure=X_pure)
# for k, v in adict.items():
# h5f.create_dataset(k, data=v)
h5f.close()
def shuffle_full_vowel_inputs(name, orderd_name, parts):
    """Build shuffled input parts from ordered part files and save them.

    For each output part ``h`` the function allocates zero-filled ``new_X``,
    ``new_y`` and ``new_X_pure`` arrays, loads ordered inputs with
    ``load_extended_inputs``, copies every row whose shuffle target ``s[j]``
    lies in the current ``section_range`` to index ``s[j]`` of the new arrays,
    and writes the result with ``create_and_save_inputs(name, str(h), ...)``.

    :param name: file-name prefix passed to ``create_and_save_inputs``.
    :param orderd_name: file-name prefix of the ordered input part files.
    :param parts: number of parts; used for the section size and loop bounds.
    """
    # internal_representations/inputs/X_ordered_part
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
    num_all_vowels = count_vowels(content, vowels)
    # The three commented-out lines below would generate and store a new
    # shuffle vector; the active code loads a previously saved one instead.
    # s = np.arange(num_all_vowels)
    # np.random.shuffle(s)
    # create_and_save_shuffle_vector(name, s)
    # NOTE(review): the path is hard-coded rather than derived from `name` - confirm.
    s = load_shuffle_vector('internal_representations/inputs/X_shuffled_part_shuffle_vector.h5')
    print('Shuffled vector loaded!')

    # [start, end) window of shuffled target positions handled in the current pass.
    # NOTE(review): relies on integer `/` (Python 2 semantics) to yield an int size.
    section_range = [0, (num_all_vowels + 1)/parts]
    # NOTE(review): h starts at 3 while section_range starts at 0, so the first
    # window is saved as part "3" - looks like a resume/debug leftover; confirm.
    for h in range(3, parts+1):
        gc.collect()
        # NOTE(review): arrays are sized by the window END (section_range[1]),
        # not by the window length, and rows are stored at absolute index s[j].
        new_X = np.zeros((section_range[1], max_word, len(dictionary)))
        new_X_pure = np.zeros(section_range[1])
        new_y = np.zeros(section_range[1])
        for i in range(1, parts+1):
            # NOTE(review): the file name uses `parts`, not the loop variable `i`,
            # so the same file is loaded on every iteration - confirm intent.
            X, y, X_pure = load_extended_inputs(orderd_name + str(parts) + '.h5')
            for j in range(X.shape[0]):
                # NOTE(review): `s` is indexed by j, the row index inside the
                # loaded part; no offset for earlier parts is applied - confirm.
                if s[j] >= section_range[0] and s[j] < section_range[1]:
                    new_X[s[j]] = X[j]
                    new_y[s[j]] = y[j]
                    new_X_pure[s[j]] = X_pure[j]
        print('CREATED ' + str(h) + '. PART OF SHUFFLED MATRIX')
        create_and_save_inputs(name, str(h), new_X, new_y, new_X_pure)
        # Advance the window; the end is clamped to num_all_vowels on the last step.
        section_range[0] = section_range[1]
        if section_range[1] + (num_all_vowels + 1)/parts < num_all_vowels:
            section_range[1] += (num_all_vowels + 1)/parts
        else:
            section_range[1] = num_all_vowels
def decode_position(y, max_num_vowels): def decode_position(y, max_num_vowels):
@ -345,6 +454,17 @@ def decode_position(y, max_num_vowels):
i += 1 i += 1
return [pos % max_num_vowels, pos / max_num_vowels] return [pos % max_num_vowels, pos / max_num_vowels]
def decode_input(word_encoded, dictionary):
    """Turn a one-hot encoded word back into a string.

    Each row of ``word_encoded`` is scanned left to right; the first position
    holding ``1`` selects the character ``dictionary[position]``. Rows without
    any ``1`` contribute nothing to the result.

    :param word_encoded: iterable of rows, each an iterable of numbers.
    :param dictionary: indexable sequence of characters.
    :return: the decoded characters concatenated in row order.
    """
    chars = []
    for row in word_encoded:
        for index, value in enumerate(row):
            if value == 1:
                chars.append(dictionary[index])
                # Only the first hot position of a row is decoded.
                break
    # Join once instead of growing a string inside the loop.
    return ''.join(chars)
def decode_position_from_number(y, max_num_vowels): def decode_position_from_number(y, max_num_vowels):
return [y % max_num_vowels, y / max_num_vowels] return [y % max_num_vowels, y / max_num_vowels]