Added some runnable applications of this model

Luka 2018-04-27 11:15:37 +02:00
parent 4175c45ceb
commit 048825648a
16 changed files with 329 additions and 1389 deletions

@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.5.2 (~/miniconda3/bin/python)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>

@ -1,9 +0,0 @@
<component name="ProjectDictionaryState">
  <dictionary name="luka">
    <words>
      <w>accentuations</w>
      <w>nonresonant</w>
      <w>overfitting</w>
    </words>
  </dictionary>
</component>

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>

@ -1,22 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (~/miniconda3/bin/python)" project-jdk-type="Python SDK" />
  <component name="SvnConfiguration">
    <configuration>$USER_HOME$/.subversion</configuration>
  </component>
  <component name="masterDetails">
    <states>
      <state key="ScopeChooserConfigurable.UI">
        <settings>
          <splitter-proportions>
            <option name="proportions">
              <list>
                <option value="0.2" />
              </list>
            </option>
          </splitter-proportions>
        </settings>
      </state>
    </states>
  </component>
</project>

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/accetuation.iml" filepath="$PROJECT_DIR$/.idea/accetuation.iml" />
    </modules>
  </component>
</project>

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

File diff suppressed because it is too large

accentuate.py Normal file

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pickle
import numpy as np
from keras.models import load_model
import sys
from prepare_data import *

# obtain file locations from command-line arguments
if len(sys.argv) < 3:
    print('Please provide arguments for this script to work. The first argument should be the location of the file with unaccented words and '
          'morphological data, and the second the name of the file you would like the results saved to. Example: python accentuate.py '
          '\'test_data/unaccented_dictionary\' \'test_data/accented_data\'')
    raise Exception('Missing command-line arguments.')
read_location = sys.argv[1]
write_location = sys.argv[2]

# get environment variables necessary for calculations
pickle_input = open('preprocessed_data/environment.pkl', 'rb')
environment = pickle.load(pickle_input)
dictionary = environment['dictionary']
max_word = environment['max_word']
max_num_vowels = environment['max_num_vowels']
vowels = environment['vowels']
accented_vowels = environment['accented_vowels']
feature_dictionary = environment['feature_dictionary']
syllable_dictionary = environment['syllable_dictionary']

# load models
data = Data('l', shuffle_all_inputs=False)
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_3/20_final_epoch.h5')

letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_2/20_final_epoch.h5')

letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_1/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_1/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_1/20_final_epoch.h5')

letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_0/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_0/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')

# read input data
content = data._read_content(read_location)

# format data for the accentuate_word function; it expects entries of the form [['besedišči', '', 'Ncnpi', 'besedišči'], ...]
content = [[el[0], '', el[1][:-1], el[0]] for el in content[:-1]]

# use the environment variables and models to accentuate words
data = Data('l', shuffle_all_inputs=False)
location_accented_words, accented_words = data.accentuate_word(content, letter_location_model, syllable_location_model, syllabled_letters_location_model,
                                                               letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
                                                               letter_type_model, syllable_type_model, syllabled_letter_type_model,
                                                               letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
                                                               dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)

# save accentuated words, one 'location-accented fully-accented' pair per line
with open(write_location, 'w') as f:
    for i in range(len(location_accented_words)):
        f.write(location_accented_words[i] + ' ' + accented_words[i] + '\n')
    f.write('\n')
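
For quick reference, a hedged usage sketch: the invocation below comes from the script's own help text, and the sample lines from the test_data files at the end of this commit.

# usage sketch, not part of the commit:
#   python accentuate.py 'test_data/unaccented_dictionary' 'test_data/accented_data'
# each input line holds an unaccented word and its MSD tag, e.g.
#   absolutistični Afpmsay-n
# each output line pairs the location-accented and fully accented forms:
#   absolutístični absolutístični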

@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
sys.path.insert(0, '../../../')
from prepare_data import *
import pickle
import numpy as np
# from keras import backend as Input

np.random.seed(7)

# obtain file locations from command-line arguments
if len(sys.argv) < 4:
    print('Please provide arguments for this script to work. The first argument should be the location of the file with the unaccented text, '
          'the second the name of the file you would like the results saved to, and the third the location of the ReLDI tagger. Example: python accentuate.py '
          '\'test_data/original_connected_text\' \'test_data/accented_connected_text\' \'../reldi_tagger\'')
    raise Exception('Missing command-line arguments.')
read_location = sys.argv[1]
write_location = sys.argv[2]
reldi_location = sys.argv[3]

# get environment variables necessary for calculations
pickle_input = open('preprocessed_data/environment.pkl', 'rb')
environment = pickle.load(pickle_input)
dictionary = environment['dictionary']
max_word = environment['max_word']
max_num_vowels = environment['max_num_vowels']
vowels = environment['vowels']
accented_vowels = environment['accented_vowels']
feature_dictionary = environment['feature_dictionary']
syllable_dictionary = environment['syllable_dictionary']

# load models
data = Data('l', shuffle_all_inputs=False)
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_3/20_final_epoch.h5')

letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_2/20_final_epoch.h5')

letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_1/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_1/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_1/20_final_epoch.h5')

letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_0/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_0/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')

# tag the words in the input text with the ReLDI tagger
tagged_words, original_text = data.tag_words(reldi_location, read_location)

# find accentuation locations
predictions = data.get_ensemble_location_predictions(tagged_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
                                                     letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
                                                     dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
                                                     syllable_dictionary)
location_accented_text = data.create_connected_text_locations(tagged_words, original_text, predictions, vowels)

# classify the accent types at the predicted locations and accentuate the text
location_y = np.around(predictions)
type_predictions = data.get_ensemble_type_predictions(tagged_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
                                                      letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
                                                      dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
                                                      syllable_dictionary)
accented_text = data.create_connected_text_accented(tagged_words, original_text, type_predictions, location_y, vowels, accented_vowels)

# save the accentuated text
with open(write_location, 'w') as f:
    f.write(accented_text)
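
A similar hedged sketch for this script: the invocation comes from its help text (which reuses the name accentuate.py), and the referenced test files are added below in this commit.

# usage sketch, not part of the commit:
#   python accentuate.py 'test_data/original_connected_text' 'test_data/accented_connected_text' '../reldi_tagger'
# the first argument is a plain-text file, the second the output location, and
# the third a checkout of the ReLDI tagger, which the script invokes via
# subprocess to obtain PoS tags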

learn_location_weights.py Normal file

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pickle
import sys

import numpy as np
np.random.seed(7)

from keras import optimizers
from keras.layers import Conv1D, Dense, Dropout, Flatten, Input, MaxPooling1D, concatenate
from keras.models import Model

from prepare_data import *

# preprocess data
# data = Data('l', allow_shuffle_vector_generation=True, save_generated_data=False, shuffle_all_inputs=True)
data = Data('l', save_generated_data=False, shuffle_all_inputs=True)
data.generate_data('../../internal_representations/inputs/letters_word_accentuation_train',
                   '../../internal_representations/inputs/letters_word_accentuation_test',
                   '../../internal_representations/inputs/letters_word_accentuation_validate',
                   content_location='../accetuation/data/',
                   content_name='SlovarIJS_BESEDE_utf8.lex',
                   inputs_location='../accetuation/cnn/internal_representations/inputs/',
                   content_shuffle_vector='content_shuffle_vector',
                   shuffle_vector='shuffle_vector')

# combine all data into the training set (comment out the three lines below if this is unwanted)
data.x_train = np.concatenate((data.x_train, data.x_test, data.x_validate), axis=0)
data.x_other_features_train = np.concatenate((data.x_other_features_train, data.x_other_features_test, data.x_other_features_validate), axis=0)
data.y_train = np.concatenate((data.y_train, data.y_test, data.y_validate), axis=0)

# build the neural network architecture: a 1D-convolutional branch over the
# character matrix, concatenated with the additional-features vector and fed
# through three dense layers
nn_output_dim = 10
batch_size = 16
actual_epoch = 20
num_fake_epoch = 20

conv_input_shape = (23, 36)
othr_input_shape = (140,)

conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)

othr_input = Input(shape=othr_input_shape, name='othr_input')

x = concatenate([x_conv, othr_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)

model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[actual_accuracy,])
# model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# start learning
history = model.fit_generator(data.generator('train', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex', content_location='../accetuation/data/'),
                              data.x_train.shape[0] / (batch_size * num_fake_epoch),
                              epochs=actual_epoch * num_fake_epoch,
                              validation_data=data.generator('test', batch_size),
                              validation_steps=data.x_test.shape[0] / (batch_size * num_fake_epoch))

# save the trained model and its training history
name = 'test_data/20_epoch'
model.save(name + '.h5')
output = open(name + '_history.pkl', 'wb')
pickle.dump(history.history, output)
output.close()
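
Because the model is compiled with the custom actual_accuracy metric, reloading it later requires passing custom_objects to Keras. A minimal sketch, assuming actual_accuracy is exported by prepare_data (it reaches the script above through the star import):

# sketch, not part of the commit: reload the artifacts saved above
import pickle
from keras.models import load_model
from prepare_data import actual_accuracy  # assumed to live in prepare_data

model = load_model('test_data/20_epoch.h5', custom_objects={'actual_accuracy': actual_accuracy})
with open('test_data/20_epoch_history.pkl', 'rb') as f:
    history = pickle.load(f)  # dict of per-epoch metric lists
print(sorted(history.keys()))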

@ -7,6 +7,7 @@ import h5py
import math
import keras.backend as K
import os.path
from os import remove
import codecs
from copy import copy
@ -666,7 +667,7 @@ class Data:
            loc += batch_size

    # generator for inputs used to track data fitting
    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling):
    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling=np.ones(13)):
        # NOTE: the np.ones(13) default is evaluated once, at definition time;
        # this is safe as long as the generator never mutates it in place
        size = orig_x.shape[0]
        while 1:
            loc = 0
@ -1655,6 +1656,95 @@ class Data:
        return location_accented_words, accented_words

    def tag_words(self, reldi_location, original_location):
        # generate text with every word in a new line
        with open(original_location) as f:
            original_text = f.readlines()
        original_text = ''.join(original_text)
        text_with_whitespaces = original_text.replace(',', ' ,').replace('.', ' .').replace('\n', ' ').replace('"', ' " ') \
            .replace(':', ' :').replace('ć', 'č').replace('–', '-')
        text_with_whitespaces = '\n'.join(text_with_whitespaces.split())
        text_with_whitespaces += '\n\n'
        with open('.words_with_whitespaces', 'w') as text_file:
            text_file.write(text_with_whitespaces)

        # generate text with PoS tags
        import subprocess
        myinput = open('.words_with_whitespaces', 'r')
        myoutput = open('.word_tags', 'w')
        python3_command = reldi_location + '/tagger.py sl'  # run the ReLDI tagger on the word list
        process = subprocess.run(python3_command.split(), stdin=myinput, stdout=myoutput)

        # keep only the interesting words (skip punctuation and anything containing digits)
        pointless_words = ['.', ',', '"', ':', '-']
        with open('.word_tags', 'r') as text_file:
            tagged_input_words = []
            for x in text_file.readlines()[:-1]:
                split_line = x[:-1].split('\t')
                if split_line[0] not in pointless_words and not any(char.isdigit() for char in split_line[0]):
                    tagged_input_words.append([split_line[0].lower(), '', split_line[1], split_line[0].lower()])

        remove('.words_with_whitespaces')
        remove('.word_tags')
        return tagged_input_words, original_text

    def create_connected_text_locations(self, tagged_input_words, original_text, predictions, vowels):
        if 'A' not in vowels:
            vowels.extend(['A', 'E', 'I', 'O', 'U'])
        accented_words = [self.assign_location_stress(tagged_input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1]
                          for i in range(len(tagged_input_words))]
        words_and_accetuation_loc = [[tagged_input_words[i][0], self.decode_y(predictions[i])] for i in range(len(tagged_input_words))]

        original_text_list = list(original_text)
        original_text_lowercase = original_text.lower()
        end_pos = 0
        for word in words_and_accetuation_loc:
            posit = original_text_lowercase.find(word[0], end_pos)
            if posit != -1:
                start_pos = posit
                end_pos = start_pos + len(word[0])
                original_text_list[start_pos:end_pos] = list(
                    self.assign_location_stress(''.join(original_text_list[start_pos:end_pos][::-1]), word[1], vowels)[::-1])

        return ''.join(original_text_list)

    def create_connected_text_accented(self, tagged_input_words, original_text, type_predictions, location_y, vowels, accented_vowels):
        input_words = [el[0] for el in tagged_input_words]
        words = self.assign_stress_types(type_predictions, input_words, location_y, vowels, accented_vowels)

        original_text_list = list(original_text)
        original_text_lowercase = original_text.lower()
        end_pos = 0
        for i in range(len(words)):
            posit = original_text_lowercase.find(input_words[i], end_pos)
            if posit != -1:
                start_pos = posit
                end_pos = start_pos + len(words[i])
                # preserve the capitalization of the original text
                orig_word = original_text_list[start_pos:end_pos]
                new_word = list(words[i])
                for j in range(len(orig_word)):
                    if orig_word[j].isupper():
                        new_word[j] = new_word[j].upper()
                original_text_list[start_pos:end_pos] = new_word

        return ''.join(original_text_list)

# def count_vowels(content, vowels):
#     num_all_vowels = 0
#     for el in content:

Binary file not shown.

@ -0,0 +1 @@
Izbrúhi na sóncu só žé vëčkrat pokazáli zóbe nášim satelítom, poslédično nášim mobílnim telefónom, navigáciji, celo eléktričnemu omréžju. Á vesóljskega vreména šë në morémo napovédati kakó bî ga láhko, se tá téden na Blédu pogovárja okóli 70 znánstvenikov Evrópske vesóljske agéncije, ki jé sebój pripeljála svôjo näjvéčjo ikóno, británca Mátta Taylorja.

test_data/accented_data Normal file

@ -0,0 +1,6 @@
absolutístični absolutístični
spoštljívejše spoštljívejše
tresóče tresóče
razneséna raznesěna
žvížgih žvížgih

@ -0,0 +1 @@
Izbruhi na soncu so že večkrat pokazali zobe našim satelitom, posledično našim mobilnim telefonom, navigaciji, celo električnemu omrežju. A vesoljskega vremena še ne moremo napovedati kako bi ga lahko, se ta teden na Bledu pogovarja okoli 70 znanstvenikov Evropske vesoljske agencije, ki je seboj pripeljala svojo največjo ikono, britanca Matta Taylorja.

@ -0,0 +1,6 @@
absolutistični Afpmsay-n
spoštljivejše Afcfsg
tresoče Afpfsg
raznesena Vmp--sfp
žvižgih Ncmdl