diff --git a/.idea/accetuation.iml b/.idea/accetuation.iml
deleted file mode 100644
index 4b7c000..0000000
diff --git a/.idea/dictionaries/luka.xml b/.idea/dictionaries/luka.xml
deleted file mode 100644
index 1a1714b..0000000
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
deleted file mode 100644
index 97626ba..0000000
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 23db218..0000000
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index f3d2052..0000000
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
deleted file mode 100644
index c56c4eb..0000000
diff --git a/accentuate.py b/accentuate.py
new file mode 100644
index 0000000..c66d955
--- /dev/null
+++ b/accentuate.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import pickle
+import numpy as np
+from keras.models import load_model
+import sys
+
+from prepare_data import *
+
+# obtain input and output locations from the command line arguments
+if len(sys.argv) < 3:
+    print('Please provide arguments for this script to work. The first argument should be the location of the file with unaccented words and '
+          'morphological data, the second the name of the file where you would like the results saved. Example: python accentuate.py '
+          '\'test_data/unaccented_dictionary\' \'test_data/accented_data\'')
+    raise Exception
+read_location = sys.argv[1]
+write_location = sys.argv[2]
+
+# get the environment variables necessary for calculations
+pickle_input = open('preprocessed_data/environment.pkl', 'rb')
+environment = pickle.load(pickle_input)
+dictionary = environment['dictionary']
+max_word = environment['max_word']
+max_num_vowels = environment['max_num_vowels']
+vowels = environment['vowels']
+accented_vowels = environment['accented_vowels']
+feature_dictionary = environment['feature_dictionary']
+syllable_dictionary = environment['syllable_dictionary']
+
+# load models
+data = Data('l', shuffle_all_inputs=False)
+letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
+    'cnn/word_accetuation/cnn_dictionary/v5_3/20_final_epoch.h5',
+    'cnn/word_accetuation/syllables/v3_3/20_final_epoch.h5',
+    'cnn/word_accetuation/syllabled_letters/v3_3/20_final_epoch.h5')
+
+letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model = data.load_location_models(
+    'cnn/word_accetuation/cnn_dictionary/v5_2/20_final_epoch.h5',
+    'cnn/word_accetuation/syllables/v3_2/20_final_epoch.h5',
+    'cnn/word_accetuation/syllabled_letters/v3_2/20_final_epoch.h5')
+
+letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
+    'cnn/accent_classification/letters/v3_1/20_final_epoch.h5',
+    'cnn/accent_classification/syllables/v2_1/20_final_epoch.h5',
+    'cnn/accent_classification/syllabled_letters/v2_1/20_final_epoch.h5')
+
+letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = data.load_type_models(
+    'cnn/accent_classification/letters/v3_0/20_final_epoch.h5',
+    'cnn/accent_classification/syllables/v2_0/20_final_epoch.h5',
+    'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')
+
+# read the input data
+content = data._read_content(read_location)
+
+# format data for the accentuate_word function; it has to look like [['besedišči', '', 'Ncnpi', 'besedišči'], ...]
+content = [[el[0], '', el[1][:-1], el[0]] for el in content[:-1]]
+
+# use the environment variables and models to accentuate words
+data = Data('l', shuffle_all_inputs=False)
+location_accented_words, accented_words = data.accentuate_word(content, letter_location_model, syllable_location_model, syllabled_letters_location_model,
+                                                               letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
+                                                               letter_type_model, syllable_type_model, syllabled_letter_type_model,
+                                                               letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
+                                                               dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)
+
+# save accentuated words
+with open(write_location, 'w') as f:
+    for i in range(len(location_accented_words)):
+        f.write(location_accented_words[i] + ' ' + accented_words[i] + '\n')
+    f.write('\n')
\ No newline at end of file
diff --git a/accentuate_connected_text.py b/accentuate_connected_text.py
new file mode 100644
index 0000000..cbf7e95
--- /dev/null
+++ b/accentuate_connected_text.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import sys
+
+sys.path.insert(0, '../../../')
+from prepare_data import *
+
+import pickle
+
+np.random.seed(7)
+
+# obtain input, output and tagger locations from the command line arguments
+if len(sys.argv) < 4:
+    print('Please provide arguments for this script to work. The first argument should be the location of the file with the unaccented text, '
+          'the second the name of the file where you would like the results saved, and the third the location of the ReLDI tagger. '
+          'Example: python accentuate_connected_text.py \'test_data/original_connected_text\' \'test_data/accented_connected_text\' '
+          '\'../reldi_tagger\'')
+    raise Exception
+read_location = sys.argv[1]
+write_location = sys.argv[2]
+reldi_location = sys.argv[3]
+
+# get the environment variables necessary for calculations
+pickle_input = open('preprocessed_data/environment.pkl', 'rb')
+environment = pickle.load(pickle_input)
+dictionary = environment['dictionary']
+max_word = environment['max_word']
+max_num_vowels = environment['max_num_vowels']
+vowels = environment['vowels']
+accented_vowels = environment['accented_vowels']
+feature_dictionary = environment['feature_dictionary']
+syllable_dictionary = environment['syllable_dictionary']
+
+# get models
+data = Data('l', shuffle_all_inputs=False)
+letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
+    'cnn/word_accetuation/cnn_dictionary/v5_3/20_final_epoch.h5',
+    'cnn/word_accetuation/syllables/v3_3/20_final_epoch.h5',
+    'cnn/word_accetuation/syllabled_letters/v3_3/20_final_epoch.h5')
+
+letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model = data.load_location_models(
+    'cnn/word_accetuation/cnn_dictionary/v5_2/20_final_epoch.h5',
+    'cnn/word_accetuation/syllables/v3_2/20_final_epoch.h5',
+    'cnn/word_accetuation/syllabled_letters/v3_2/20_final_epoch.h5')
+
+letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
+    'cnn/accent_classification/letters/v3_1/20_final_epoch.h5',
+    'cnn/accent_classification/syllables/v2_1/20_final_epoch.h5',
+    'cnn/accent_classification/syllabled_letters/v2_1/20_final_epoch.h5')
+
+letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = data.load_type_models(
+    'cnn/accent_classification/letters/v3_0/20_final_epoch.h5',
+    'cnn/accent_classification/syllables/v2_0/20_final_epoch.h5',
+    'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')
+
+# get word tags
+tagged_words, original_text = data.tag_words(reldi_location, read_location)
+
+# find accentuation locations
+predictions = data.get_ensemble_location_predictions(tagged_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
+                                                     letter_location_co_model, syllable_location_co_model,
+                                                     syllabled_letters_location_co_model,
+                                                     dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
+                                                     syllable_dictionary)
+
+location_accented_text = data.create_connected_text_locations(tagged_words, original_text, predictions, vowels)
+
+# accentuate text
+location_y = np.around(predictions)
+type_predictions = data.get_ensemble_type_predictions(tagged_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
+                                                      letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
+                                                      dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
+                                                      syllable_dictionary)
+
+accented_text = data.create_connected_text_accented(tagged_words, original_text, type_predictions, location_y, vowels, accented_vowels)
+
+# save accentuated text
+with open(write_location, 'w') as f:
+    f.write(accented_text)
diff --git a/learn_location_weights.py b/learn_location_weights.py
new file mode 100644
index 0000000..f2581d9
--- /dev/null
+++ b/learn_location_weights.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+# text in Western (Windows 1252)
+
+import pickle
+import numpy as np
+np.random.seed(7)
+
+import sys
+from prepare_data import *
+
+# preprocess data
+# data = Data('l', allow_shuffle_vector_generation=True, save_generated_data=False, shuffle_all_inputs=True)
+data = Data('l', save_generated_data=False, shuffle_all_inputs=True)
+data.generate_data('../../internal_representations/inputs/letters_word_accentuation_train',
+                   '../../internal_representations/inputs/letters_word_accentuation_test',
+                   '../../internal_representations/inputs/letters_word_accentuation_validate',
+                   content_location='../accetuation/data/',
+                   content_name='SlovarIJS_BESEDE_utf8.lex',
+                   inputs_location='../accetuation/cnn/internal_representations/inputs/',
+                   content_shuffle_vector='content_shuffle_vector',
+                   shuffle_vector='shuffle_vector')
+
+# combine all data (if this is unwanted, comment out the three lines below)
+data.x_train = np.concatenate((data.x_train, data.x_test, data.x_validate), axis=0)
+data.x_other_features_train = np.concatenate((data.x_other_features_train, data.x_other_features_test, data.x_other_features_validate), axis=0)
+data.y_train = np.concatenate((data.y_train, data.y_test, data.y_validate), axis=0)
+
+# build the neural network architecture
+nn_output_dim = 10
+batch_size = 16
+actual_epoch = 20
+num_fake_epoch = 20
+
+conv_input_shape = (23, 36)
+othr_input_shape = (140,)
+
+conv_input = Input(shape=conv_input_shape, name='conv_input')
+x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
+x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
+x_conv = MaxPooling1D(pool_size=2)(x_conv)
+x_conv = Flatten()(x_conv)
+
+othr_input = Input(shape=othr_input_shape, name='othr_input')
+
+x = concatenate([x_conv, othr_input])
+x = Dense(256, activation='relu')(x)
+x = Dropout(0.3)(x)
+x = Dense(256, activation='relu')(x)
+x = Dropout(0.3)(x)
+x = Dense(256, activation='relu')(x)
+x = Dropout(0.3)(x)
+x = Dense(nn_output_dim, activation='sigmoid')(x)
+
+model = Model(inputs=[conv_input, othr_input], outputs=x)
+opt = optimizers.Adam(lr=1E-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
+model.compile(loss='mean_squared_error', optimizer=opt, metrics=[actual_accuracy, ])
+# model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
+
+
+# start learning
+history = model.fit_generator(data.generator('train', batch_size, content_name='SlovarIJS_BESEDE_utf8.lex',
+                                             content_location='../accetuation/data/'),
+                              data.x_train.shape[0] / (batch_size * num_fake_epoch),
+                              epochs=actual_epoch * num_fake_epoch,
+                              validation_data=data.generator('test', batch_size),
+                              validation_steps=data.x_test.shape[0] / (batch_size * num_fake_epoch))
+
+
+# save the learned model and its training history
+name = 'test_data/20_epoch'
+model.save(name + '.h5')
+output = open(name + '_history.pkl', 'wb')
+pickle.dump(history.history, output)
+output.close()
diff --git a/prepare_data.py b/prepare_data.py
index 250807e..75eefcb 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -7,6 +7,7 @@ import h5py
 import math
 import keras.backend as K
 import os.path
+from os import remove
 import codecs
 from copy import copy
@@ -666,7 +667,7 @@ class Data:
             loc += batch_size
 
     # generator for inputs for tracking of data fitting
-    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling):
+    def _syllable_generator(self, orig_x, orig_x_additional, orig_y, batch_size, translator, accented_vowels, oversampling=np.ones(13)):
         size = orig_x.shape[0]
         while 1:
             loc = 0
@@ -1655,6 +1656,95 @@ class Data:
 
         return location_accented_words, accented_words
 
+    def tag_words(self, reldi_location, original_location):
+        # generate a text file with every word on its own line
+        with open(original_location) as f:
+            original_text = f.readlines()
+        original_text = ''.join(original_text)
+        text_with_whitespaces = (original_text.replace(',', ' ,').replace('.', ' .').replace('\n', ' ').replace("\"", " \" ")
+                                 .replace(":", " :").replace("ć", "č").replace('–', '-'))
+        text_with_whitespaces = '\n'.join(text_with_whitespaces.split())
+        text_with_whitespaces += '\n\n'
+        with open('.words_with_whitespaces', "w") as text_file:
+            text_file.write(text_with_whitespaces)
+
+        # generate a file with PoS tags by running the ReLDI tagger over the prepared words
+        import subprocess
+
+        python3_command = reldi_location + "/tagger.py sl"
+        with open('.words_with_whitespaces', 'r') as myinput, open('.word_tags', 'w') as myoutput:
+            subprocess.run(python3_command.split(), stdin=myinput, stdout=myoutput)
+
+        # keep only the interesting words (no punctuation, no numbers)
+        pointless_words = ['.', ',', '\"', ':', '-']
+        with open('.word_tags', "r") as text_file:
+            tagged_input_words = []
+            for x in text_file.readlines()[:-1]:
+                split_line = x[:-1].split('\t')
+                if split_line[0] not in pointless_words and not any(char.isdigit() for char in split_line[0]):
+                    tagged_input_words.append([split_line[0].lower(), '', split_line[1], split_line[0].lower()])
+
+        remove(".words_with_whitespaces")
+        remove(".word_tags")
+        return tagged_input_words, original_text
+
+    def create_connected_text_locations(self, tagged_input_words, original_text, predictions, vowels):
+        if 'A' not in vowels:
+            vowels.extend(['A', 'E', 'I', 'O', 'U'])
+        accented_words = [self.assign_location_stress(tagged_input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
+                          range(len(tagged_input_words))]
+
+        words_and_accentuation_loc = [[tagged_input_words[i][0], self.decode_y(predictions[i])] for i in range(len(tagged_input_words))]
+
+        original_text_list = list(original_text)
+        original_text_lowercase = original_text.lower()
+        end_pos = 0
+        for word in words_and_accentuation_loc:
+            posit = original_text_lowercase.find(word[0], end_pos)
+            if posit != -1:
+                start_pos = posit
+                end_pos = start_pos + len(word[0])
+
+                original_text_list[start_pos:end_pos] = list(
+                    self.assign_location_stress(''.join(original_text_list[start_pos:end_pos][::-1]), word[1], vowels)[::-1])
+
+        return ''.join(original_text_list)
+
+    def create_connected_text_accented(self, tagged_input_words, original_text, type_predictions, location_y, vowels, accented_vowels):
+        input_words = [el[0] for el in tagged_input_words]
+        words = self.assign_stress_types(type_predictions, input_words, location_y, vowels, accented_vowels)
+
+        original_text_list = list(original_text)
+        original_text_lowercase = original_text.lower()
+        end_pos = 0
+        for i in range(len(words)):
+            posit = original_text_lowercase.find(input_words[i], end_pos)
+            if posit != -1:
+                start_pos = posit
+                end_pos = start_pos + len(words[i])
+
+                # preserve the capitalization of the original text
+                orig_word = original_text_list[start_pos:end_pos]
+                new_word = list(words[i])
+                for j in range(len(orig_word)):
+                    if orig_word[j].isupper():
+                        new_word[j] = new_word[j].upper()
+
+                original_text_list[start_pos:end_pos] = new_word
+
+        return ''.join(original_text_list)
 # def count_vowels(content, vowels):
 #     num_all_vowels = 0
 #     for el in content:
diff --git a/preprocessed_data/environment.pkl b/preprocessed_data/environment.pkl
new file mode 100644
index 0000000..7912fd3
Binary files /dev/null and b/preprocessed_data/environment.pkl differ
diff --git a/test_data/accented_connected_text b/test_data/accented_connected_text
new file mode 100644
index 0000000..6e078f8
--- /dev/null
+++ b/test_data/accented_connected_text
@@ -0,0 +1 @@
+Izbrúhi na sóncu só žé vëčkrat pokazáli zóbe nášim satelítom, poslédično nášim mobílnim telefónom, navigáciji, celo eléktričnemu omréžju. Á vesóljskega vreména šë në morémo napovédati – kakó bî ga láhko, se tá téden na Blédu pogovárja okóli 70 znánstvenikov Evrópske vesóljske agéncije, ki jé sebój pripeljála svôjo näjvéčjo ikóno, británca Mátta Taylorja.
diff --git a/test_data/accented_data b/test_data/accented_data
new file mode 100644
index 0000000..c193212
--- /dev/null
+++ b/test_data/accented_data
@@ -0,0 +1,6 @@
+absolutístični absolutístični
+spoštljívejše spoštljívejše
+tresóče tresóče
+razneséna raznesěna
+žvížgih žvížgih
+
diff --git a/test_data/original_connected_text b/test_data/original_connected_text
new file mode 100644
index 0000000..cd04ba3
--- /dev/null
+++ b/test_data/original_connected_text
@@ -0,0 +1 @@
+Izbruhi na soncu so že večkrat pokazali zobe našim satelitom, posledično našim mobilnim telefonom, navigaciji, celo električnemu omrežju. A vesoljskega vremena še ne moremo napovedati – kako bi ga lahko, se ta teden na Bledu pogovarja okoli 70 znanstvenikov Evropske vesoljske agencije, ki je seboj pripeljala svojo največjo ikono, britanca Matta Taylorja.
diff --git a/test_data/unaccented_dictionary b/test_data/unaccented_dictionary
new file mode 100644
index 0000000..ed1c01b
--- /dev/null
+++ b/test_data/unaccented_dictionary
@@ -0,0 +1,6 @@
+absolutistični Afpmsay-n
+spoštljivejše Afcfsg
+tresoče Afpfsg
+raznesena Vmp--sfp
+žvižgih Ncmdl
+
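
Both new scripts load their models and environment from paths relative to the working directory, so they appear to be meant to be run from the repository root. A usage sketch assembled from the scripts' own usage messages and the test fixtures added in this diff ('../reldi_tagger' is the tagger checkout location the usage message assumes; it is not part of this diff):

    python accentuate.py 'test_data/unaccented_dictionary' 'test_data/accented_data'
    python accentuate_connected_text.py 'test_data/original_connected_text' 'test_data/accented_connected_text' '../reldi_tagger'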