diff --git a/.idea/workspace.xml b/.idea/workspace.xml index cc8febf..576c140 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,8 +2,8 @@ - - + + @@ -35,8 +35,8 @@ - - + + @@ -44,6 +44,16 @@ + + + + + + + + + + @@ -139,6 +149,17 @@ StringIO shuffle_inputs generator + content, feature_dictionary + decode + create_feature_dictionary + with + read + generate + shuffle + X_ + dictionary + create_dict + split_content @@ -157,6 +178,7 @@ @@ -165,7 +187,7 @@ @@ -182,7 +204,6 @@ - @@ -195,23 +216,10 @@ - - - - - - - - + @@ -474,7 +482,7 @@ - + @@ -495,25 +503,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -822,16 +800,6 @@ - - - - - - - - - - @@ -882,6 +850,14 @@ + + + + + + + + @@ -892,6 +868,23 @@ - + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/hyphenation b/hyphenation new file mode 100644 index 0000000..01d18cc --- /dev/null +++ b/hyphenation @@ -0,0 +1,1113 @@ +.av5r +.di6spo +.ek3s +.ek5v +.is1 +.iz1 +.obi4d +.ob5it +.od1 +.po4d5n +.po4v5s +.pre6d7n +.se4k5s +.si4s +.st4 +.voz5l +.voz5n +.zliz6 +a1a +a1b +ab5ba +ab6rod +a1c +ac5ci +a1č +a1d +ad2l +a6dobl +ad6rl. +ad6rla +ad6rob +ad5ur +a1e1 +a1f +af5ga +af1t +a1g +a1h +a4hm +ah5mi +ah5mo +a1i +ai2n1 +a1j +a4j5ek +a4jf +aj5fi +aj5fo +aj5ha +aj5he +aj5im +aj6imo +aj3os +aj6stb +a5ju. +aj3uč +aj3ug +aj5žn +a1k +ak4s +a4kst +a1l +a1m +a4mz +a1n +an6dga +an6dhi +a4nm +an5mi +an5zi +a1o +ao2b1 +a1p +a4ph +a1ra +ar6dwa +a1re +a1ri +a1ro +a1ru +ar5xa +ar5xo +ar5xu +a1s +a4sš +as5šč +a1š +a1t +a4tf +at4i +a1u1 +a4uf +a2uk +a4ul +a1v +av5ši +a4vž +av5ža +ay5to +a1ze +az5fo +a4zig +az3la +az3le +az4lil +az4lit +az4liv +a4zob +a4z3oč +az5ora +az5oro +a4zra +az4red +az5vp +a1ž +až5mi +ba6bba +ban3č4 +ba4u +2b1c +2b1č +2b1d +be1 +be4v +b1h +bi1 +b1ja +b4ja. +b5jel +b3jem +b5jet +2b1k +b3lep +b5leta +b5lil +b5lit +b5liv +b1m +4bmi +2b1n +bo1 +bo6chm +b5ord +bo5vp +b3rab +b5ras +b3raš +b3rez +bre4zg +bre4zi +bre4zr +b5reže +b3rob +br6žda +2b1s +2b1š +2b1t +bu5ki +bu5ku +bu5kv +bu5ry +2b1v +b1z +b1ž +2cc +2ch. +ch5ma +2ck +c1ka +ck1o2 +c5ko. +ckov3 +ck1s +ck5we +2c1n +2c1t +2č1b +2č1g +či1 +1čj +2č1k +1čl +4č3let +č5mes +2č1n +4čop +2č1p +2č1s +4čup +2d1b +2d1c +2d1č +2d1d +dd6voj +d2e +6d5elem +de4min +de4mn +de4z3i +2d1g +2d1h +di5ck +4dind +d4i5no +dis1 +di4skr +di6spr +2d1j +2d1k +5dlet +d2li +d5lit +d5liv +d1lo +2d3m +4d3nac +4d5nač +4d5nap +4d3nar +4dnas +4d5neb +d5niv +4d5niz +4d5njač +4d3nož +d2o +4dobč +4d5obd +2d3o2f +do5rd +do5vč +do5v4z +2d1p +d5raz +d3rep +dre6pn +d4rev +2d1s +2d1š +2d1t +dteks6 +d4ur +du5ro +du5um +2d1v +4d3vi +2d1z2 +e1a +e1b +eb4j +eb6liz +e1c +e1č +e4čd +eč5de +eč5di +eč5do +eč3le +eč5op +e4čt +eč5ti +eč5to +eč5tr +eč5up +e2č1v +eč6vrs +e1d +e4df +ed5ig +ed2l +ed5ob +ed6obe +ed6obr +e4dobs +e4d3oč +ed5vč +ed5zb +e1e +e4ep +e1f +e4ff +ef5fe +ef5ta +e1g +e1h +e1i +ei6pzi +ei2z +eiz5e +e1j +e1k +ek6mal +ek6tre +e1l +e1m +e1n +e1o1 +eob4j +eob4r +eo4dl +eo4z5n +e1p +ep5nik +e1ra +era6z5l +era5z4r +era5z4v +e1re +e4rf +e1ri +e1ro +e4rr +e1ru +e1s +es5da +e5sta +e5sti. +e5stih +e5stil +e1š +e4šp +eš5po +e1t +4eth +e4tinš +e1u1 +e1v +eve6t5l +ev5ha +ev6pre +ev6ste +ev5stv +2ew +ew6ind +ew5le +e4wt +ew5to +e4yw +e1z +ez5dj +e3z4dr +ez2g +ez5gl +e5zij +ez6ijo +ez5imn +e5zis +ez6ist +ez5iz +ez4l +ez6lom +ez6man +ez4mo +e4zob +e4z5or +ez4re +e4zt +e4z5u4m5 +e4zž +e1ž +1fa +fe1 +fe6ljt +ff5ma +fi6zlj +2f1n +fo6uri +fre4u +2f1s +2ft +ft5ve +fu1 +2g1d +ge6ige +gel5č4 +ge6njč +gi6tpr +go1 +go5vz +2g1t +gu1 +ha4u +2h1č +he4i +2h1k +4hl. +h4lo +2h1n +h5ren +2h1š +2h1t +1hu +hu6ffm +i1a +i1b +i1c +i4cs +i1ča +i1če +i1či +ič5ra +i1ču +ič5vr +i1d +4idor +i1e1 +i1f +i1g +4igh +i1h +i1i +ii2n1 +i1j +i1k +i4kč +ik5ča +i1l +il5č4k +4ile +4ilo +i1m +i4mh +im5hi +i1n +1ind +2ine +3i4n3os +1inp +3inse +1inš +4inšk +3intr +i1o1 +i1p +i1r +4ire +i1s +is4a +is6ert +isis4 +i4skv +2iss +i1š +i1t +it5pr +i1u +i1v +iv5jo +i1x +i1z +iz1l +iz4la +izli4z +iz5me +iz5mo +iz6ode +iz5po +i2zr +iz1u +iz6ure +i1ž +j5akt +2j1b +2j1c +2j1č +2j1d +je4ks4 +2j1g +2jh +j1hi +4jime +4j5int +2j1k +2j1l +2j1m +2j1n +4job +2j1od +jod4l +2jos +4jož +2j1p +2j1r +jra1 +jraz4 +2j1s +jsis6t +2j1š +2j1t +ju1 +2juč +ju5dm +2jus +ju2ž1 +2j1v +2j1z +jz6ves +2k1c +2k1d +ke5ti +ki1 +2k1m +1kn +ko1 +kok4 +ko5kd +ko6vše +koz6lo +1kre +2ks. +k5sat +ks1c +ks1p +ks4po +ks1t +4kst. +ks6taz +ks5te +2k1t +3ktr +4ktra +ku5ro +k5vip +la4ir +la6vz. +2l1b +2l1c +2l1č +2l1d +le1 +le4e +le6ipz +le5me +2l1f +2l1g +lg5ča +2l1h +l2i1 +li6dž. +1liz +4l5izd +2lj. +4ljc +2ljč +2ljk +2ljn +2ljs +2ljš +lju5d6j +2l1k +2l1l +2l1m +2l1n +lo1 +1loč +2l1p +2l1s +2l1š +2l1t +lu5ki +lu5ku +2l1v +2l1z +2l1ž +2m1b +2m1c +2m1č +2m1d +me4d5n +me6dos +me4dr +2m1f +4mind +4minp +4minš +mi6th. +2m1k +2m1m +m5niv +mo6št. +mo6vš. +2m1p +2m1s +2m1š +2m1t +m5urn +2m1v +my5hi +2m1ž +na1 +5načel +na4d5nj +nad5r +na6dra +na4dre +na6dur +1naj +na6jak +na4j5en +naj3o +na6joč +na4j3u +1nas +na4v3z +navze6 +1naz +naz6or +2n1b +2n1c +2nč +n1ča +n1če +n1či +n1ču +2n3d2 +nd5ga +nd5hi +n4dm +ne1 +ne3d2 +1neh +ne3zm +nez4v +2n1f +2n1g +n4gh +ng5ha +n4gv +ng5vi +2n1h +2nj. +2njc +nje4v5s +2njk +2njs +2njš +4njv +2n1k +2n1l +2n1n +no5rd +n4ost +2n1p +2n1s +nsis4 +2n1š +2n1t +nteks4 +n4tg +nt5ga +nt5ge +n4tv +nt5vi +nu1 +2n1v +ny5qu +2n1z +nz4i +2n1ž +o1a +o4as +o1b +ob5gl +ob5ide +ob5jo +5obla +5obro +o4bz +o1c +oc5ke +oc5ki +o4cr +o1č +o1d +od5dv +od5nal +o6drep +od5zd +o2d1ž +o1e +oele4 +o1f +o1g +4ogl +o1h +o1i +oiz2 +o1j +o1k +o4kb +ok5ba +ok5be +o4kt +o1l +o6l5avt +ol6gča +o4lr +ol5re +o1m +o1n +o1o +ood4l +o2ol +o4om +o1p +o4pm +op5me +4opy +o1ra +or4deč +o1re +o1ri +o1ro +o1ru +o1s +5oseb +ose4m5 +o1š +o1t +o1u +ou5ki +ou5ku +o1v +ov5sem +ov5šk +o2v1z +o5vza +ov3zd +o1y +o1z +oz4b +ozd5j +oz4g +oz5lo +oz6lož +oz2n +oz5nic +oz5niš +oz2o +oz2r +oz2v +o1ž +ož5mi +2p1c +2p3č2 +pč5ka +pe1 +1peč +pe4kt +pet3l +pe4tle +pe4v5s +pev5t4 +4phs +ph5so +pi5zo +2p1k +4ploz +po1 +po6dfa +po4d3l +po4dna +po4d5oč +po6lob +po6std +prez4 +2p1s +2p1š +2p1t +pz6ig. +qu2 +3raču +2rae +ra6jžn +rav5z +ra6vza +ra4z5id +3razl +ra4z5or +2r1b +2r1c +2r1č +2r1d +re1 +3real +re6cht +re5čv +5redč +re6dig +re6dnju +re6iba +re5jo +re5km +re6sda +rev6sk +re6znač +re6zus +re6zve +r1f +2r1g +2r1h +ri1 +r4in +ri5n4o +riz4g +riz4l +riz4n +2r1j +2r1k +2r1l +2r1m +2r1n +ro1 +rob6id +3rodi +ro5zo +2r1p +r1r +2r1s +2r1š +2r1t +r4th +rt5ha +ru5kl +2r1v +r3v2j +rv5jo +ry5an +2r1z +rz2l +r1ž +rž5da +2s1b +1sc +4sc. +s2ci +se4k5sa +sek5si +se5ma +se5vp +2s1f +si1 +s4id +si6gn. +sis1 +2s1j +2sk. +s2kn +4skre +s4lav +s4on +soni5 +sonič4 +1sp +s4plod +spod4l +2s1s +2st. +3ste +s4ten +4stf +s4tič +5stim +s4tir +2stk +2stm +1str +s4tra. +su1 +su4bo +sve5t +š2č +2šč. +2ščk +2ščn +še2s +2š1j +ta5wi +taz4 +2t1b +2t1c +tch5o +2t1d +tek6st +5tema +te5xa +t1f +4tind +4tinos +4tinp +4tinse +4t3int +2t1k +6tletno +2t1m +4tnaj +to6vž. +trt5u +tr6tur +2t1s +2t1t +tu1 +4tz. +2u1a +u1b +ub4j +u4bp +ub5po +u1c +u1č +u1d +ud6mi. +u1e +u1f +u1g +u1h +u1i +u1j +u1ka +u1ke +u1ko +u1l +u1m +u1n +u1p +up6čka +u1ra +u1re +4urg +u1ri +u1s +1usp +u1š +uše3s +u1t +u4th +uth5o +u1v +ux5em +u1z +u1ž +2v1b +2v1c +2vč +v1ča +v1če +v4čer +v1či +2v1d +ve4čl +ve4čm +ve4i +ve4tin +vetle6t +v1f +v1g +vi5dv +vid6va +1viv +vi6žg. +2v1j +4vjo +2v1k +2v1m +2v1n +vo5rd +voz5le +2v1p +3v2pa +v4pij +v4pil +v5skn +v5šek +4všk +2v1t +vt4k +vz2 +v2za +3v2zg +2v3zk +2vzo +v3zp +v2zu +1wa +wo2 +x1f +1ye +2y1f +y1j +y1l +y1w +1z2a +z6ane. +za5uk +za3vp +za1z2 +za5zd +2z1b +3zbir +z1c +2z1č +2z1d2 +zd5ju +z3dv +z1g +z4gni +z5got +2z1h +1zi +z1ig +2z1is +4z5išč +2z1j +2z1k +z3ku +z5las +z1li +3zlil +5zlit +5zliv +zliz5 +1zlj +3zlog +z5lom +3zlož +z1lu +2z1m +1zn +1zo +z1ob +2z1od +z1og +z2ol +z4om +2z1p +1z1r +4zredč +4zreš +4zrez +4zrež +4zri +4zru +2z1s +z1š +z1t +1zu +z4uj +2z1up +2z1uz +z1v2 +z4ven +z3vn +3z4voj +z4vok +2z1z2 +z1ž +2ž1b +2ž1c +2ž1č +2ž1j +2ž1k +4žmi +.č8 +.š8 +.ž8 +8ž. +8š. +8č. +8b. +8c. +8d. +8f. +8g. +8h. +8j. +8k. +8l. +8m. +8n. +8p. +8r. +8s. +8t. +8v. +8z. +8x. +8y. +8w. +8q. +.b8 +.c8 +.d8 +.f8 +.g8 +.h8 +.j8 +.k8 +.l8 +.m8 +.n8 +.p8 +.r8 +.s8 +.t8 +.v8 +.z8 +.x8 +.y8 +.w8 +.q8 +.i4z1 +.e2k3s +.e2k5v diff --git a/prepare_data.py b/prepare_data.py index 0d38a6b..afda497 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -7,6 +7,7 @@ import h5py import gc import math import keras.backend as K +import os.path # functions for saving, loading and shuffling whole arrays to ram @@ -34,9 +35,15 @@ def load_inputs(file_name, other_features=False): return X, y -def shuffle_inputs(X, y, X_pure=[]): - s = np.arange(X.shape[0]) - np.random.shuffle(s) +def shuffle_inputs(X, y, shuffle_vector_location, X_pure=[]): + if os.path.exists(shuffle_vector_location): + s = load_shuffle_vector(shuffle_vector_location) + else: + s = np.arange(X.shape[0]) + np.random.shuffle(s) + create_and_save_shuffle_vector(shuffle_vector_location, s) + # s = np.arange(X.shape[0]) + # np.random.shuffle(s) X = X[s] y = y[s] if X_pure != []: @@ -57,7 +64,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure): def load_extended_inputs(file_name, obtain_range): - h5f = h5py.File(file_name,'r') + h5f = h5py.File(file_name, 'r') X = h5f['X'][obtain_range[0]:obtain_range[1]] y = h5f['y'][obtain_range[0]:obtain_range[1]] X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]] @@ -69,16 +76,17 @@ def load_extended_inputs(file_name, obtain_range): # functions for creating and loading shuffle vector def create_and_save_shuffle_vector(file_name, shuffle_vector): # X, y, X_pure = generate_full_vowel_matrix_inputs() - h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w') - adict=dict(shuffle_vector=shuffle_vector) + h5f = h5py.File(file_name, 'w') + adict = dict(shuffle_vector=shuffle_vector) for k, v in adict.items(): - h5f.create_dataset(k,data=v) + h5f.create_dataset(k, data=v) h5f.close() def load_shuffle_vector(file_name): - h5f = h5py.File(file_name,'r') - shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]] + h5f = h5py.File(file_name, 'r') + # shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]] + shuffle_vector = h5f['shuffle_vector'][:] h5f.close() return shuffle_vector @@ -138,7 +146,8 @@ def create_dict(): vowels.extend(accetuated_vowels) vowels.extend(default_vowels) - dictionary = [''] + dictionary_output = [''] + dictionary_input = [''] line = 0 max_word = 0 # ADD 'EMPTY' VOWEL @@ -154,12 +163,12 @@ def create_dict(): for c in list(el[3]): if is_vowel(list(el[3]), i, vowels): num_vowels += 1 - if c not in dictionary: - dictionary.append(c) + if c not in dictionary_output: + dictionary_output.append(c) i += 1 for c in list(el[0]): - if c not in dictionary: - dictionary.append(c) + if c not in dictionary_input: + dictionary_input.append(c) if num_vowels > max_num_vowels: max_num_vowels = num_vowels except Exception: @@ -167,10 +176,10 @@ def create_dict(): print(el) break line += 1 - dictionary = sorted(dictionary) + dictionary_input = sorted(dictionary_input) max_num_vowels += 1 print('DICTIONARY CREATION SUCCESSFUL!') - return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels + return dictionary_input, max_word, max_num_vowels, content, vowels, accetuated_vowels # GENERATE X and y @@ -272,7 +281,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels): # return X, y -def generate_full_matrix_inputs(): +def generate_full_matrix_inputs(content_shuffle_vector_location, shuffle_vector_location): + dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() + train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location) + feature_dictionary = create_feature_dictionary() + + # Generate X and y + print('GENERATING X AND y...') + X_train, X_other_features_train, y_train = generate_X_and_y(dictionary, max_word, max_num_vowels, train_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_train.h5') + X_test, X_other_features_test, y_test = generate_X_and_y(dictionary, max_word, max_num_vowels, test_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_test.h5') + X_validate, X_other_features_validate, y_validate = generate_X_and_y(dictionary, max_word, max_num_vowels, validate_content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + '_validate.h5') + print('GENERATION SUCCESSFUL!') + return X_train, X_other_features_train, y_train, X_test, X_other_features_test, y_test, X_validate, X_other_features_validate, y_validate + + +# generate full matrix, with old features +def old_generate_full_matrix_inputs(): dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() train_content, validate_content = split_content(content, 0.2) feature_dictionary = create_feature_dictionary(content) @@ -286,7 +310,7 @@ def generate_full_matrix_inputs(): # Generate each y as an array of 11 numbers (with possible values between 0 and 1) -def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary): +def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location): y = np.zeros((len(content), max_num_vowels)) X = np.zeros((len(content), max_word, len(dictionary))) print('CREATING OTHER FEATURES...') @@ -328,7 +352,7 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce i += 1 print('SHUFFELING INPUTS...') - X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features) + X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features) print('INPUTS SHUFFELED!') return X, X_other_features, y @@ -559,6 +583,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts): # Decoders for inputs and outputs def decode_X_features(feature_dictionary, X_other_features): + final_word = [] for word in X_other_features: final_word = [] i = 0 @@ -574,6 +599,7 @@ def decode_X_features(feature_dictionary, X_other_features): final_word.append(feature_dictionary[z][j][k]) i += 1 print(u''.join(final_word)) + return u''.join(final_word) def decode_position(y, max_num_vowels): @@ -650,7 +676,37 @@ def decode_position_from_vowel_to_final_number(y): # split content so that there is no overfitting -def split_content(content, ratio): +def split_content(content, test_and_validation_ratio, content_shuffle_vector_location, validation_ratio=0.5): + expanded_content = [el[1] if el[1] != '=' else el[0] for el in content] + # print(len(content)) + unique_content = sorted(set(expanded_content)) + + if os.path.exists(content_shuffle_vector_location): + s = load_shuffle_vector(content_shuffle_vector_location) + else: + s = np.arange(len(unique_content)) + np.random.shuffle(s) + create_and_save_shuffle_vector(content_shuffle_vector_location, s) + + split_num = math.floor(len(unique_content) * test_and_validation_ratio) + validation_num = math.floor(split_num * validation_ratio) + shuffled_unique_train_content = [unique_content[i] for i in range(len(s)) if s[i] >= split_num] + shuffled_unique_train_content_set = set(shuffled_unique_train_content) + + shuffled_unique_test_content = [unique_content[i] for i in range(len(s)) if split_num > s[i] >= validation_num] + shuffled_unique_test_content_set = set(shuffled_unique_test_content) + + shuffled_unique_validate_content = [unique_content[i] for i in range(len(s)) if s[i] < validation_num] + shuffled_unique_validate_content_set = set(shuffled_unique_validate_content) + + train_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_train_content_set] + test_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_test_content_set] + validate_content = [content[i] for i in range(len(content)) if expanded_content[i] in shuffled_unique_validate_content_set] + return train_content, test_content, validate_content + + +# split content so that there is no overfitting with out split of validation and test data +def old_split_content(content, ratio): expanded_content = [el[1] if el[1] != '=' else el[0] for el in content] # print(len(content)) unique_content = sorted(set(expanded_content)) @@ -671,8 +727,8 @@ def split_content(content, ratio): return train_content, validate_content -# create feature dictionary -def create_feature_dictionary(content): +# X features that use MULTEX v3 as their encoding +def create_old_feature_dictionary(content): additional_data = [el[2] for el in content] possible_variants = sorted(set(additional_data)) categories = sorted(set([el[0] for el in possible_variants])) @@ -690,7 +746,8 @@ def create_feature_dictionary(content): return feature_dictionary -def create_X_features(content, feature_dictionary): +# X features that use MULTEX v3 as their encoding +def create_old_X_features(content, feature_dictionary): content = content X_other_features = [] for el in content: @@ -707,4 +764,213 @@ def create_X_features(content, feature_dictionary): else: X_el_other_features.extend([0] * feature[0]) X_other_features.append(X_el_other_features) - return np.array(X_other_features) \ No newline at end of file + return np.array(X_other_features) + + +def convert_to_MULTEXT_east_v4(old_features, feature_dictionary): + new_features = ['-'] * 9 + new_features[:len(old_features)] = old_features + if old_features[0] == 'A': + if old_features[1] == 'f' or old_features[1] == 'o': + new_features[1] = 'g' + return new_features[:len(feature_dictionary[0]) - 1] + if old_features[0] == 'C': + return new_features[:len(feature_dictionary[1]) - 1] + if old_features[0] == 'I': + return new_features[:len(feature_dictionary[2]) - 1] + if old_features[0] == 'M': + new_features[2:6] = old_features[1:5] + new_features[1] = old_features[5] + if new_features[2] == 'm': + new_features[2] = '-' + return new_features[:len(feature_dictionary[3]) - 1] + if old_features[0] == 'N': + if len(old_features) > 5: + new_features[5] = old_features[7] + return new_features[:len(feature_dictionary[4]) - 1] + if old_features[0] == 'P': + if new_features[8] == 'n': + new_features[8] = 'b' + return new_features[:len(feature_dictionary[5]) - 1] + if old_features[0] == 'Q': + return new_features[:len(feature_dictionary[6]) - 1] + if old_features[0] == 'R': + return new_features[:len(feature_dictionary[7]) - 1] + if old_features[0] == 'S': + if len(old_features) == 4: + new_features[1] = old_features[3] + else: + new_features[1] = '-' + return new_features[:len(feature_dictionary[8]) - 1] + if old_features[0] == 'V': + if old_features[1] == 'o' or old_features[1] == 'c': + new_features[1] = 'm' + new_features[3] = old_features[2] + new_features[2] = '-' + if old_features[2] == 'i': + new_features[3] = 'r' + if len(old_features) > 3 and old_features[3] == 'p': + new_features[3] = 'r' + elif len(old_features) > 3 and old_features[3] == 'f': + new_features[3] = 'f' + if len(old_features) >= 9: + new_features[7] = old_features[8] + else: + new_features[7] = '-' + return new_features[:len(feature_dictionary[9]) - 1] + return '' + + +def create_X_features(content, feature_dictionary): + content = content + X_other_features = [] + for el in content: + X_el_other_features = [] + converted_el = ''.join(convert_to_MULTEXT_east_v4(list(el[2]), feature_dictionary)) +# converted_el = el[2] + for feature in feature_dictionary: + if converted_el[0] == feature[1]: + X_el_other_features.append(1) + for i in range(2, len(feature)): + for j in range(len(feature[i])): + if i-1 < len(converted_el) and feature[i][j] == converted_el[i-1]: + X_el_other_features.append(1) + else: + X_el_other_features.append(0) + else: + X_el_other_features.extend([0] * feature[0]) + X_other_features.append(X_el_other_features) + return np.array(X_other_features) + + +def create_feature_dictionary(): + # old: http://nl.ijs.si/ME/Vault/V3/msd/html/ + # new: http://nl.ijs.si/ME/V4/msd/html/ + # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html + + return [[21, + 'A', + ['g', 's'], + ['p', 'c', 's'], + ['m', 'f', 'n'], + ['s', 'd', 'p'], + ['n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [3, 'C', ['c', 's']], + [1, 'I'], + [21, + 'M', + ['l'], + ['-', 'c', 'o', 's'], + ['m', 'f', 'n'], + ['s', 'd', 'p'], + ['n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [17, + 'N', + ['c'], + ['m', 'f', 'n'], + ['s', 'd', 'p'], + ['n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [40, + 'P', + ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], + ['-', '1', '2', '3'], + ['-', 'm', 'f', 'n'], + ['-', 's', 'd', 'p'], + ['-', 'n', 'g', 'd', 'a', 'l', 'i'], + ['-', 's', 'd', 'p'], + ['-', 'm', 'f', 'n'], + ['-', 'y', 'b']], + [1, 'Q'], + [5, 'R', ['g'], ['p', 'c', 's']], + [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']], + [24, + 'V', + ['m'], + ['-'], + ['n', 'u', 'p', 'r', 'f', 'c'], + ['-', '1', '2', '3'], + ['-', 's', 'p', 'd'], + ['-', 'm', 'f', 'n'], + ['-', 'n', 'y']] + ] + + +def complete_feature_dict(): + # old: http://nl.ijs.si/ME/Vault/V3/msd/html/ + # new: http://nl.ijs.si/ME/V4/msd/html/ + # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html + return [[27, + 'A', + ['-', 'g', 's', 'p'], + ['-', 'p', 'c', 's'], + ['-', 'm', 'f', 'n'], + ['-', 's', 'd', 'p'], + ['-', 'n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [4, 'C', ['-', 'c', 's']], + [1, 'I'], + [28, + 'M', + ['-', 'd', 'r', 'l'], + ['-', 'c', 'o', 'p', 's'], + ['-', 'm', 'f', 'n'], + ['-', 's', 'd', 'p'], + ['-', 'n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [22, + 'N', + ['-', 'c', 'p'], + ['-', 'm', 'f', 'n'], + ['-', 's', 'd', 'p'], + ['-', 'n', 'g', 'd', 'a', 'l', 'i'], + ['-', 'n', 'y']], + [41, + 'P', + ['-', 'p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'], + ['-', '1', '2', '3'], + ['-', 'm', 'f', 'n'], + ['-', 's', 'd', 'p'], + ['-', 'n', 'g', 'd', 'a', 'l', 'i'], + ['-', 's', 'd', 'p'], + ['-', 'm', 'f', 'n'], + ['-', 'y', 'b']], + [1, 'Q'], + [8, 'R', ['-', 'g', 'r'], ['-', 'p', 'c', 's']], + [8, 'S', ['-', 'n', 'g', 'd', 'a', 'l', 'i']], + [31, + 'V', + ['-', 'm', 'a'], + ['-', 'e', 'p', 'b'], + ['-', 'n', 'u', 'p', 'r', 'f', 'c', 'm'], + ['-', '1', '2', '3'], + ['-', 's', 'p', 'd'], + ['-', 'm', 'f', 'n'], + ['-', 'n', 'y']] + ] + + +def check_feature_letter_usage(X_other_features, feature_dictionary): + case_numbers = np.sum(X_other_features, axis=0) + arrays = [1] * 164 + letters = list(decode_X_features(feature_dictionary, [arrays])) + print(sum(case_numbers)) + for i in range(len(letters)): + print(letters[i] + ': ' + str(case_numbers[i])) + + +def dict_occurances_in_dataset_rate(content): + feature_dictionary = complete_feature_dict() + # case = 3107 + # print(content[case]) + # print(feature_dictionary) + # X_other_features = create_X_features([content[case]], feature_dictionary) + X_other_features = create_X_features(content, feature_dictionary) + # print(X_other_features) + # print(decode_X_features(feature_dictionary, X_other_features)) + X_other_features = np.array(X_other_features) + + case_numbers = np.sum(X_other_features, axis=0) + print(case_numbers) diff --git a/tex_hyphenation.py b/tex_hyphenation.py new file mode 100644 index 0000000..85867d8 --- /dev/null +++ b/tex_hyphenation.py @@ -0,0 +1,101 @@ +import sys +sys.path.insert(0, '../../../') +from prepare_data import * +dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict() +feature_dictionary = create_feature_dictionary(content) + + +def read_hyphenation_pattern(): + with open('../../../hyphenation') as f: + content = f.readlines() + return [x[:-1] for x in content] + + +def find_hyphenation_patterns_in_text(text, pattern): + res = [] + index = 0 + while index < len(text): + index = text.find(pattern, index) + if index == -1: + break + res.append(index) + index += 1 # +2 because len('ll') == 2 + + return res + + +def create_hyphenation_dictionary(hyphenation_pattern): + dictionary = [] + for el in hyphenation_pattern: + substring = '' + anomalies_indices = [] + digit_location = 0 + for let in list(el): + if let.isdigit(): + anomalies_indices.append([digit_location, int(let)]) + else: + substring += let + digit_location += 1 + dictionary.append([substring, anomalies_indices]) + return dictionary + + +def split_hyphenated_word(split, word): + split = split[2:-2] + print(split) + word = list(word)[1:-1] + res = [] + hyphenate = '' + loc = 0 + for let in word: + hyphenate += let + if loc == len(split) or split[loc] % 2 == 1: + res.append(hyphenate) + hyphenate = '' + loc += 1 + return res + + +def hyphenate_word(word, hyphenation_dictionary): + word = word.replace('è', 'č') + word = '.' + word + '.' + split = [0] * (len(word) + 1) + for pattern in hyphenation_dictionary: + pattern_locations = find_hyphenation_patterns_in_text(word, pattern[0]) + for pattern_location in pattern_locations: + for el_hyphenation_dictionary in pattern[1]: + if split[pattern_location + el_hyphenation_dictionary[0]] < el_hyphenation_dictionary[1]: + split[pattern_location + el_hyphenation_dictionary[0]] = el_hyphenation_dictionary[1] + return split_hyphenated_word(split, word) + + +hyphenation_pattern = read_hyphenation_pattern() +# ['zz', [{0:2},{1:1},{2:2}]] +hyphenation_dictionary = create_hyphenation_dictionary(hyphenation_pattern) +separated_word = hyphenate_word('izziv', hyphenation_dictionary) +print(separated_word) + +all_words = [] +i = 0 +for el in content: + separated_word = hyphenate_word(el[0], hyphenation_dictionary) + all_words.append([el[0], separated_word]) + if i % 10000 == 0: + print(str(i)+'/'+str(len(content))) + i += 1 + +errors = [] +errors2 = [] +for word in all_words: + for hyphenated_part in word[1]: + num_vowels = 0 + for let in list(hyphenated_part): + if let in vowels: + num_vowels += 1 + if num_vowels == 0: + for let in list(hyphenated_part): + if let == 'r': + errors2.append(word[0]) + num_vowels += 1 + if num_vowels != 1: + errors.append(word) \ No newline at end of file