diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index cc8febf..576c140 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,8 +2,8 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch.h5" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch.h5" />
-      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch_history.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch_history.pkl" />
+      <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/tex_hyphenation.py" />
+      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
     </list>
@@ -35,8 +35,8 @@
       <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/prepare_data.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="442">
-              <caret line="462" column="19" lean-forward="false" selection-start-line="462" selection-start-column="4" selection-end-line="462" selection-end-column="19" />
+            <state relative-caret-position="284">
+              <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
               <folding>
                 <element signature="e#24#63#0" expanded="true" />
               </folding>
@@ -44,6 +44,16 @@
           </provider>
         </entry>
       </file>
+      <file leaf-file-name="tex_hyphenation.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tex_hyphenation.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="1206">
+              <caret line="67" column="105" lean-forward="false" selection-start-line="67" selection-start-column="105" selection-end-line="67" selection-end-column="105" />
+              <folding />
+            </state>
+          </provider>
+        </entry>
+      </file>
       <file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/workbench.py">
           <provider selected="true" editor-type-id="text-editor">
@@ -139,6 +149,17 @@
       <find>StringIO</find>
       <find>shuffle_inputs</find>
       <find>generator</find>
+      <find>content, feature_dictionary</find>
+      <find>decode</find>
+      <find>create_feature_dictionary</find>
+      <find>with</find>
+      <find>read</find>
+      <find>generate</find>
+      <find>shuffle</find>
+      <find>X_</find>
+      <find>dictionary</find>
+      <find>create_dict</find>
+      <find>split_content</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -157,6 +178,7 @@
         <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
         <option value="$PROJECT_DIR$/workbench.py" />
         <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
+        <option value="$PROJECT_DIR$/tex_hyphenation.py" />
         <option value="$PROJECT_DIR$/prepare_data.py" />
       </list>
     </option>
@@ -165,7 +187,7 @@
     <option name="x" value="65" />
     <option name="y" value="24" />
     <option name="width" value="1855" />
-    <option name="height" value="1056" />
+    <option name="height" value="1176" />
   </component>
   <component name="ProjectView">
     <navigator currentView="ProjectPane" proportions="" version="1">
@@ -182,7 +204,6 @@
       <foldersAlwaysOnTop value="true" />
     </navigator>
     <panes>
-      <pane id="Scratches" />
       <pane id="ProjectPane">
         <subPane>
           <PATH>
@@ -195,23 +216,10 @@
               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
             </PATH_ELEMENT>
           </PATH>
-          <PATH>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="accetuation" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-            <PATH_ELEMENT>
-              <option name="myItemId" value="cnn" />
-              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
-            </PATH_ELEMENT>
-          </PATH>
         </subPane>
       </pane>
       <pane id="Scope" />
+      <pane id="Scratches" />
     </panes>
   </component>
   <component name="PropertiesComponent">
@@ -474,7 +482,7 @@
     <servers />
   </component>
   <component name="ToolWindowManager">
-    <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
+    <frame x="65" y="24" width="1855" height="1176" extended-state="6" />
     <editor active="true" />
     <layout>
       <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
@@ -495,25 +503,6 @@
       <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
       <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.39979124" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
     </layout>
-    <layout-to-restore>
-      <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
-      <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
-      <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
-      <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
-      <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
-      <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
-      <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" />
-      <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
-      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
-      <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" />
-      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
-      <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
-      <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
-      <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
-      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
-      <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" />
-      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.39979124" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
-    </layout-to-restore>
   </component>
   <component name="VcsContentAnnotationSettings">
     <option name="myLimit" value="2678400000" />
@@ -529,17 +518,6 @@
     <watches-manager />
   </component>
   <component name="editorHistoryManager">
-    <entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="0">
-          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
-          <folding>
-            <element signature="e#0#18#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/theanoTest.py" />
     <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="1368">
@@ -822,16 +800,6 @@
       </provider>
     </entry>
     <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn_test_on_other_attributes.ipynb" />
-    <entry file="file://$PROJECT_DIR$/prepare_data.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="442">
-          <caret line="462" column="19" lean-forward="false" selection-start-line="462" selection-start-column="4" selection-end-line="462" selection-end-column="19" />
-          <folding>
-            <element signature="e#24#63#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="162">
@@ -882,6 +850,14 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/theanoTest.py" />
     <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
+    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
+    <entry file="file://$PROJECT_DIR$/hyphenation">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="0">
+          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
+        </state>
+      </provider>
+    </entry>
     <entry file="file://$PROJECT_DIR$/workbench.py">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="396">
@@ -892,6 +868,23 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
+    <entry file="file://$PROJECT_DIR$/tex_hyphenation.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="1206">
+          <caret line="67" column="105" lean-forward="false" selection-start-line="67" selection-start-column="105" selection-end-line="67" selection-end-column="105" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/prepare_data.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="284">
+          <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
+          <folding>
+            <element signature="e#24#63#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
   </component>
 </project>
\ No newline at end of file
diff --git a/hyphenation b/hyphenation
new file mode 100644
index 0000000..01d18cc
--- /dev/null
+++ b/hyphenation
@@ -0,0 +1,1113 @@
+.av5r
+.di6spo
+.ek3s
+.ek5v
+.is1
+.iz1
+.obi4d
+.ob5it
+.od1
+.po4d5n
+.po4v5s
+.pre6d7n
+.se4k5s
+.si4s
+.st4
+.voz5l
+.voz5n
+.zliz6
+a1a
+a1b
+ab5ba
+ab6rod
+a1c
+ac5ci
+a1č
+a1d
+ad2l
+a6dobl
+ad6rl.
+ad6rla
+ad6rob
+ad5ur
+a1e1
+a1f
+af5ga
+af1t
+a1g
+a1h
+a4hm
+ah5mi
+ah5mo
+a1i
+ai2n1
+a1j
+a4j5ek
+a4jf
+aj5fi
+aj5fo
+aj5ha
+aj5he
+aj5im
+aj6imo
+aj3os
+aj6stb
+a5ju.
+aj3uč
+aj3ug
+aj5žn
+a1k
+ak4s
+a4kst
+a1l
+a1m
+a4mz
+a1n
+an6dga
+an6dhi
+a4nm
+an5mi
+an5zi
+a1o
+ao2b1
+a1p
+a4ph
+a1ra
+ar6dwa
+a1re
+a1ri
+a1ro
+a1ru
+ar5xa
+ar5xo
+ar5xu
+a1s
+a4sš
+as5šč
+a1š
+a1t
+a4tf
+at4i
+a1u1
+a4uf
+a2uk
+a4ul
+a1v
+av5ši
+a4vž
+av5ža
+ay5to
+a1ze
+az5fo
+a4zig
+az3la
+az3le
+az4lil
+az4lit
+az4liv
+a4zob
+a4z3oč
+az5ora
+az5oro
+a4zra
+az4red
+az5vp
+a1ž
+až5mi
+ba6bba
+ban3č4
+ba4u
+2b1c
+2b1č
+2b1d
+be1
+be4v
+b1h
+bi1
+b1ja
+b4ja.
+b5jel
+b3jem
+b5jet
+2b1k
+b3lep
+b5leta
+b5lil
+b5lit
+b5liv
+b1m
+4bmi
+2b1n
+bo1
+bo6chm
+b5ord
+bo5vp
+b3rab
+b5ras
+b3raš
+b3rez
+bre4zg
+bre4zi
+bre4zr
+b5reže
+b3rob
+br6žda
+2b1s
+2b1š
+2b1t
+bu5ki
+bu5ku
+bu5kv
+bu5ry
+2b1v
+b1z
+b1ž
+2cc
+2ch.
+ch5ma
+2ck
+c1ka
+ck1o2
+c5ko.
+ckov3
+ck1s
+ck5we
+2c1n
+2c1t
+2č1b
+2č1g
+či1
+1čj
+2č1k
+1čl
+4č3let
+č5mes
+2č1n
+4čop
+2č1p
+2č1s
+4čup
+2d1b
+2d1c
+2d1č
+2d1d
+dd6voj
+d2e
+6d5elem
+de4min
+de4mn
+de4z3i
+2d1g
+2d1h
+di5ck
+4dind
+d4i5no
+dis1
+di4skr
+di6spr
+2d1j
+2d1k
+5dlet
+d2li
+d5lit
+d5liv
+d1lo
+2d3m
+4d3nac
+4d5nač
+4d5nap
+4d3nar
+4dnas
+4d5neb
+d5niv
+4d5niz
+4d5njač
+4d3nož
+d2o
+4dobč
+4d5obd
+2d3o2f
+do5rd
+do5vč
+do5v4z
+2d1p
+d5raz
+d3rep
+dre6pn
+d4rev
+2d1s
+2d1š
+2d1t
+dteks6
+d4ur
+du5ro
+du5um
+2d1v
+4d3vi
+2d1z2
+e1a
+e1b
+eb4j
+eb6liz
+e1c
+e1č
+e4čd
+eč5de
+eč5di
+eč5do
+eč3le
+eč5op
+e4čt
+eč5ti
+eč5to
+eč5tr
+eč5up
+e2č1v
+eč6vrs
+e1d
+e4df
+ed5ig
+ed2l
+ed5ob
+ed6obe
+ed6obr
+e4dobs
+e4d3oč
+ed5vč
+ed5zb
+e1e
+e4ep
+e1f
+e4ff
+ef5fe
+ef5ta
+e1g
+e1h
+e1i
+ei6pzi
+ei2z
+eiz5e
+e1j
+e1k
+ek6mal
+ek6tre
+e1l
+e1m
+e1n
+e1o1
+eob4j
+eob4r
+eo4dl
+eo4z5n
+e1p
+ep5nik
+e1ra
+era6z5l
+era5z4r
+era5z4v
+e1re
+e4rf
+e1ri
+e1ro
+e4rr
+e1ru
+e1s
+es5da
+e5sta
+e5sti.
+e5stih
+e5stil
+e1š
+e4šp
+eš5po
+e1t
+4eth
+e4tinš
+e1u1
+e1v
+eve6t5l
+ev5ha
+ev6pre
+ev6ste
+ev5stv
+2ew
+ew6ind
+ew5le
+e4wt
+ew5to
+e4yw
+e1z
+ez5dj
+e3z4dr
+ez2g
+ez5gl
+e5zij
+ez6ijo
+ez5imn
+e5zis
+ez6ist
+ez5iz
+ez4l
+ez6lom
+ez6man
+ez4mo
+e4zob
+e4z5or
+ez4re
+e4zt
+e4z5u4m5
+e4zž
+e1ž
+1fa
+fe1
+fe6ljt
+ff5ma
+fi6zlj
+2f1n
+fo6uri
+fre4u
+2f1s
+2ft
+ft5ve
+fu1
+2g1d
+ge6ige
+gel5č4
+ge6njč
+gi6tpr
+go1
+go5vz
+2g1t
+gu1
+ha4u
+2h1č
+he4i
+2h1k
+4hl.
+h4lo
+2h1n
+h5ren
+2h1š
+2h1t
+1hu
+hu6ffm
+i1a
+i1b
+i1c
+i4cs
+i1ča
+i1če
+i1či
+ič5ra
+i1ču
+ič5vr
+i1d
+4idor
+i1e1
+i1f
+i1g
+4igh
+i1h
+i1i
+ii2n1
+i1j
+i1k
+i4kč
+ik5ča
+i1l
+il5č4k
+4ile
+4ilo
+i1m
+i4mh
+im5hi
+i1n
+1ind
+2ine
+3i4n3os
+1inp
+3inse
+1inš
+4inšk
+3intr
+i1o1
+i1p
+i1r
+4ire
+i1s
+is4a
+is6ert
+isis4
+i4skv
+2iss
+i1š
+i1t
+it5pr
+i1u
+i1v
+iv5jo
+i1x
+i1z
+iz1l
+iz4la
+izli4z
+iz5me
+iz5mo
+iz6ode
+iz5po
+i2zr
+iz1u
+iz6ure
+i1ž
+j5akt
+2j1b
+2j1c
+2j1č
+2j1d
+je4ks4
+2j1g
+2jh
+j1hi
+4jime
+4j5int
+2j1k
+2j1l
+2j1m
+2j1n
+4job
+2j1od
+jod4l
+2jos
+4jož
+2j1p
+2j1r
+jra1
+jraz4
+2j1s
+jsis6t
+2j1š
+2j1t
+ju1
+2juč
+ju5dm
+2jus
+ju2ž1
+2j1v
+2j1z
+jz6ves
+2k1c
+2k1d
+ke5ti
+ki1
+2k1m
+1kn
+ko1
+kok4
+ko5kd
+ko6vše
+koz6lo
+1kre
+2ks.
+k5sat
+ks1c
+ks1p
+ks4po
+ks1t
+4kst.
+ks6taz
+ks5te
+2k1t
+3ktr
+4ktra
+ku5ro
+k5vip
+la4ir
+la6vz.
+2l1b
+2l1c
+2l1č
+2l1d
+le1
+le4e
+le6ipz
+le5me
+2l1f
+2l1g
+lg5ča
+2l1h
+l2i1
+li6dž.
+1liz
+4l5izd
+2lj.
+4ljc
+2ljč
+2ljk
+2ljn
+2ljs
+2ljš
+lju5d6j
+2l1k
+2l1l
+2l1m
+2l1n
+lo1
+1loč
+2l1p
+2l1s
+2l1š
+2l1t
+lu5ki
+lu5ku
+2l1v
+2l1z
+2l1ž
+2m1b
+2m1c
+2m1č
+2m1d
+me4d5n
+me6dos
+me4dr
+2m1f
+4mind
+4minp
+4minš
+mi6th.
+2m1k
+2m1m
+m5niv
+mo6št.
+mo6vš.
+2m1p
+2m1s
+2m1š
+2m1t
+m5urn
+2m1v
+my5hi
+2m1ž
+na1
+5načel
+na4d5nj
+nad5r
+na6dra
+na4dre
+na6dur
+1naj
+na6jak
+na4j5en
+naj3o
+na6joč
+na4j3u
+1nas
+na4v3z
+navze6
+1naz
+naz6or
+2n1b
+2n1c
+2nč
+n1ča
+n1če
+n1či
+n1ču
+2n3d2
+nd5ga
+nd5hi
+n4dm
+ne1
+ne3d2
+1neh
+ne3zm
+nez4v
+2n1f
+2n1g
+n4gh
+ng5ha
+n4gv
+ng5vi
+2n1h
+2nj.
+2njc
+nje4v5s
+2njk
+2njs
+2njš
+4njv
+2n1k
+2n1l
+2n1n
+no5rd
+n4ost
+2n1p
+2n1s
+nsis4
+2n1š
+2n1t
+nteks4
+n4tg
+nt5ga
+nt5ge
+n4tv
+nt5vi
+nu1
+2n1v
+ny5qu
+2n1z
+nz4i
+2n1ž
+o1a
+o4as
+o1b
+ob5gl
+ob5ide
+ob5jo
+5obla
+5obro
+o4bz
+o1c
+oc5ke
+oc5ki
+o4cr
+o1č
+o1d
+od5dv
+od5nal
+o6drep
+od5zd
+o2d1ž
+o1e
+oele4
+o1f
+o1g
+4ogl
+o1h
+o1i
+oiz2
+o1j
+o1k
+o4kb
+ok5ba
+ok5be
+o4kt
+o1l
+o6l5avt
+ol6gča
+o4lr
+ol5re
+o1m
+o1n
+o1o
+ood4l
+o2ol
+o4om
+o1p
+o4pm
+op5me
+4opy
+o1ra
+or4deč
+o1re
+o1ri
+o1ro
+o1ru
+o1s
+5oseb
+ose4m5
+o1š
+o1t
+o1u
+ou5ki
+ou5ku
+o1v
+ov5sem
+ov5šk
+o2v1z
+o5vza
+ov3zd
+o1y
+o1z
+oz4b
+ozd5j
+oz4g
+oz5lo
+oz6lož
+oz2n
+oz5nic
+oz5niš
+oz2o
+oz2r
+oz2v
+o1ž
+ož5mi
+2p1c
+2p3č2
+pč5ka
+pe1
+1peč
+pe4kt
+pet3l
+pe4tle
+pe4v5s
+pev5t4
+4phs
+ph5so
+pi5zo
+2p1k
+4ploz
+po1
+po6dfa
+po4d3l
+po4dna
+po4d5oč
+po6lob
+po6std
+prez4
+2p1s
+2p1š
+2p1t
+pz6ig.
+qu2
+3raču
+2rae
+ra6jžn
+rav5z
+ra6vza
+ra4z5id
+3razl
+ra4z5or
+2r1b
+2r1c
+2r1č
+2r1d
+re1
+3real
+re6cht
+re5čv
+5redč
+re6dig
+re6dnju
+re6iba
+re5jo
+re5km
+re6sda
+rev6sk
+re6znač
+re6zus
+re6zve
+r1f
+2r1g
+2r1h
+ri1
+r4in
+ri5n4o
+riz4g
+riz4l
+riz4n
+2r1j
+2r1k
+2r1l
+2r1m
+2r1n
+ro1
+rob6id
+3rodi
+ro5zo
+2r1p
+r1r
+2r1s
+2r1š
+2r1t
+r4th
+rt5ha
+ru5kl
+2r1v
+r3v2j
+rv5jo
+ry5an
+2r1z
+rz2l
+r1ž
+rž5da
+2s1b
+1sc
+4sc.
+s2ci
+se4k5sa
+sek5si
+se5ma
+se5vp
+2s1f
+si1
+s4id
+si6gn.
+sis1
+2s1j
+2sk.
+s2kn
+4skre
+s4lav
+s4on
+soni5
+sonič4
+1sp
+s4plod
+spod4l
+2s1s
+2st.
+3ste
+s4ten
+4stf
+s4tič
+5stim
+s4tir
+2stk
+2stm
+1str
+s4tra.
+su1
+su4bo
+sve5t
+š2č
+2šč.
+2ščk
+2ščn
+še2s
+2š1j
+ta5wi
+taz4
+2t1b
+2t1c
+tch5o
+2t1d
+tek6st
+5tema
+te5xa
+t1f
+4tind
+4tinos
+4tinp
+4tinse
+4t3int
+2t1k
+6tletno
+2t1m
+4tnaj
+to6vž.
+trt5u
+tr6tur
+2t1s
+2t1t
+tu1
+4tz.
+2u1a
+u1b
+ub4j
+u4bp
+ub5po
+u1c
+u1č
+u1d
+ud6mi.
+u1e
+u1f
+u1g
+u1h
+u1i
+u1j
+u1ka
+u1ke
+u1ko
+u1l
+u1m
+u1n
+u1p
+up6čka
+u1ra
+u1re
+4urg
+u1ri
+u1s
+1usp
+u1š
+uše3s
+u1t
+u4th
+uth5o
+u1v
+ux5em
+u1z
+u1ž
+2v1b
+2v1c
+2vč
+v1ča
+v1če
+v4čer
+v1či
+2v1d
+ve4čl
+ve4čm
+ve4i
+ve4tin
+vetle6t
+v1f
+v1g
+vi5dv
+vid6va
+1viv
+vi6žg.
+2v1j
+4vjo
+2v1k
+2v1m
+2v1n
+vo5rd
+voz5le
+2v1p
+3v2pa
+v4pij
+v4pil
+v5skn
+v5šek
+4všk
+2v1t
+vt4k
+vz2
+v2za
+3v2zg
+2v3zk
+2vzo
+v3zp
+v2zu
+1wa
+wo2
+x1f
+1ye
+2y1f
+y1j
+y1l
+y1w
+1z2a
+z6ane.
+za5uk
+za3vp
+za1z2
+za5zd
+2z1b
+3zbir
+z1c
+2z1č
+2z1d2
+zd5ju
+z3dv
+z1g
+z4gni
+z5got
+2z1h
+1zi
+z1ig
+2z1is
+4z5išč
+2z1j
+2z1k
+z3ku
+z5las
+z1li
+3zlil
+5zlit
+5zliv
+zliz5
+1zlj
+3zlog
+z5lom
+3zlož
+z1lu
+2z1m
+1zn
+1zo
+z1ob
+2z1od
+z1og
+z2ol
+z4om
+2z1p
+1z1r
+4zredč
+4zreš
+4zrez
+4zrež
+4zri
+4zru
+2z1s
+z1š
+z1t
+1zu
+z4uj
+2z1up
+2z1uz
+z1v2
+z4ven
+z3vn
+3z4voj
+z4vok
+2z1z2
+z1ž
+2ž1b
+2ž1c
+2ž1č
+2ž1j
+2ž1k
+4žmi
+.č8
+.š8
+.ž8
+8ž.
+8š.
+8č.
+8b.
+8c.
+8d.
+8f.
+8g.
+8h.
+8j.
+8k.
+8l.
+8m.
+8n.
+8p.
+8r.
+8s.
+8t.
+8v.
+8z.
+8x.
+8y.
+8w.
+8q.
+.b8
+.c8
+.d8
+.f8
+.g8
+.h8
+.j8
+.k8
+.l8
+.m8
+.n8
+.p8
+.r8
+.s8
+.t8
+.v8
+.z8
+.x8
+.y8
+.w8
+.q8
+.i4z1
+.e2k3s
+.e2k5v
diff --git a/prepare_data.py b/prepare_data.py
index 0d38a6b..afda497 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -7,6 +7,7 @@ import h5py
 import gc
 import math
 import keras.backend as K
+import os.path
 
 
 # functions for saving, loading and shuffling whole arrays to ram
@@ -34,9 +35,15 @@ def load_inputs(file_name, other_features=False):
     return X, y
 
 
-def shuffle_inputs(X, y, X_pure=[]):
-    s = np.arange(X.shape[0])
-    np.random.shuffle(s)
+def shuffle_inputs(X, y, shuffle_vector_location, X_pure=[]):
+    if os.path.exists(shuffle_vector_location):
+        s = load_shuffle_vector(shuffle_vector_location)
+    else:
+        s = np.arange(X.shape[0])
+        np.random.shuffle(s)
+        create_and_save_shuffle_vector(shuffle_vector_location, s)
+    # s = np.arange(X.shape[0])
+    # np.random.shuffle(s)
     X = X[s]
     y = y[s]
     if X_pure != []:
@@ -57,7 +64,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
 
 
 def load_extended_inputs(file_name, obtain_range):
-    h5f = h5py.File(file_name,'r')
+    h5f = h5py.File(file_name, 'r')
     X = h5f['X'][obtain_range[0]:obtain_range[1]]
     y = h5f['y'][obtain_range[0]:obtain_range[1]]
     X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]]
@@ -69,16 +76,17 @@ def load_extended_inputs(file_name, obtain_range):
 # functions for creating and loading shuffle vector
 def create_and_save_shuffle_vector(file_name, shuffle_vector):
     # X, y, X_pure = generate_full_vowel_matrix_inputs()
-    h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w')
-    adict=dict(shuffle_vector=shuffle_vector)
+    h5f = h5py.File(file_name, 'w')
+    adict = dict(shuffle_vector=shuffle_vector)
     for k, v in adict.items():
-        h5f.create_dataset(k,data=v)
+        h5f.create_dataset(k, data=v)
     h5f.close()
 
 
 def load_shuffle_vector(file_name):
-    h5f = h5py.File(file_name,'r')
-    shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
+    h5f = h5py.File(file_name, 'r')
+    # read the complete 'shuffle_vector' dataset into memory before the file is closed
+    shuffle_vector = h5f['shuffle_vector'][:]
 
     h5f.close()
     return shuffle_vector
@@ -138,7 +146,8 @@ def create_dict():
     vowels.extend(accetuated_vowels)
     vowels.extend(default_vowels)
 
-    dictionary = ['']
+    dictionary_output = ['']
+    dictionary_input = ['']
     line = 0
     max_word = 0
     # ADD 'EMPTY' VOWEL
@@ -154,12 +163,12 @@ def create_dict():
             for c in list(el[3]):
                 if is_vowel(list(el[3]), i, vowels):
                     num_vowels += 1
-                if c not in dictionary:
-                    dictionary.append(c)
+                if c not in dictionary_output:
+                    dictionary_output.append(c)
                 i += 1
             for c in list(el[0]):
-                if c not in dictionary:
-                    dictionary.append(c)
+                if c not in dictionary_input:
+                    dictionary_input.append(c)
             if num_vowels > max_num_vowels:
                 max_num_vowels = num_vowels
         except Exception:
@@ -167,10 +176,10 @@ def create_dict():
             print(el)
             break
         line += 1
-    dictionary = sorted(dictionary)
+    dictionary_input = sorted(dictionary_input)
     max_num_vowels += 1
     print('DICTIONARY CREATION SUCCESSFUL!')
-    return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
+    return dictionary_input, max_word, max_num_vowels, content, vowels, accetuated_vowels
 
 
 # GENERATE X and y
@@ -272,7 +281,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
 #     return X, y
 
 
-def generate_full_matrix_inputs():
+def generate_full_matrix_inputs(content_shuffle_vector_location, shuffle_vector_location):
+    """Build shuffled X, X_other_features and y for the train, test and validate parts, in that order."""
+    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+    train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location)
+    feature_dictionary = create_feature_dictionary()
+    print('GENERATING X AND y...')
+    outputs = []
+    # each part gets its own shuffle vector file: shuffle_vector_location + '_<part>.h5'
+    for part, suffix in ((train_content, '_train.h5'), (test_content, '_test.h5'), (validate_content, '_validate.h5')):
+        outputs.extend(generate_X_and_y(dictionary, max_word, max_num_vowels, part, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location + suffix))
+    print('GENERATION SUCCESSFUL!')
+    return tuple(outputs)
+
+
+# generate full matrix, with old features
+def old_generate_full_matrix_inputs():
     dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
     train_content, validate_content = split_content(content, 0.2)
     feature_dictionary = create_feature_dictionary(content)
@@ -286,7 +310,7 @@ def generate_full_matrix_inputs():
 
 
 # Generate each y as an array of 11 numbers (with possible values between 0 and 1)
-def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary):
+def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location):
     y = np.zeros((len(content), max_num_vowels))
     X = np.zeros((len(content), max_word, len(dictionary)))
     print('CREATING OTHER FEATURES...')
@@ -328,7 +352,7 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
         i += 1
 
     print('SHUFFELING INPUTS...')
-    X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features)
+    X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
     print('INPUTS SHUFFELED!')
     return X, X_other_features, y
 
@@ -559,6 +583,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
 
 # Decoders for inputs and outputs
 def decode_X_features(feature_dictionary, X_other_features):
+    final_word = []
     for word in X_other_features:
         final_word = []
         i = 0
@@ -574,6 +599,7 @@ def decode_X_features(feature_dictionary, X_other_features):
                             final_word.append(feature_dictionary[z][j][k])
                         i += 1
         print(u''.join(final_word))
+    return u''.join(final_word)
 
 
 def decode_position(y, max_num_vowels):
@@ -650,7 +676,37 @@ def decode_position_from_vowel_to_final_number(y):
 
 
 # split content so that there is no overfitting
-def split_content(content, ratio):
+def split_content(content, test_and_validation_ratio, content_shuffle_vector_location, validation_ratio=0.5):
+    """Split content into train/test/validate lists so that all entries sharing a key land in the same part.
+
+    The key of an entry is el[1], or el[0] when el[1] is '='. Unique keys are ranked by a permutation that
+    is loaded from content_shuffle_vector_location when that file exists and is created and saved otherwise.
+    Keys ranked below floor(n * test_and_validation_ratio) are held out; the fraction validation_ratio of
+    those goes to validation and the rest to test. Returns (train_content, test_content, validate_content).
+    Raises ValueError when a stored permutation does not match the current number of unique keys.
+    """
+    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
+    unique_content = sorted(set(expanded_content))
+    if os.path.exists(content_shuffle_vector_location):
+        s = load_shuffle_vector(content_shuffle_vector_location)
+        if len(s) != len(unique_content):
+            # A stale vector would silently drop keys (too short) or raise IndexError (too long).
+            raise ValueError('shuffle vector in %s has length %d, expected %d' % (content_shuffle_vector_location, len(s), len(unique_content)))
+    else:
+        s = np.arange(len(unique_content))
+        np.random.shuffle(s)
+        create_and_save_shuffle_vector(content_shuffle_vector_location, s)
+    split_num = math.floor(len(unique_content) * test_and_validation_ratio)
+    validation_num = math.floor(split_num * validation_ratio)
+    rank = {key: s[i] for i, key in enumerate(unique_content)}
+    parts = ([], [], [])  # train, test, validate
+    for el, key in zip(content, expanded_content):
+        parts[0 if rank[key] >= split_num else 1 if rank[key] >= validation_num else 2].append(el)
+    return parts
+
+
+# split content so that there is no overfitting, without splitting the held-out part into validation and test data
+def old_split_content(content, ratio):
     expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
     # print(len(content))
     unique_content = sorted(set(expanded_content))
@@ -671,8 +727,8 @@ def split_content(content, ratio):
     return train_content, validate_content
 
 
-#  create feature dictionary
-def create_feature_dictionary(content):
+# X features that use MULTEXT-East v3 as their encoding
+def create_old_feature_dictionary(content):
     additional_data = [el[2] for el in content]
     possible_variants = sorted(set(additional_data))
     categories = sorted(set([el[0] for el in possible_variants]))
@@ -690,7 +746,8 @@ def create_feature_dictionary(content):
     return feature_dictionary
 
 
-def create_X_features(content, feature_dictionary):
+# X features that use MULTEXT-East v3 as their encoding
+def create_old_X_features(content, feature_dictionary):
     content = content
     X_other_features = []
     for el in content:
@@ -707,4 +764,213 @@ def create_X_features(content, feature_dictionary):
             else:
                 X_el_other_features.extend([0] * feature[0])
         X_other_features.append(X_el_other_features)
-    return np.array(X_other_features)
\ No newline at end of file
+    return np.array(X_other_features)
+
+
+def convert_to_MULTEXT_east_v4(old_features, feature_dictionary):
+    """Rearrange the characters of a MULTEXT-East v3 tag into the v4 layout of feature_dictionary.
+
+    :param old_features: sequence of tag characters, category letter first
+    :param feature_dictionary: rows in the fixed order A, C, I, M, N, P, Q, R, S, V; the result is cut to len(row) - 1
+    :return: list of characters for those categories, '' for any other category or an empty tag
+
+    Characters are read from a copy padded with '-' to nine places, so a tag shorter than its category
+    expects no longer raises IndexError (the noun branch read index 7 of any tag longer than 5).
+    """
+    new_features = ['-'] * 9
+    new_features[:len(old_features)] = old_features
+    old = list(new_features)  # padded snapshot; new_features itself is rewritten below
+    category = old[0]
+    if category == 'A':
+        if old[1] in ('f', 'o'):
+            new_features[1] = 'g'
+        return new_features[:len(feature_dictionary[0]) - 1]
+    if category == 'C':
+        return new_features[:len(feature_dictionary[1]) - 1]
+    if category == 'I':
+        return new_features[:len(feature_dictionary[2]) - 1]
+    if category == 'M':
+        # old position 5 moves to position 1; old positions 1-4 shift to 2-5
+        new_features[2:6] = old[1:5]
+        new_features[1] = old[5]
+        if new_features[2] == 'm':
+            new_features[2] = '-'
+        return new_features[:len(feature_dictionary[3]) - 1]
+    if category == 'N':
+        new_features[5] = old[7]  # '-' unless the tag really has an eighth character
+        return new_features[:len(feature_dictionary[4]) - 1]
+    if category == 'P':
+        if new_features[8] == 'n':
+            new_features[8] = 'b'
+        return new_features[:len(feature_dictionary[5]) - 1]
+    if category == 'Q':
+        return new_features[:len(feature_dictionary[6]) - 1]
+    if category == 'R':
+        return new_features[:len(feature_dictionary[7]) - 1]
+    if category == 'S':
+        new_features[1] = old[3] if len(old_features) == 4 else '-'
+        return new_features[:len(feature_dictionary[8]) - 1]
+    if category == 'V':
+        if old[1] in ('o', 'c'):
+            new_features[1] = 'm'
+        new_features[2] = '-'
+        # old position 3 decides when it is 'p' or 'f'; otherwise old position 2 is kept, with 'i' mapped to 'r'
+        new_features[3] = {'p': 'r', 'f': 'f'}.get(old[3], 'r' if old[2] == 'i' else old[2])
+        new_features[7] = old[8]
+        return new_features[:len(feature_dictionary[9]) - 1]
+    return ''
+
+
+def create_X_features(content, feature_dictionary):
+    """One-hot encode the tag el[2] of every entry in content, after convert_to_MULTEXT_east_v4.
+
+    A feature_dictionary row of the tag's category appends a category flag plus one flag per listed value;
+    every other row appends row[0] zeros. Unknown categories give an all-zero row, not an IndexError.
+    """
+    X_other_features = []
+    for el in content:
+        converted_el = ''.join(convert_to_MULTEXT_east_v4(list(el[2]), feature_dictionary))
+        X_el_other_features = []
+        for feature in feature_dictionary:
+            if converted_el[:1] != feature[1]:  # slice instead of [0]: converted_el is '' for unknown categories
+                X_el_other_features.extend([0] * feature[0])
+                continue
+            X_el_other_features.append(1)
+            for position, values in enumerate(feature[2:], start=1):
+                current = converted_el[position] if position < len(converted_el) else None
+                X_el_other_features.extend(int(value == current) for value in values)
+        X_other_features.append(X_el_other_features)
+    return np.array(X_other_features)
+
+
+def create_feature_dictionary():
+    # Feature table for create_X_features: one row per category, [row width, category letter, value list per position...].
+    # Row width = 1 + number of listed values. Specs -- old: http://nl.ijs.si/ME/Vault/V3/msd/html/ new: http://nl.ijs.si/ME/V4/msd/html/
+    # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
+
+    return [[21,
+          'A',
+          ['g', 's'],
+          ['p', 'c', 's'],
+          ['m', 'f', 'n'],
+          ['s', 'd', 'p'],
+          ['n', 'g', 'd', 'a', 'l', 'i'],
+          ['-', 'n', 'y']],
+         [3, 'C', ['c', 's']],
+         [1, 'I'],
+         [21,
+          'M',
+          ['l'],
+          ['-', 'c', 'o', 's'],
+          ['m', 'f', 'n'],
+          ['s', 'd', 'p'],
+          ['n', 'g', 'd', 'a', 'l', 'i'],
+          ['-', 'n', 'y']],
+         [17,
+          'N',
+          ['c'],
+          ['m', 'f', 'n'],
+          ['s', 'd', 'p'],
+          ['n', 'g', 'd', 'a', 'l', 'i'],
+          ['-', 'n', 'y']],
+         [40,
+          'P',
+          ['p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'],
+          ['-', '1', '2', '3'],
+          ['-', 'm', 'f', 'n'],
+          ['-', 's', 'd', 'p'],
+          ['-', 'n', 'g', 'd', 'a', 'l', 'i'],
+          ['-', 's', 'd', 'p'],
+          ['-', 'm', 'f', 'n'],
+          ['-', 'y', 'b']],
+         [1, 'Q'],
+         [5, 'R', ['g'], ['p', 'c', 's']],
+         [7, 'S', ['-', 'g', 'd', 'a', 'l', 'i']],
+         [24,
+          'V',
+          ['m'],
+          ['-'],
+          ['n', 'u', 'p', 'r', 'f', 'c'],
+          ['-', '1', '2', '3'],
+          ['-', 's', 'p', 'd'],
+          ['-', 'm', 'f', 'n'],
+          ['-', 'n', 'y']]
+        ]
+
+
+def complete_feature_dict():
+    # Feature table used by dict_occurances_in_dataset_rate: rows are [row width, category letter, value list per position...],
+    # every value list starts with '-' and row width = 1 + number of listed values. Specs -- old: http://nl.ijs.si/ME/Vault/V3/msd/html/
+    # new: http://nl.ijs.si/ME/V4/msd/html/ changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
+    return [[27,
+             'A',
+             ['-', 'g', 's', 'p'],
+             ['-', 'p', 'c', 's'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 's', 'd', 'p'],
+             ['-', 'n', 'g', 'd', 'a', 'l', 'i'],
+             ['-', 'n', 'y']],
+            [4, 'C', ['-', 'c', 's']],
+            [1, 'I'],
+            [28,
+             'M',
+             ['-', 'd', 'r', 'l'],
+             ['-', 'c', 'o', 'p', 's'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 's', 'd', 'p'],
+             ['-', 'n', 'g', 'd', 'a', 'l', 'i'],
+             ['-', 'n', 'y']],
+            [22,
+             'N',
+             ['-', 'c', 'p'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 's', 'd', 'p'],
+             ['-', 'n', 'g', 'd', 'a', 'l', 'i'],
+             ['-', 'n', 'y']],
+            [41,
+             'P',
+             ['-', 'p', 's', 'd', 'r', 'x', 'g', 'q', 'i', 'z'],
+             ['-', '1', '2', '3'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 's', 'd', 'p'],
+             ['-', 'n', 'g', 'd', 'a', 'l', 'i'],
+             ['-', 's', 'd', 'p'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 'y', 'b']],
+            [1, 'Q'],
+            [8, 'R', ['-', 'g', 'r'], ['-', 'p', 'c', 's']],
+            [8, 'S', ['-', 'n', 'g', 'd', 'a', 'l', 'i']],
+            [31,
+             'V',
+             ['-', 'm', 'a'],
+             ['-', 'e', 'p', 'b'],
+             ['-', 'n', 'u', 'p', 'r', 'f', 'c', 'm'],
+             ['-', '1', '2', '3'],
+             ['-', 's', 'p', 'd'],
+             ['-', 'm', 'f', 'n'],
+             ['-', 'n', 'y']]
+            ]
+
+
+def check_feature_letter_usage(X_other_features, feature_dictionary):
+    """Print the grand total, then one 'label: count' line per label decode_X_features yields for an all-ones row."""
+    case_numbers = np.sum(X_other_features, axis=0)
+    letters = list(decode_X_features(feature_dictionary, [[1] * len(case_numbers)]))  # row as wide as the matrix, was 164
+    print(sum(case_numbers))
+    for i in range(len(letters)):
+        print(letters[i] + ': ' + str(case_numbers[i]))
+
+
+def dict_occurances_in_dataset_rate(content):
+    """Print how often each column of the complete feature dictionary is set across content.
+
+    content is passed straight to create_X_features together with the table
+    returned by complete_feature_dict(); the resulting rows are summed
+    column by column and the vector of sums is printed. Nothing is returned.
+    """
+    # To debug a single entry, encode [content[case]] instead of the whole
+    # list and print the encoded row (optionally via decode_X_features).
+    encoded_rows = np.array(create_X_features(content, complete_feature_dict()))
+
+    column_totals = np.sum(encoded_rows, axis=0)
+    print(column_totals)
diff --git a/tex_hyphenation.py b/tex_hyphenation.py
new file mode 100644
index 0000000..85867d8
--- /dev/null
+++ b/tex_hyphenation.py
@@ -0,0 +1,101 @@
+import sys
+sys.path.insert(0, '../../../')
+from prepare_data import *
+dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
+feature_dictionary = create_feature_dictionary()  # the v4 table builder takes no arguments; passing content raised TypeError
+
+
+def read_hyphenation_pattern():
+    """Return the lines of '../../../hyphenation' without their trailing newline (a final unterminated line stays whole)."""
+    with open('../../../hyphenation') as f:
+        return [x.rstrip('\n') for x in f]
+
+
+def find_hyphenation_patterns_in_text(text, pattern):
+    """Return the start index of every occurrence of pattern in text.
+
+    Scanning resumes one character after each hit, so overlapping
+    occurrences are all reported.
+    """
+    res, index = [], text.find(pattern)
+    while index != -1 and index < len(text):
+        res.append(index)
+        index = text.find(pattern, index + 1)
+    return res
+
+
+def create_hyphenation_dictionary(hyphenation_pattern):
+    """Turn each pattern into [letters, [[offset, digit], ...]] with the digits taken out of the letters.
+
+    offset is the number of non-digit characters that precede the digit in the pattern."""
+    parsed = []
+    for raw_pattern in hyphenation_pattern:
+        letters, marks = [], []
+        for char in raw_pattern:
+            if char.isdigit():
+                marks.append([len(letters), int(char)])
+            else:
+                letters.append(char)
+        parsed.append([''.join(letters), marks])
+    return parsed
+
+
+def split_hyphenated_word(split, word):
+    """Cut word (wrapped in one boundary character per side) after every odd-valued gap in split.
+
+    The first and last two entries of split are dropped and the rest printed before splitting."""
+    split = split[2:-2]
+    print(split)
+    res, start = [], 0
+    letters = list(word)[1:-1]
+    for loc in range(len(letters)):
+        # the last letter always closes a part; otherwise only an odd score does
+        if loc == len(split) or split[loc] % 2 == 1:
+            res.append(''.join(letters[start:loc + 1]))
+            start = loc + 1
+    return res
+
+
+def hyphenate_word(word, hyphenation_dictionary):
+    """Split word into parts: wrap it in '.', keep the highest pattern level per gap, cut via split_hyphenated_word."""
+    word = '.' + word.replace('è', 'č') + '.'
+    split = [0] * (len(word) + 1)
+    for pattern in hyphenation_dictionary:
+        for start in find_hyphenation_patterns_in_text(word, pattern[0]):
+            for mark in pattern[1]:
+                # a gap only ever moves up to a higher level
+                if split[start + mark[0]] < mark[1]:
+                    split[start + mark[0]] = mark[1]
+    return split_hyphenated_word(split, word)
+
+
+hyphenation_pattern = read_hyphenation_pattern()
+# parsed form of one pattern: [letters, [[offset, digit], ...]]
+hyphenation_dictionary = create_hyphenation_dictionary(hyphenation_pattern)
+separated_word = hyphenate_word('izziv', hyphenation_dictionary)  # single-word smoke test
+print(separated_word)
+
+all_words = []  # [word, hyphenated parts] for every entry of content
+i = 0
+for el in content:
+    separated_word = hyphenate_word(el[0], hyphenation_dictionary)
+    all_words.append([el[0], separated_word])
+    if i % 10000 == 0:  # progress report
+        print(str(i)+'/'+str(len(content)))
+    i += 1
+
+errors = []  # [word, parts] entries, appended once per part whose adjusted vowel count is not exactly 1
+errors2 = []  # words with a vowel-less part, appended once per 'r' in that part
+for word in all_words:
+    for hyphenated_part in word[1]:
+        num_vowels = 0
+        for let in list(hyphenated_part):
+            if let in vowels:
+                num_vowels += 1
+        if num_vowels == 0:
+            for let in list(hyphenated_part):
+                if let == 'r':  # each 'r' counts as a vowel when the part has none
+                    errors2.append(word[0])
+                    num_vowels += 1
+        if num_vowels != 1:
+            errors.append(word)
\ No newline at end of file