[MAJOR UPDATE] Changed additional features to version 4, erased unnecessary input letters (unused vowels), split validation data to test data and validation data
This commit is contained in:
		
							parent
							
								
									0cc949897f
								
							
						
					
					
						commit
						f0d263e429
					
				
							
								
								
									
										117
									
								
								.idea/workspace.xml
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										117
									
								
								.idea/workspace.xml
									
									
									
										generated
									
									
									
								
							@ -2,8 +2,8 @@
 | 
				
			|||||||
<project version="4">
 | 
					<project version="4">
 | 
				
			||||||
  <component name="ChangeListManager">
 | 
					  <component name="ChangeListManager">
 | 
				
			||||||
    <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
 | 
					    <list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
 | 
				
			||||||
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch.h5" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch.h5" />
 | 
					      <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/tex_hyphenation.py" />
 | 
				
			||||||
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch_history.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/1_epoch_history.pkl" />
 | 
					      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
 | 
				
			||||||
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
 | 
					      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn.ipynb" />
 | 
				
			||||||
      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
 | 
					      <change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
 | 
				
			||||||
    </list>
 | 
					    </list>
 | 
				
			||||||
@ -35,8 +35,8 @@
 | 
				
			|||||||
      <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
 | 
					      <file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
 | 
				
			||||||
        <entry file="file://$PROJECT_DIR$/prepare_data.py">
 | 
					        <entry file="file://$PROJECT_DIR$/prepare_data.py">
 | 
				
			||||||
          <provider selected="true" editor-type-id="text-editor">
 | 
					          <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
            <state relative-caret-position="442">
 | 
					            <state relative-caret-position="284">
 | 
				
			||||||
              <caret line="462" column="19" lean-forward="false" selection-start-line="462" selection-start-column="4" selection-end-line="462" selection-end-column="19" />
 | 
					              <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
 | 
				
			||||||
              <folding>
 | 
					              <folding>
 | 
				
			||||||
                <element signature="e#24#63#0" expanded="true" />
 | 
					                <element signature="e#24#63#0" expanded="true" />
 | 
				
			||||||
              </folding>
 | 
					              </folding>
 | 
				
			||||||
@ -44,6 +44,16 @@
 | 
				
			|||||||
          </provider>
 | 
					          </provider>
 | 
				
			||||||
        </entry>
 | 
					        </entry>
 | 
				
			||||||
      </file>
 | 
					      </file>
 | 
				
			||||||
 | 
					      <file leaf-file-name="tex_hyphenation.py" pinned="false" current-in-tab="false">
 | 
				
			||||||
 | 
					        <entry file="file://$PROJECT_DIR$/tex_hyphenation.py">
 | 
				
			||||||
 | 
					          <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
 | 
					            <state relative-caret-position="1206">
 | 
				
			||||||
 | 
					              <caret line="67" column="105" lean-forward="false" selection-start-line="67" selection-start-column="105" selection-end-line="67" selection-end-column="105" />
 | 
				
			||||||
 | 
					              <folding />
 | 
				
			||||||
 | 
					            </state>
 | 
				
			||||||
 | 
					          </provider>
 | 
				
			||||||
 | 
					        </entry>
 | 
				
			||||||
 | 
					      </file>
 | 
				
			||||||
      <file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
 | 
					      <file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
 | 
				
			||||||
        <entry file="file://$PROJECT_DIR$/workbench.py">
 | 
					        <entry file="file://$PROJECT_DIR$/workbench.py">
 | 
				
			||||||
          <provider selected="true" editor-type-id="text-editor">
 | 
					          <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
@ -139,6 +149,17 @@
 | 
				
			|||||||
      <find>StringIO</find>
 | 
					      <find>StringIO</find>
 | 
				
			||||||
      <find>shuffle_inputs</find>
 | 
					      <find>shuffle_inputs</find>
 | 
				
			||||||
      <find>generator</find>
 | 
					      <find>generator</find>
 | 
				
			||||||
 | 
					      <find>content, feature_dictionary</find>
 | 
				
			||||||
 | 
					      <find>decode</find>
 | 
				
			||||||
 | 
					      <find>create_feature_dictionary</find>
 | 
				
			||||||
 | 
					      <find>with</find>
 | 
				
			||||||
 | 
					      <find>read</find>
 | 
				
			||||||
 | 
					      <find>generate</find>
 | 
				
			||||||
 | 
					      <find>shuffle</find>
 | 
				
			||||||
 | 
					      <find>X_</find>
 | 
				
			||||||
 | 
					      <find>dictionary</find>
 | 
				
			||||||
 | 
					      <find>create_dict</find>
 | 
				
			||||||
 | 
					      <find>split_content</find>
 | 
				
			||||||
    </findStrings>
 | 
					    </findStrings>
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="Git.Settings">
 | 
					  <component name="Git.Settings">
 | 
				
			||||||
@ -157,6 +178,7 @@
 | 
				
			|||||||
        <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
 | 
					        <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
 | 
				
			||||||
        <option value="$PROJECT_DIR$/workbench.py" />
 | 
					        <option value="$PROJECT_DIR$/workbench.py" />
 | 
				
			||||||
        <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
 | 
					        <option value="$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
 | 
				
			||||||
 | 
					        <option value="$PROJECT_DIR$/tex_hyphenation.py" />
 | 
				
			||||||
        <option value="$PROJECT_DIR$/prepare_data.py" />
 | 
					        <option value="$PROJECT_DIR$/prepare_data.py" />
 | 
				
			||||||
      </list>
 | 
					      </list>
 | 
				
			||||||
    </option>
 | 
					    </option>
 | 
				
			||||||
@ -165,7 +187,7 @@
 | 
				
			|||||||
    <option name="x" value="65" />
 | 
					    <option name="x" value="65" />
 | 
				
			||||||
    <option name="y" value="24" />
 | 
					    <option name="y" value="24" />
 | 
				
			||||||
    <option name="width" value="1855" />
 | 
					    <option name="width" value="1855" />
 | 
				
			||||||
    <option name="height" value="1056" />
 | 
					    <option name="height" value="1176" />
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="ProjectView">
 | 
					  <component name="ProjectView">
 | 
				
			||||||
    <navigator currentView="ProjectPane" proportions="" version="1">
 | 
					    <navigator currentView="ProjectPane" proportions="" version="1">
 | 
				
			||||||
@ -182,7 +204,6 @@
 | 
				
			|||||||
      <foldersAlwaysOnTop value="true" />
 | 
					      <foldersAlwaysOnTop value="true" />
 | 
				
			||||||
    </navigator>
 | 
					    </navigator>
 | 
				
			||||||
    <panes>
 | 
					    <panes>
 | 
				
			||||||
      <pane id="Scratches" />
 | 
					 | 
				
			||||||
      <pane id="ProjectPane">
 | 
					      <pane id="ProjectPane">
 | 
				
			||||||
        <subPane>
 | 
					        <subPane>
 | 
				
			||||||
          <PATH>
 | 
					          <PATH>
 | 
				
			||||||
@ -195,23 +216,10 @@
 | 
				
			|||||||
              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
 | 
					              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
 | 
				
			||||||
            </PATH_ELEMENT>
 | 
					            </PATH_ELEMENT>
 | 
				
			||||||
          </PATH>
 | 
					          </PATH>
 | 
				
			||||||
          <PATH>
 | 
					 | 
				
			||||||
            <PATH_ELEMENT>
 | 
					 | 
				
			||||||
              <option name="myItemId" value="accetuation" />
 | 
					 | 
				
			||||||
              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
 | 
					 | 
				
			||||||
            </PATH_ELEMENT>
 | 
					 | 
				
			||||||
            <PATH_ELEMENT>
 | 
					 | 
				
			||||||
              <option name="myItemId" value="accetuation" />
 | 
					 | 
				
			||||||
              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
 | 
					 | 
				
			||||||
            </PATH_ELEMENT>
 | 
					 | 
				
			||||||
            <PATH_ELEMENT>
 | 
					 | 
				
			||||||
              <option name="myItemId" value="cnn" />
 | 
					 | 
				
			||||||
              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
 | 
					 | 
				
			||||||
            </PATH_ELEMENT>
 | 
					 | 
				
			||||||
          </PATH>
 | 
					 | 
				
			||||||
        </subPane>
 | 
					        </subPane>
 | 
				
			||||||
      </pane>
 | 
					      </pane>
 | 
				
			||||||
      <pane id="Scope" />
 | 
					      <pane id="Scope" />
 | 
				
			||||||
 | 
					      <pane id="Scratches" />
 | 
				
			||||||
    </panes>
 | 
					    </panes>
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="PropertiesComponent">
 | 
					  <component name="PropertiesComponent">
 | 
				
			||||||
@ -474,7 +482,7 @@
 | 
				
			|||||||
    <servers />
 | 
					    <servers />
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="ToolWindowManager">
 | 
					  <component name="ToolWindowManager">
 | 
				
			||||||
    <frame x="65" y="24" width="1855" height="1056" extended-state="6" />
 | 
					    <frame x="65" y="24" width="1855" height="1176" extended-state="6" />
 | 
				
			||||||
    <editor active="true" />
 | 
					    <editor active="true" />
 | 
				
			||||||
    <layout>
 | 
					    <layout>
 | 
				
			||||||
      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
 | 
					      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
 | 
				
			||||||
@ -495,25 +503,6 @@
 | 
				
			|||||||
      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
 | 
					      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
 | 
				
			||||||
      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.39979124" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
 | 
					      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.39979124" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
 | 
				
			||||||
    </layout>
 | 
					    </layout>
 | 
				
			||||||
    <layout-to-restore>
 | 
					 | 
				
			||||||
      <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32985386" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16375546" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
 | 
					 | 
				
			||||||
      <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
 | 
					 | 
				
			||||||
      <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" />
 | 
					 | 
				
			||||||
      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.39979124" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
 | 
					 | 
				
			||||||
    </layout-to-restore>
 | 
					 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="VcsContentAnnotationSettings">
 | 
					  <component name="VcsContentAnnotationSettings">
 | 
				
			||||||
    <option name="myLimit" value="2678400000" />
 | 
					    <option name="myLimit" value="2678400000" />
 | 
				
			||||||
@ -529,17 +518,6 @@
 | 
				
			|||||||
    <watches-manager />
 | 
					    <watches-manager />
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
  <component name="editorHistoryManager">
 | 
					  <component name="editorHistoryManager">
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/theano_tutorial/tutorial_derivates.py">
 | 
					 | 
				
			||||||
      <provider selected="true" editor-type-id="text-editor">
 | 
					 | 
				
			||||||
        <state relative-caret-position="0">
 | 
					 | 
				
			||||||
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
 | 
					 | 
				
			||||||
          <folding>
 | 
					 | 
				
			||||||
            <element signature="e#0#18#0" expanded="true" />
 | 
					 | 
				
			||||||
          </folding>
 | 
					 | 
				
			||||||
        </state>
 | 
					 | 
				
			||||||
      </provider>
 | 
					 | 
				
			||||||
    </entry>
 | 
					 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/theanoTest.py" />
 | 
					 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
 | 
					    <entry file="file://$PROJECT_DIR$/theano_tutorial/test.py">
 | 
				
			||||||
      <provider selected="true" editor-type-id="text-editor">
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
        <state relative-caret-position="1368">
 | 
					        <state relative-caret-position="1368">
 | 
				
			||||||
@ -822,16 +800,6 @@
 | 
				
			|||||||
      </provider>
 | 
					      </provider>
 | 
				
			||||||
    </entry>
 | 
					    </entry>
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn_test_on_other_attributes.ipynb" />
 | 
					    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/cnn_test_on_other_attributes.ipynb" />
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/prepare_data.py">
 | 
					 | 
				
			||||||
      <provider selected="true" editor-type-id="text-editor">
 | 
					 | 
				
			||||||
        <state relative-caret-position="442">
 | 
					 | 
				
			||||||
          <caret line="462" column="19" lean-forward="false" selection-start-line="462" selection-start-column="4" selection-end-line="462" selection-end-column="19" />
 | 
					 | 
				
			||||||
          <folding>
 | 
					 | 
				
			||||||
            <element signature="e#24#63#0" expanded="true" />
 | 
					 | 
				
			||||||
          </folding>
 | 
					 | 
				
			||||||
        </state>
 | 
					 | 
				
			||||||
      </provider>
 | 
					 | 
				
			||||||
    </entry>
 | 
					 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py">
 | 
					    <entry file="file://$PROJECT_DIR$/theano_tutorial/logistic_regression.py">
 | 
				
			||||||
      <provider selected="true" editor-type-id="text-editor">
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
        <state relative-caret-position="162">
 | 
					        <state relative-caret-position="162">
 | 
				
			||||||
@ -882,6 +850,14 @@
 | 
				
			|||||||
    </entry>
 | 
					    </entry>
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/theanoTest.py" />
 | 
					    <entry file="file://$PROJECT_DIR$/theanoTest.py" />
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
 | 
					    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
 | 
				
			||||||
 | 
					    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
 | 
				
			||||||
 | 
					    <entry file="file://$PROJECT_DIR$/hyphenation">
 | 
				
			||||||
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
 | 
					        <state relative-caret-position="0">
 | 
				
			||||||
 | 
					          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
 | 
				
			||||||
 | 
					        </state>
 | 
				
			||||||
 | 
					      </provider>
 | 
				
			||||||
 | 
					    </entry>
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/workbench.py">
 | 
					    <entry file="file://$PROJECT_DIR$/workbench.py">
 | 
				
			||||||
      <provider selected="true" editor-type-id="text-editor">
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
        <state relative-caret-position="396">
 | 
					        <state relative-caret-position="396">
 | 
				
			||||||
@ -892,6 +868,23 @@
 | 
				
			|||||||
        </state>
 | 
					        </state>
 | 
				
			||||||
      </provider>
 | 
					      </provider>
 | 
				
			||||||
    </entry>
 | 
					    </entry>
 | 
				
			||||||
    <entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
 | 
					    <entry file="file://$PROJECT_DIR$/tex_hyphenation.py">
 | 
				
			||||||
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
 | 
					        <state relative-caret-position="1206">
 | 
				
			||||||
 | 
					          <caret line="67" column="105" lean-forward="false" selection-start-line="67" selection-start-column="105" selection-end-line="67" selection-end-column="105" />
 | 
				
			||||||
 | 
					          <folding />
 | 
				
			||||||
 | 
					        </state>
 | 
				
			||||||
 | 
					      </provider>
 | 
				
			||||||
 | 
					    </entry>
 | 
				
			||||||
 | 
					    <entry file="file://$PROJECT_DIR$/prepare_data.py">
 | 
				
			||||||
 | 
					      <provider selected="true" editor-type-id="text-editor">
 | 
				
			||||||
 | 
					        <state relative-caret-position="284">
 | 
				
			||||||
 | 
					          <caret line="592" column="36" lean-forward="true" selection-start-line="592" selection-start-column="36" selection-end-line="592" selection-end-column="36" />
 | 
				
			||||||
 | 
					          <folding>
 | 
				
			||||||
 | 
					            <element signature="e#24#63#0" expanded="true" />
 | 
				
			||||||
 | 
					          </folding>
 | 
				
			||||||
 | 
					        </state>
 | 
				
			||||||
 | 
					      </provider>
 | 
				
			||||||
 | 
					    </entry>
 | 
				
			||||||
  </component>
 | 
					  </component>
 | 
				
			||||||
</project>
 | 
					</project>
 | 
				
			||||||
							
								
								
									
										1113
									
								
								hyphenation
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1113
									
								
								hyphenation
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										312
									
								
								prepare_data.py
									
									
									
									
									
								
							
							
						
						
									
										312
									
								
								prepare_data.py
									
									
									
									
									
								
							@ -7,6 +7,7 @@ import h5py
 | 
				
			|||||||
import gc
 | 
					import gc
 | 
				
			||||||
import math
 | 
					import math
 | 
				
			||||||
import keras.backend as K
 | 
					import keras.backend as K
 | 
				
			||||||
 | 
					import os.path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# functions for saving, loading and shuffling whole arrays to ram
 | 
					# functions for saving, loading and shuffling whole arrays to ram
 | 
				
			||||||
@ -34,9 +35,15 @@ def load_inputs(file_name, other_features=False):
 | 
				
			|||||||
    return X, y
 | 
					    return X, y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def shuffle_inputs(X, y, X_pure=[]):
 | 
					def shuffle_inputs(X, y, shuffle_vector_location, X_pure=[]):
 | 
				
			||||||
    s = np.arange(X.shape[0])
 | 
					    if os.path.exists(shuffle_vector_location):
 | 
				
			||||||
    np.random.shuffle(s)
 | 
					        s = load_shuffle_vector(shuffle_vector_location)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        s = np.arange(X.shape[0])
 | 
				
			||||||
 | 
					        np.random.shuffle(s)
 | 
				
			||||||
 | 
					        create_and_save_shuffle_vector(shuffle_vector_location, s)
 | 
				
			||||||
 | 
					    # s = np.arange(X.shape[0])
 | 
				
			||||||
 | 
					    # np.random.shuffle(s)
 | 
				
			||||||
    X = X[s]
 | 
					    X = X[s]
 | 
				
			||||||
    y = y[s]
 | 
					    y = y[s]
 | 
				
			||||||
    if X_pure != []:
 | 
					    if X_pure != []:
 | 
				
			||||||
@ -57,7 +64,7 @@ def create_and_save_inputs(file_name, part, X, y, X_pure):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def load_extended_inputs(file_name, obtain_range):
 | 
					def load_extended_inputs(file_name, obtain_range):
 | 
				
			||||||
    h5f = h5py.File(file_name,'r')
 | 
					    h5f = h5py.File(file_name, 'r')
 | 
				
			||||||
    X = h5f['X'][obtain_range[0]:obtain_range[1]]
 | 
					    X = h5f['X'][obtain_range[0]:obtain_range[1]]
 | 
				
			||||||
    y = h5f['y'][obtain_range[0]:obtain_range[1]]
 | 
					    y = h5f['y'][obtain_range[0]:obtain_range[1]]
 | 
				
			||||||
    X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]]
 | 
					    X_pure = h5f['X_pure'][obtain_range[0]:obtain_range[1]]
 | 
				
			||||||
@ -69,16 +76,17 @@ def load_extended_inputs(file_name, obtain_range):
 | 
				
			|||||||
# functions for creating and loading shuffle vector
 | 
					# functions for creating and loading shuffle vector
 | 
				
			||||||
def create_and_save_shuffle_vector(file_name, shuffle_vector):
 | 
					def create_and_save_shuffle_vector(file_name, shuffle_vector):
 | 
				
			||||||
    # X, y, X_pure = generate_full_vowel_matrix_inputs()
 | 
					    # X, y, X_pure = generate_full_vowel_matrix_inputs()
 | 
				
			||||||
    h5f = h5py.File(file_name + '_shuffle_vector.h5', 'w')
 | 
					    h5f = h5py.File(file_name, 'w')
 | 
				
			||||||
    adict=dict(shuffle_vector=shuffle_vector)
 | 
					    adict = dict(shuffle_vector=shuffle_vector)
 | 
				
			||||||
    for k, v in adict.items():
 | 
					    for k, v in adict.items():
 | 
				
			||||||
        h5f.create_dataset(k,data=v)
 | 
					        h5f.create_dataset(k, data=v)
 | 
				
			||||||
    h5f.close()
 | 
					    h5f.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def load_shuffle_vector(file_name):
 | 
					def load_shuffle_vector(file_name):
 | 
				
			||||||
    h5f = h5py.File(file_name,'r')
 | 
					    h5f = h5py.File(file_name, 'r')
 | 
				
			||||||
    shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
 | 
					    # shuffle_vector = h5f['shuffle_vector'][[179859, 385513, 893430]]
 | 
				
			||||||
 | 
					    shuffle_vector = h5f['shuffle_vector'][:]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    h5f.close()
 | 
					    h5f.close()
 | 
				
			||||||
    return shuffle_vector
 | 
					    return shuffle_vector
 | 
				
			||||||
@ -138,7 +146,8 @@ def create_dict():
 | 
				
			|||||||
    vowels.extend(accetuated_vowels)
 | 
					    vowels.extend(accetuated_vowels)
 | 
				
			||||||
    vowels.extend(default_vowels)
 | 
					    vowels.extend(default_vowels)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    dictionary = ['']
 | 
					    dictionary_output = ['']
 | 
				
			||||||
 | 
					    dictionary_input = ['']
 | 
				
			||||||
    line = 0
 | 
					    line = 0
 | 
				
			||||||
    max_word = 0
 | 
					    max_word = 0
 | 
				
			||||||
    # ADD 'EMPTY' VOWEL
 | 
					    # ADD 'EMPTY' VOWEL
 | 
				
			||||||
@ -154,12 +163,12 @@ def create_dict():
 | 
				
			|||||||
            for c in list(el[3]):
 | 
					            for c in list(el[3]):
 | 
				
			||||||
                if is_vowel(list(el[3]), i, vowels):
 | 
					                if is_vowel(list(el[3]), i, vowels):
 | 
				
			||||||
                    num_vowels += 1
 | 
					                    num_vowels += 1
 | 
				
			||||||
                if c not in dictionary:
 | 
					                if c not in dictionary_output:
 | 
				
			||||||
                    dictionary.append(c)
 | 
					                    dictionary_output.append(c)
 | 
				
			||||||
                i += 1
 | 
					                i += 1
 | 
				
			||||||
            for c in list(el[0]):
 | 
					            for c in list(el[0]):
 | 
				
			||||||
                if c not in dictionary:
 | 
					                if c not in dictionary_input:
 | 
				
			||||||
                    dictionary.append(c)
 | 
					                    dictionary_input.append(c)
 | 
				
			||||||
            if num_vowels > max_num_vowels:
 | 
					            if num_vowels > max_num_vowels:
 | 
				
			||||||
                max_num_vowels = num_vowels
 | 
					                max_num_vowels = num_vowels
 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
@ -167,10 +176,10 @@ def create_dict():
 | 
				
			|||||||
            print(el)
 | 
					            print(el)
 | 
				
			||||||
            break
 | 
					            break
 | 
				
			||||||
        line += 1
 | 
					        line += 1
 | 
				
			||||||
    dictionary = sorted(dictionary)
 | 
					    dictionary_input = sorted(dictionary_input)
 | 
				
			||||||
    max_num_vowels += 1
 | 
					    max_num_vowels += 1
 | 
				
			||||||
    print('DICTIONARY CREATION SUCCESSFUL!')
 | 
					    print('DICTIONARY CREATION SUCCESSFUL!')
 | 
				
			||||||
    return dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels
 | 
					    return dictionary_input, max_word, max_num_vowels, content, vowels, accetuated_vowels
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# GENERATE X and y
 | 
					# GENERATE X and y
 | 
				
			||||||
@ -272,7 +281,22 @@ def generate_presentable_y(accetuations_list, word_list, max_num_vowels):
 | 
				
			|||||||
#     return X, y
 | 
					#     return X, y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def generate_full_matrix_inputs():
 | 
					def generate_full_matrix_inputs(content_shuffle_vector_location, shuffle_vector_location):
					    """Build train, test and validation inputs using the current feature dictionary.

					    content_shuffle_vector_location is forwarded to split_content.
					    shuffle_vector_location is a path prefix; '_train.h5', '_test.h5' and
					    '_validate.h5' are appended to it before it is forwarded to generate_X_and_y.

					    Returns a 9-tuple: (X, X_other_features, y) for the train part, then for the
					    test part, then for the validation part.
					    """
					    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
					    train_content, test_content, validate_content = split_content(content, 0.2, content_shuffle_vector_location)
					    feature_dictionary = create_feature_dictionary()

					    # Generate X and y, one (X, X_other_features, y) triple per data part
					    print('GENERATING X AND y...')
					    parts = ((train_content, '_train.h5'),
					             (test_content, '_test.h5'),
					             (validate_content, '_validate.h5'))
					    generated = []
					    for part_content, suffix in parts:
					        X, X_other_features, y = generate_X_and_y(dictionary, max_word, max_num_vowels, part_content,
					                                                  vowels, accetuated_vowels, feature_dictionary,
					                                                  shuffle_vector_location + suffix)
					        generated += [X, X_other_features, y]
					    print('GENERATION SUCCESSFUL!')
					    return tuple(generated)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# generate full matrix, with old features
 | 
				
			||||||
 | 
					def old_generate_full_matrix_inputs():
 | 
				
			||||||
    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
 | 
					    dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
 | 
				
			||||||
    train_content, validate_content = split_content(content, 0.2)
 | 
					    train_content, validate_content = split_content(content, 0.2)
 | 
				
			||||||
    feature_dictionary = create_feature_dictionary(content)
 | 
					    feature_dictionary = create_feature_dictionary(content)
 | 
				
			||||||
@ -286,7 +310,7 @@ def generate_full_matrix_inputs():
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
 | 
					# Generate each y as an array of 11 numbers (with possible values between 0 and 1)
 | 
				
			||||||
def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary):
 | 
					def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels, feature_dictionary, shuffle_vector_location):
 | 
				
			||||||
    y = np.zeros((len(content), max_num_vowels))
 | 
					    y = np.zeros((len(content), max_num_vowels))
 | 
				
			||||||
    X = np.zeros((len(content), max_word, len(dictionary)))
 | 
					    X = np.zeros((len(content), max_word, len(dictionary)))
 | 
				
			||||||
    print('CREATING OTHER FEATURES...')
 | 
					    print('CREATING OTHER FEATURES...')
 | 
				
			||||||
@ -328,7 +352,7 @@ def generate_X_and_y(dictionary, max_word, max_num_vowels, content, vowels, acce
 | 
				
			|||||||
        i += 1
 | 
					        i += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print('SHUFFELING INPUTS...')
 | 
					    print('SHUFFELING INPUTS...')
 | 
				
			||||||
    X, y, X_other_features = shuffle_inputs(X, y, X_pure=X_other_features)
 | 
					    X, y, X_other_features = shuffle_inputs(X, y, shuffle_vector_location, X_pure=X_other_features)
 | 
				
			||||||
    print('INPUTS SHUFFELED!')
 | 
					    print('INPUTS SHUFFELED!')
 | 
				
			||||||
    return X, X_other_features, y
 | 
					    return X, X_other_features, y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -559,6 +583,7 @@ def shuffle_full_vowel_inputs(name, orderd_name, parts):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# Decoders for inputs and outputs
 | 
					# Decoders for inputs and outputs
 | 
				
			||||||
def decode_X_features(feature_dictionary, X_other_features):
 | 
					def decode_X_features(feature_dictionary, X_other_features):
 | 
				
			||||||
 | 
					    final_word = []
 | 
				
			||||||
    for word in X_other_features:
 | 
					    for word in X_other_features:
 | 
				
			||||||
        final_word = []
 | 
					        final_word = []
 | 
				
			||||||
        i = 0
 | 
					        i = 0
 | 
				
			||||||
@ -574,6 +599,7 @@ def decode_X_features(feature_dictionary, X_other_features):
 | 
				
			|||||||
                            final_word.append(feature_dictionary[z][j][k])
 | 
					                            final_word.append(feature_dictionary[z][j][k])
 | 
				
			||||||
                        i += 1
 | 
					                        i += 1
 | 
				
			||||||
        print(u''.join(final_word))
 | 
					        print(u''.join(final_word))
 | 
				
			||||||
 | 
					    return u''.join(final_word)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def decode_position(y, max_num_vowels):
 | 
					def decode_position(y, max_num_vowels):
 | 
				
			||||||
@ -650,7 +676,37 @@ def decode_position_from_vowel_to_final_number(y):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# split content so that there is no overfitting
 | 
					# split content so that there is no overfitting
 | 
				
			||||||
def split_content(content, ratio):
 | 
					def split_content(content, test_and_validation_ratio, content_shuffle_vector_location, validation_ratio=0.5):
					    """Split content into train, test and validation parts by unique key.

					    Every element is keyed by el[1], or by el[0] when el[1] is '='. All elements
					    sharing a key land in the same part. The assignment of keys to parts is driven
					    by a shuffle vector that is loaded from content_shuffle_vector_location when
					    that path exists, and otherwise created, shuffled and saved there.

					    test_and_validation_ratio is the fraction of unique keys held out of training;
					    validation_ratio is the fraction of those held-out keys used for validation.

					    Returns (train_content, test_content, validate_content).
					    """
					    expanded_content = [el[0] if el[1] == '=' else el[1] for el in content]
					    unique_content = sorted(set(expanded_content))

					    if not os.path.exists(content_shuffle_vector_location):
					        s = np.arange(len(unique_content))
					        np.random.shuffle(s)
					        create_and_save_shuffle_vector(content_shuffle_vector_location, s)
					    else:
					        s = load_shuffle_vector(content_shuffle_vector_location)

					    split_num = math.floor(len(unique_content) * test_and_validation_ratio)
					    validation_num = math.floor(split_num * validation_ratio)

					    # Bucket each unique key by its shuffle value:
					    #   value >= split_num                   -> train
					    #   validation_num <= value < split_num  -> test
					    #   value < validation_num               -> validate
					    train_keys, test_keys, validate_keys = set(), set(), set()
					    for position in range(len(s)):
					        shuffle_value = s[position]
					        if shuffle_value >= split_num:
					            train_keys.add(unique_content[position])
					        elif shuffle_value >= validation_num:
					            test_keys.add(unique_content[position])
					        else:
					            validate_keys.add(unique_content[position])

					    keyed_content = list(zip(content, expanded_content))
					    train_content = [el for el, key in keyed_content if key in train_keys]
					    test_content = [el for el, key in keyed_content if key in test_keys]
					    validate_content = [el for el, key in keyed_content if key in validate_keys]
					    return train_content, test_content, validate_content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# split content so that there is no overfitting, without splitting the held-out part into validation and test data
 | 
				
			||||||
 | 
					def old_split_content(content, ratio):
 | 
				
			||||||
    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
 | 
					    expanded_content = [el[1] if el[1] != '=' else el[0] for el in content]
 | 
				
			||||||
    # print(len(content))
 | 
					    # print(len(content))
 | 
				
			||||||
    unique_content = sorted(set(expanded_content))
 | 
					    unique_content = sorted(set(expanded_content))
 | 
				
			||||||
@ -671,8 +727,8 @@ def split_content(content, ratio):
 | 
				
			|||||||
    return train_content, validate_content
 | 
					    return train_content, validate_content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#  create feature dictionary
 | 
					# feature dictionary that uses MULTEXT-East v3 as its encoding
 | 
				
			||||||
def create_feature_dictionary(content):
 | 
					def create_old_feature_dictionary(content):
 | 
				
			||||||
    additional_data = [el[2] for el in content]
 | 
					    additional_data = [el[2] for el in content]
 | 
				
			||||||
    possible_variants = sorted(set(additional_data))
 | 
					    possible_variants = sorted(set(additional_data))
 | 
				
			||||||
    categories = sorted(set([el[0] for el in possible_variants]))
 | 
					    categories = sorted(set([el[0] for el in possible_variants]))
 | 
				
			||||||
@ -690,7 +746,8 @@ def create_feature_dictionary(content):
 | 
				
			|||||||
    return feature_dictionary
 | 
					    return feature_dictionary
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_X_features(content, feature_dictionary):
 | 
					# X features that use MULTEXT-East v3 as their encoding
 | 
				
			||||||
 | 
					def create_old_X_features(content, feature_dictionary):
 | 
				
			||||||
    content = content
 | 
					    content = content
 | 
				
			||||||
    X_other_features = []
 | 
					    X_other_features = []
 | 
				
			||||||
    for el in content:
 | 
					    for el in content:
 | 
				
			||||||
@ -708,3 +765,212 @@ def create_X_features(content, feature_dictionary):
 | 
				
			|||||||
                X_el_other_features.extend([0] * feature[0])
 | 
					                X_el_other_features.extend([0] * feature[0])
 | 
				
			||||||
        X_other_features.append(X_el_other_features)
 | 
					        X_other_features.append(X_el_other_features)
 | 
				
			||||||
    return np.array(X_other_features)
 | 
					    return np.array(X_other_features)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_to_MULTEXT_east_v4(old_features, feature_dictionary):
					    """Convert one MULTEXT-East v3 tag into the v4 layout used by feature_dictionary.

					    :param old_features: sequence of single-character v3 tag symbols; the first
					        symbol is the category letter.
					    :param feature_dictionary: list whose entries 0-9 describe the categories
					        A, C, I, M, N, P, Q, R, S, V (in that order). Only the length of each entry
					        is used here, to cut the result down to category letter + attributes.
					    :return: list of v4 symbols, or '' when the tag is empty or its category is
					        not one of the ten handled here.

					    Tags shorter than the positions a rule reads are treated as padded with '-'
					    instead of raising IndexError.
					    """
					    if not old_features:
					        return ''

					    new_features = ['-'] * 9
					    new_features[:len(old_features)] = old_features
					    # '-'-padded snapshot of the v3 tag, so every position read below exists
					    old = list(new_features)
					    category = old[0]

					    if category == 'A':
					        if old[1] == 'f' or old[1] == 'o':
					            new_features[1] = 'g'
					        return new_features[:len(feature_dictionary[0]) - 1]
					    if category == 'C':
					        return new_features[:len(feature_dictionary[1]) - 1]
					    if category == 'I':
					        return new_features[:len(feature_dictionary[2]) - 1]
					    if category == 'M':
					        # v3 positions 1-4 move to 2-5 and v3 position 5 becomes position 1
					        new_features[2:6] = old[1:5]
					        new_features[1] = old[5]
					        if new_features[2] == 'm':
					            new_features[2] = '-'
					        return new_features[:len(feature_dictionary[3]) - 1]
					    if category == 'N':
					        if len(old_features) > 5:
					            # v4 position 5 comes from v3 position 7 ('-' when the tag is too short)
					            new_features[5] = old[7]
					        return new_features[:len(feature_dictionary[4]) - 1]
					    if category == 'P':
					        if new_features[8] == 'n':
					            new_features[8] = 'b'
					        return new_features[:len(feature_dictionary[5]) - 1]
					    if category == 'Q':
					        return new_features[:len(feature_dictionary[6]) - 1]
					    if category == 'R':
					        return new_features[:len(feature_dictionary[7]) - 1]
					    if category == 'S':
					        if len(old_features) == 4:
					            new_features[1] = old[3]
					        else:
					            new_features[1] = '-'
					        return new_features[:len(feature_dictionary[8]) - 1]
					    if category == 'V':
					        if old[1] == 'o' or old[1] == 'c':
					            new_features[1] = 'm'
					        new_features[3] = old[2]
					        new_features[2] = '-'
					        if old[2] == 'i':
					            new_features[3] = 'r'
					        if old[3] == 'p':
					            new_features[3] = 'r'
					        elif old[3] == 'f':
					            new_features[3] = 'f'
					        # v3 position 8 moves to position 7 ('-' when the tag has no position 8)
					        new_features[7] = old[8]
					        return new_features[:len(feature_dictionary[9]) - 1]
					    return ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def create_X_features(content, feature_dictionary):
					    """Encode the tag of every content element as a 0/1 feature row.

					    For each element, el[2] is converted with convert_to_MULTEXT_east_v4. Every
					    entry of feature_dictionary has the form [width, category_letter, options...].
					    The matching category contributes a 1 followed by one slot per option (1 where
					    the option equals the tag symbol at that position); every other category
					    contributes `width` zeros.

					    A tag whose conversion is empty (unknown category) matches no category and
					    yields an all-zero row instead of raising IndexError.

					    :return: numpy array with one row per element of content.
					    """
					    X_other_features = []
					    for el in content:
					        converted_el = ''.join(convert_to_MULTEXT_east_v4(list(el[2]), feature_dictionary))
					        # '' when the conversion produced nothing, so it can never equal a category letter
					        category = converted_el[:1]
					        X_el_other_features = []
					        for feature in feature_dictionary:
					            if category == feature[1]:
					                X_el_other_features.append(1)
					                for i in range(2, len(feature)):
					                    # option list i describes tag position i - 1; None marks a missing position
					                    symbol = converted_el[i - 1] if i - 1 < len(converted_el) else None
					                    X_el_other_features.extend(1 if option == symbol else 0 for option in feature[i])
					            else:
					                X_el_other_features.extend([0] * feature[0])
					        X_other_features.append(X_el_other_features)
					    return np.array(X_other_features)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def create_feature_dictionary():
					    """Return the reduced feature dictionary, one entry per category letter.

					    Every entry has the form [width, category_letter, options_1, options_2, ...],
					    where each options_k is the list of symbols encoded for tag position k and
					    width equals 1 + the total number of listed options.
					    """
					    # old: http://nl.ijs.si/ME/Vault/V3/msd/html/
					    # new: http://nl.ijs.si/ME/V4/msd/html/
					    # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
					    return [
					        [21, 'A', list('gs'), list('pcs'), list('mfn'), list('sdp'), list('ngdali'), list('-ny')],
					        [3, 'C', list('cs')],
					        [1, 'I'],
					        [21, 'M', list('l'), list('-cos'), list('mfn'), list('sdp'), list('ngdali'), list('-ny')],
					        [17, 'N', list('c'), list('mfn'), list('sdp'), list('ngdali'), list('-ny')],
					        [40, 'P', list('psdrxgqiz'), list('-123'), list('-mfn'), list('-sdp'), list('-ngdali'),
					         list('-sdp'), list('-mfn'), list('-yb')],
					        [1, 'Q'],
					        [5, 'R', list('g'), list('pcs')],
					        [7, 'S', list('-gdali')],
					        [24, 'V', list('m'), list('-'), list('nuprfc'), list('-123'), list('-spd'), list('-mfn'),
					         list('-ny')],
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def complete_feature_dict():
					    """Return the full feature dictionary, one entry per category letter.

					    Same layout as the reduced dictionary, [width, category_letter, options...],
					    but every option list here starts with '-' and width equals 1 + the total
					    number of listed options.
					    """
					    # old: http://nl.ijs.si/ME/Vault/V3/msd/html/
					    # new: http://nl.ijs.si/ME/V4/msd/html/
					    # changes: http://nl.ijs.si/jos/msd/html-en/msd.diffs.html
					    return [
					        [27, 'A', list('-gsp'), list('-pcs'), list('-mfn'), list('-sdp'), list('-ngdali'), list('-ny')],
					        [4, 'C', list('-cs')],
					        [1, 'I'],
					        [28, 'M', list('-drl'), list('-cops'), list('-mfn'), list('-sdp'), list('-ngdali'),
					         list('-ny')],
					        [22, 'N', list('-cp'), list('-mfn'), list('-sdp'), list('-ngdali'), list('-ny')],
					        [41, 'P', list('-psdrxgqiz'), list('-123'), list('-mfn'), list('-sdp'), list('-ngdali'),
					         list('-sdp'), list('-mfn'), list('-yb')],
					        [1, 'Q'],
					        [8, 'R', list('-gr'), list('-pcs')],
					        [8, 'S', list('-ngdali')],
					        [31, 'V', list('-ma'), list('-epb'), list('-nuprfcm'), list('-123'), list('-spd'),
					         list('-mfn'), list('-ny')],
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def check_feature_letter_usage(X_other_features, feature_dictionary):
					    """Print how often every encoded feature slot is set in X_other_features.

					    Prints the grand total of all column sums, then one 'letter: count' line per
					    feature slot.

					    :param X_other_features: 2-D 0/1 matrix whose columns follow feature_dictionary.
					    :param feature_dictionary: list of [width, category_letter, options...] entries.
					    """
					    case_numbers = np.sum(X_other_features, axis=0)
					    # All-ones row sized from the dictionary itself (feature[0] is the number of
					    # slots each category occupies) rather than a hard-coded width, so it fits
					    # whichever dictionary is passed in.
					    arrays = [1] * sum(feature[0] for feature in feature_dictionary)
					    # NOTE(review): assumes decode_X_features yields one character per set slot - confirm
					    letters = list(decode_X_features(feature_dictionary, [arrays]))
					    print(sum(case_numbers))
					    for i in range(len(letters)):
					        print(letters[i] + ': ' + str(case_numbers[i]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def dict_occurances_in_dataset_rate(content):
					    """Print, per feature slot of the complete dictionary, how many elements set it.

					    Encodes content with create_X_features against complete_feature_dict() and
					    prints the column sums of the resulting matrix.
					    """
					    X_other_features = np.array(create_X_features(content, complete_feature_dict()))
					    print(np.sum(X_other_features, axis=0))
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										101
									
								
								tex_hyphenation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								tex_hyphenation.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,101 @@
 | 
				
			|||||||
 | 
					import sys
 | 
				
			||||||
 | 
					sys.path.insert(0, '../../../')
 | 
				
			||||||
 | 
					from prepare_data import *
 | 
				
			||||||
 | 
					# Load the word data once at import time; `content` and `vowels` are used by the
					# script at the bottom of this file.
					dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
					# create_feature_dictionary takes no arguments; passing `content` raises TypeError.
					# NOTE(review): assumes the imported prepare_data is the version defining it that way - confirm
					feature_dictionary = create_feature_dictionary()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def read_hyphenation_pattern():
					    """Read the hyphenation pattern file and return its lines without newlines.

					    Only the line terminator is stripped, so a final line that lacks a trailing
					    newline keeps its last character.
					    """
					    with open('../../../hyphenation') as f:
					        return [line.rstrip('\n') for line in f]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def find_hyphenation_patterns_in_text(text, pattern):
					    """Return the start index of every occurrence of pattern in text.

					    Occurrences may overlap: the search resumes one character after each hit.
					    """
					    positions = []
					    start = 0
					    while start < len(text):
					        hit = text.find(pattern, start)
					        if hit == -1:
					            break
					        positions.append(hit)
					        # advance by a single character so overlapping matches are found too
					        start = hit + 1
					    return positions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def create_hyphenation_dictionary(hyphenation_pattern):
					    """Turn raw patterns into [letters, [[offset, level], ...]] entries.

					    The digits of each pattern are removed from its text. Each digit is recorded
					    as [offset, level], where offset is the number of non-digit characters that
					    precede it in the pattern and level is the digit's integer value.
					    Example: '2z1z2' becomes ['zz', [[0, 2], [1, 1], [2, 2]]].
					    """
					    dictionary = []
					    for el in hyphenation_pattern:
					        letters = []
					        anomalies_indices = []
					        for let in el:
					            if let.isdigit():
					                # len(letters) == how many non-digit characters came before this digit
					                anomalies_indices.append([len(letters), int(let)])
					            else:
					                letters.append(let)
					        dictionary.append([''.join(letters), anomalies_indices])
					    return dictionary
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def split_hyphenated_word(split, word):
					    """Cut word into parts at the odd-valued entries of split.

					    :param split: list of levels; its first two and last two entries are dropped.
					    :param word: the word wrapped in one marker character on each side; the
					        markers are dropped.
					    :return: list of word parts. A part ends after letter k when the k-th
					        remaining level is odd, and the trailing part ends at the last letter.

					    The trimmed level list is printed as a side effect.
					    """
					    inner_levels = split[2:-2]
					    print(inner_levels)
					    letters = list(word)[1:-1]
					    parts = []
					    current = ''
					    for loc, let in enumerate(letters):
					        current += let
					        if loc == len(inner_levels) or inner_levels[loc] % 2 == 1:
					            parts.append(current)
					            current = ''
					    return parts
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def hyphenate_word(word, hyphenation_dictionary):
					    """Hyphenate word using [letters, [[offset, level], ...]] pattern entries.

					    Every 'è' in word is replaced with 'č' and the word is wrapped in '.' markers.
					    For each occurrence of each pattern, the levels of the pattern are written
					    into a per-gap level list, keeping the highest level seen for each gap. The
					    level list and the wrapped word are then handed to split_hyphenated_word.

					    :return: whatever split_hyphenated_word returns for the collected levels.
					    """
					    dotted = '.' + word.replace('è', 'č') + '.'
					    levels = [0] * (len(dotted) + 1)
					    for entry in hyphenation_dictionary:
					        for start in find_hyphenation_patterns_in_text(dotted, entry[0]):
					            for anomaly in entry[1]:
					                slot = start + anomaly[0]
					                # keep the highest level proposed for this gap
					                levels[slot] = max(levels[slot], anomaly[1])
					    return split_hyphenated_word(levels, dotted)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					hyphenation_pattern = read_hyphenation_pattern()
					# every dictionary entry has the form [letters, [[offset, level], ...]],
					# e.g. ['zz', [[0, 2], [1, 1], [2, 2]]]
					hyphenation_dictionary = create_hyphenation_dictionary(hyphenation_pattern)
					separated_word = hyphenate_word('izziv', hyphenation_dictionary)
					print(separated_word)

					# hyphenate the first field of every content element, reporting progress every 10000 elements
					all_words = []
					for i, el in enumerate(content):
					    separated_word = hyphenate_word(el[0], hyphenation_dictionary)
					    all_words.append([el[0], separated_word])
					    if i % 10000 == 0:
					        print('%d/%d' % (i, len(content)))
					# leave the counter at the number of processed elements
					i = len(content)

					# errors:  [word, parts] pairs having a part whose vowel count is not exactly one
					# errors2: words having a vowel-less part in which each 'r' was counted as a vowel
					errors = []
					errors2 = []
					for word in all_words:
					    for hyphenated_part in word[1]:
					        num_vowels = 0
					        for let in hyphenated_part:
					            if let in vowels:
					                num_vowels += 1
					        if not num_vowels:
					            for let in hyphenated_part:
					                if let == 'r':
					                    errors2.append(word[0])
					                    num_vowels += 1
					        if num_vowels != 1:
					            errors.append(word)
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user