Accentuation on sloleks

This commit is contained in:
Luka 2018-04-14 10:25:40 +02:00
parent d4e6e1b222
commit 9f6e5b2752
5 changed files with 807 additions and 213 deletions

View File

@ -2,13 +2,15 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8a8ba9af-e1a4-433a-9968-475192610776" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/sloleks_accentuation2.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis_results/onedirectional_input/ensemble_errors.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis_results/onedirectional_input/ensemble_errors.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis_results/onedirectional_input/ensemble_predictions.pkl" afterPath="$PROJECT_DIR$/cnn/word_accetuation/error_analysis_results/onedirectional_input/ensemble_predictions.pkl" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/letters/v3_0/workbench.py" afterPath="$PROJECT_DIR$/cnn/accent_classification/letters/v3_0/workbench.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_0/workbench.py" afterPath="$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_0/workbench.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/syllables/v2_0/workbench.py" afterPath="$PROJECT_DIR$/cnn/accent_classification/syllables/v2_0/workbench.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/accent_classification/bidirectional_error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/accent_classification/bidirectional_error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/bidirectional_error_analysis.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/bidirectional_error_analysis.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/run_multiple_files.py" afterPath="$PROJECT_DIR$/run_multiple_files.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/cnn/word_accetuation/pattern_repetition.ipynb" afterPath="$PROJECT_DIR$/cnn/word_accetuation/pattern_repetition.ipynb" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/prepare_data.py" afterPath="$PROJECT_DIR$/prepare_data.py" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/sloleks_accetuation.ipynb" afterPath="$PROJECT_DIR$/sloleks_accetuation.ipynb" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
@ -35,19 +37,19 @@
</provider>
</entry>
</file>
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="true">
<file leaf-file-name="prepare_data.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="311">
<caret line="1055" column="31" lean-forward="true" selection-start-line="1055" selection-start-column="31" selection-end-line="1055" selection-end-column="31" />
<state relative-caret-position="-221">
<caret line="1570" column="23" lean-forward="false" selection-start-line="1570" selection-start-column="23" selection-end-line="1570" selection-end-column="23" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#7927#9215#0" expanded="false" />
<element signature="e#10871#11166#0" expanded="false" />
<element signature="e#18472#18774#0" expanded="false" />
<element signature="e#19747#20093#0" expanded="false" />
<element signature="e#24509#25156#0" expanded="false" />
<element signature="e#38245#38386#0" expanded="false" />
<element signature="e#7950#9238#0" expanded="false" />
<element signature="e#10894#11189#0" expanded="false" />
<element signature="e#11294#14633#0" expanded="false" />
<element signature="e#18495#18797#0" expanded="false" />
<element signature="e#19770#20116#0" expanded="false" />
<element signature="e#24532#25179#0" expanded="false" />
</folding>
</state>
</provider>
@ -56,7 +58,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="328">
<state relative-caret-position="358">
<caret line="85" column="39" lean-forward="false" selection-start-line="85" selection-start-column="39" selection-end-line="85" selection-end-column="39" />
<folding />
</state>
@ -66,8 +68,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="400">
<caret line="100" column="61" lean-forward="false" selection-start-line="100" selection-start-column="61" selection-end-line="100" selection-end-column="61" />
<state relative-caret-position="426">
<caret line="87" column="84" lean-forward="true" selection-start-line="55" selection-start-column="0" selection-end-line="87" selection-end-column="84" />
<folding />
</state>
</provider>
@ -76,7 +78,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<state relative-caret-position="1602">
<caret line="100" column="47" lean-forward="false" selection-start-line="100" selection-start-column="47" selection-end-line="100" selection-end-column="53" />
<folding />
</state>
@ -86,7 +88,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="346">
<state relative-caret-position="468">
<caret line="37" column="121" lean-forward="false" selection-start-line="37" selection-start-column="121" selection-end-line="37" selection-end-column="121" />
<folding />
</state>
@ -96,7 +98,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllables/v2_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<state relative-caret-position="414">
<caret line="34" column="52" lean-forward="false" selection-start-line="34" selection-start-column="52" selection-end-line="34" selection-end-column="52" />
<folding />
</state>
@ -106,8 +108,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/letters/v3_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="33" column="41" lean-forward="false" selection-start-line="33" selection-start-column="41" selection-end-line="33" selection-end-column="41" />
<state relative-caret-position="612">
<caret line="45" column="17" lean-forward="false" selection-start-line="45" selection-start-column="17" selection-end-line="45" selection-end-column="17" />
<folding />
</state>
</provider>
@ -116,7 +118,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1494">
<state relative-caret-position="1602">
<caret line="100" column="56" lean-forward="false" selection-start-line="100" selection-start-column="56" selection-end-line="100" selection-end-column="56" />
<folding />
</state>
@ -126,7 +128,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllables/v2_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1548">
<state relative-caret-position="1602">
<caret line="100" column="48" lean-forward="false" selection-start-line="100" selection-start-column="48" selection-end-line="100" selection-end-column="48" />
<folding />
</state>
@ -146,8 +148,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="378">
<caret line="32" column="45" lean-forward="false" selection-start-line="32" selection-start-column="45" selection-end-line="32" selection-end-column="45" />
<state relative-caret-position="1170">
<caret line="76" column="84" lean-forward="false" selection-start-line="38" selection-start-column="0" selection-end-line="76" selection-end-column="84" />
<folding />
</state>
</provider>
@ -156,7 +158,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="342">
<state relative-caret-position="450">
<caret line="36" column="47" lean-forward="false" selection-start-line="36" selection-start-column="47" selection-end-line="36" selection-end-column="47" />
<folding />
</state>
@ -166,8 +168,8 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<caret line="37" column="40" lean-forward="false" selection-start-line="37" selection-start-column="40" selection-end-line="37" selection-end-column="40" />
<state relative-caret-position="1368">
<caret line="87" column="84" lean-forward="false" selection-start-line="41" selection-start-column="0" selection-end-line="87" selection-end-column="84" />
<folding />
</state>
</provider>
@ -176,8 +178,8 @@
<file leaf-file-name="run_multiple_files.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/run_multiple_files.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="10" column="26" lean-forward="false" selection-start-line="10" selection-start-column="26" selection-end-line="10" selection-end-column="26" />
<state relative-caret-position="198">
<caret line="11" column="26" lean-forward="false" selection-start-line="11" selection-start-column="26" selection-end-line="11" selection-end-column="26" />
<folding />
</state>
</provider>
@ -211,7 +213,19 @@
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<caret line="16" column="28" lean-forward="true" selection-start-line="16" selection-start-column="28" selection-end-line="16" selection-end-column="28" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="sloleks_accentuation2.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/sloleks_accentuation2.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="454">
<caret line="48" column="61" lean-forward="true" selection-start-line="48" selection-start-column="61" selection-end-line="48" selection-end-column="61" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
@ -244,7 +258,7 @@
<file leaf-file-name="workbench.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/letters/v3_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="418">
<state relative-caret-position="1512">
<caret line="95" column="55" lean-forward="false" selection-start-line="95" selection-start-column="55" selection-end-line="95" selection-end-column="55" />
<folding />
</state>
@ -262,36 +276,36 @@
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>bidirectional_basic_input</find>
<find>_bidirectional_basic_input</find>
<find>shuffeling</find>
<find>generate_data</find>
<find>_generate_inputs</find>
<find>content_shuffle_vector_path</find>
<find>content_shuffle_vector_location</find>
<find>_shuffle_all_inputs</find>
<find>_generator_instance</find>
<find>_x_letter_input</find>
<find>_generate_x_and_y</find>
<find>content</find>
<find>number_of_syllables</find>
<find>_create_syllables</find>
<find>index</find>
<find>x_sy</find>
<find>_letter_generator</find>
<find>translator</find>
<find>_bidirectional_architectural_input</find>
<find>print</find>
<find>_syllable_generator</find>
<find>np.concatenate</find>
<find>prepare_data</find>
<find>assign_stress_locations</find>
<find>test_accuracy</find>
<find>test_acc</find>
<find>test_set</find>
<find>reverse_inputs</find>
<find>accent_classification</find>
<find>test_type</find>
<find>get_word_length</find>
<find>input_words</find>
<find>content</find>
<find>codecs</find>
<find>data</find>
<find>load_model</find>
<find>accentuate_word</find>
<find>get_word_le</find>
<find>load_location_models</find>
<find>get_ensemble_location_predictions</find>
<find>reverse_inputs</find>
<find>_x_letter_input</find>
<find>reverse</find>
<find>print(</find>
<find>count_vowels</find>
<find>count</find>
<find>sylla</find>
<find># word</find>
<find>accented_word</find>
<find>get_ensemble_type_predictions</find>
<find>rever</find>
<find>accentuate_wo</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -337,6 +351,7 @@
<option value="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_3/workbench.py" />
<option value="$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_3/workbench.py" />
<option value="$PROJECT_DIR$/prepare_data.py" />
<option value="$PROJECT_DIR$/sloleks_accentuation2.py" />
</list>
</option>
</component>
@ -361,7 +376,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="Scratches" />
<pane id="ProjectPane">
<subPane>
<PATH>
@ -374,63 +389,9 @@
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="cnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="cnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accent_classification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accetuation" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="cnn" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="accent_classification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="syllabled_letters" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
<pane id="Scratches" />
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
@ -439,11 +400,11 @@
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$" />
<recent name="$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_0" />
<recent name="$PROJECT_DIR$/cnn/accent_classification/syllables/v2_0" />
<recent name="$PROJECT_DIR$/cnn/accent_classification/letters/v3_0" />
<recent name="$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_2" />
<recent name="$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_2" />
</key>
</component>
<component name="RunManager" selected="Python.TEST">
@ -762,7 +723,6 @@
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/theanoTest.py" />
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.py" />
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/character_based_ffnn_keras.ipynb" />
<entry file="file://$PROJECT_DIR$/hyphenation">
@ -878,21 +838,10 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="53" lean-forward="false" selection-start-line="16" selection-start-column="53" selection-end-line="16" selection-end-column="53" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<caret line="34" column="66" lean-forward="false" selection-start-line="34" selection-start-column="20" selection-end-line="34" selection-end-column="66" />
<folding />
</state>
</provider>
</entry>
@ -900,7 +849,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1476">
<caret line="93" column="84" lean-forward="false" selection-start-line="41" selection-start-column="0" selection-end-line="93" selection-end-column="84" />
<folding />
</state>
</provider>
</entry>
@ -928,7 +876,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="828">
<caret line="57" column="0" lean-forward="false" selection-start-line="57" selection-start-column="0" selection-end-line="57" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
@ -960,7 +907,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="45" column="0" lean-forward="false" selection-start-line="45" selection-start-column="0" selection-end-line="45" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
@ -968,7 +914,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="29" column="42" lean-forward="false" selection-start-line="29" selection-start-column="17" selection-end-line="29" selection-end-column="42" />
<folding />
</state>
</provider>
</entry>
@ -983,7 +928,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="774">
<caret line="54" column="26" lean-forward="false" selection-start-line="54" selection-start-column="0" selection-end-line="54" selection-end-column="26" />
<folding />
</state>
</provider>
</entry>
@ -991,37 +935,20 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="972">
<caret line="65" column="70" lean-forward="false" selection-start-line="65" selection-start-column="70" selection-end-line="65" selection-end-column="70" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="400">
<caret line="100" column="61" lean-forward="false" selection-start-line="100" selection-start-column="61" selection-end-line="100" selection-end-column="61" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<state relative-caret-position="1602">
<caret line="100" column="47" lean-forward="false" selection-start-line="100" selection-start-column="47" selection-end-line="100" selection-end-column="53" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="328">
<caret line="85" column="39" lean-forward="false" selection-start-line="85" selection-start-column="39" selection-end-line="85" selection-end-column="39" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/letters/v3_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="418">
<state relative-caret-position="1512">
<caret line="95" column="55" lean-forward="false" selection-start-line="95" selection-start-column="55" selection-end-line="95" selection-end-column="55" />
<folding />
</state>
@ -1029,7 +956,7 @@
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="346">
<state relative-caret-position="468">
<caret line="37" column="121" lean-forward="false" selection-start-line="37" selection-start-column="121" selection-end-line="37" selection-end-column="121" />
<folding />
</state>
@ -1037,7 +964,7 @@
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllables/v2_0/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<state relative-caret-position="414">
<caret line="34" column="52" lean-forward="false" selection-start-line="34" selection-start-column="52" selection-end-line="34" selection-end-column="52" />
<folding />
</state>
@ -1051,17 +978,9 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/run_multiple_files.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="10" column="26" lean-forward="false" selection-start-line="10" selection-start-column="26" selection-end-line="10" selection-end-column="26" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllables/v2_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1548">
<state relative-caret-position="1602">
<caret line="100" column="48" lean-forward="false" selection-start-line="100" selection-start-column="48" selection-end-line="100" selection-end-column="48" />
<folding />
</state>
@ -1069,56 +988,100 @@
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/syllabled_letters/v2_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1494">
<state relative-caret-position="1602">
<caret line="100" column="56" lean-forward="false" selection-start-line="100" selection-start-column="56" selection-end-line="100" selection-end-column="56" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/letters/v3_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="33" column="41" lean-forward="false" selection-start-line="33" selection-start-column="41" selection-end-line="33" selection-end-column="41" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="378">
<caret line="32" column="45" lean-forward="false" selection-start-line="32" selection-start-column="45" selection-end-line="32" selection-end-column="45" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="342">
<state relative-caret-position="450">
<caret line="36" column="47" lean-forward="false" selection-start-line="36" selection-start-column="47" selection-end-line="36" selection-end-column="47" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1170">
<caret line="76" column="84" lean-forward="false" selection-start-line="38" selection-start-column="0" selection-end-line="76" selection-end-column="84" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllables/v3_3/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<caret line="37" column="40" lean-forward="false" selection-start-line="37" selection-start-column="40" selection-end-line="37" selection-end-column="40" />
<state relative-caret-position="1368">
<caret line="87" column="84" lean-forward="false" selection-start-line="41" selection-start-column="0" selection-end-line="87" selection-end-column="84" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/accent_classification/letters/v3_1/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="45" column="17" lean-forward="false" selection-start-line="45" selection-start-column="17" selection-end-line="45" selection-end-column="17" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/cnn_dictionary/v5_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="358">
<caret line="85" column="39" lean-forward="false" selection-start-line="85" selection-start-column="39" selection-end-line="85" selection-end-column="39" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/cnn/word_accetuation/syllabled_letters/v3_2/workbench.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="426">
<caret line="87" column="84" lean-forward="true" selection-start-line="55" selection-start-column="0" selection-end-line="87" selection-end-column="84" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/run_multiple_files.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="198">
<caret line="11" column="26" lean-forward="false" selection-start-line="11" selection-start-column="26" selection-end-line="11" selection-end-column="26" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/prepare_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="311">
<caret line="1055" column="31" lean-forward="true" selection-start-line="1055" selection-start-column="31" selection-end-line="1055" selection-end-column="31" />
<state relative-caret-position="-221">
<caret line="1570" column="23" lean-forward="false" selection-start-line="1570" selection-start-column="23" selection-end-line="1570" selection-end-column="23" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#7950#9238#0" expanded="false" />
<element signature="e#10894#11189#0" expanded="false" />
<element signature="e#11294#14633#0" expanded="false" />
<element signature="e#18495#18797#0" expanded="false" />
<element signature="e#19770#20116#0" expanded="false" />
<element signature="e#24532#25179#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/sloleks_accentuation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="16" column="28" lean-forward="true" selection-start-line="16" selection-start-column="28" selection-end-line="16" selection-end-column="28" />
<folding>
<element signature="e#24#63#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/sloleks_accentuation2.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="454">
<caret line="48" column="61" lean-forward="true" selection-start-line="48" selection-start-column="61" selection-end-line="48" selection-end-column="61" />
<folding>
<element signature="e#24#63#0" expanded="true" />
<element signature="e#7927#9215#0" expanded="false" />
<element signature="e#10871#11166#0" expanded="false" />
<element signature="e#18472#18774#0" expanded="false" />
<element signature="e#19747#20093#0" expanded="false" />
<element signature="e#24509#25156#0" expanded="false" />
<element signature="e#38245#38386#0" expanded="false" />
</folding>
</state>
</provider>

View File

@ -9,6 +9,8 @@ import keras.backend as K
import os.path
import codecs
from copy import copy
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Dropout, Input
@ -968,16 +970,49 @@ class Data:
return res
def test_accuracy(self, predictions, x, x_other_features, y, dictionary, feature_dictionary, vowels, syllable_dictionary=None,
threshold=0.4999955):
threshold=0.4999955, patterns=None):
errors = []
num_of_pred = len(predictions)
num_of_correct_pred = 0
# wrong_patterns = 0
# wrong_pattern_prediction = 0
for i in range(predictions.shape[0]):
correct_prediction = True
round_predictions = np.zeros(predictions[i].shape)
for j in range(len(y[i])):
if predictions[i][j] < threshold:
round_predictions[j] = 0.0
else:
round_predictions[j] = 1.0
if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0):
correct_prediction = False
break
# in_pattern = False
# if patterns is not None:
# test_predictions = copy(predictions[i])
# l = self.get_word_length(x[i])
# round_predictions = np.zeros(test_predictions.shape)
# for j in range(len(y[i])):
# if test_predictions[j] < threshold:
# round_predictions[j] = 0.0
# else:
# round_predictions[j] = 1.0
#
# in_pattern = False
# for pattern in patterns[l]:
# if (pattern == round_predictions).all():
# in_pattern = True
# if not in_pattern:
# wrong_patterns += 1
#
# for j in range(len(y[i])):
# if (predictions[i][j] < threshold and y[i][j] == 1.0) or (predictions[i][j] >= threshold and y[i][j] == 0.0):
# correct_prediction = False
#
# if not in_pattern and not correct_prediction:
# wrong_pattern_prediction += 1
# if (np.around(predictions[i]) == y[i]).all():
if correct_prediction:
num_of_correct_pred += 1
@ -991,12 +1026,22 @@ class Data:
errors.append([i,
decoded_x,
self.decode_x_other_features(feature_dictionary, [x_other_features[i]]),
self.assign_stress_locations(decoded_x, np.around(predictions[i]), vowels, syllables=self._input_type != 'l'),
self.assign_stress_locations(decoded_x, round_predictions, vowels, syllables=self._input_type != 'l'),
self.assign_stress_locations(decoded_x, y[i], vowels, syllables=self._input_type != 'l')
])
# print(wrong_patterns)
# print(wrong_pattern_prediction)
return (num_of_correct_pred / float(num_of_pred)) * 100, errors
# def get_word_length(self, x_el):
# i = 0
# for el in x_el:
# if el == 0:
# return i
# i += 1
# return 10
@staticmethod
def decode_syllable_x(word_encoded, syllable_dictionary):
word = []
@ -1214,8 +1259,37 @@ class Data:
@staticmethod
def load_location_models(letters_path, syllables_path, syllabled_letters_path):
############################ LOCATION ########################
letter_location_model = load_model(letters_path, custom_objects={'actual_accuracy': actual_accuracy})
nn_output_dim = 10
conv_input_shape = (23, 36)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
x_conv = Conv1D(115, (3), padding='same', activation='relu')(conv_input)
x_conv = Conv1D(46, (3), padding='same', activation='relu')(x_conv)
x_conv = MaxPooling1D(pool_size=2)(x_conv)
x_conv = Flatten()(x_conv)
othr_input = Input(shape=othr_input, name='othr_input')
x = concatenate([x_conv, othr_input])
# x = Dense(1024, input_dim=(516 + 256), activation='relu')(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(nn_output_dim, activation='sigmoid')(x)
letter_location_model = Model(inputs=[conv_input, othr_input], outputs=x)
opt = optimizers.Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
letter_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
letter_location_model.load_weights(letters_path)
##############################################################
# num_examples = len(data.x_train) # training set size
nn_output_dim = 10
@ -1244,7 +1318,10 @@ class Data:
syllable_location_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[actual_accuracy, ])
syllable_location_model.load_weights(syllables_path)
#####################################################
conv_input_shape = (10, 252)
othr_input = (140,)
conv_input = Input(shape=conv_input_shape, name='conv_input')
@ -1354,6 +1431,7 @@ class Data:
@staticmethod
def get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
# print(tagged_input_words[pos])
@ -1379,10 +1457,58 @@ class Data:
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_predictions = syllabled_letters_location_model.predict_generator(generator, len(x) / (batch_size))
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions]), axis=0)
############## CORRECT ORDER INPUT ##############
data = Data('l', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, fake_y, batch_size, accented_vowels)
letter_location_co_predictions = letter_location_co_model.predict_generator(generator, len(x) / (batch_size))
letter_location_co_predictions = data.reverse_predictions(letter_location_co_predictions, input_words, vowels)
data = Data('s', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, eye, accented_vowels)
syllable_location_co_predictions = syllable_location_co_model.predict_generator(generator, len(x) / (batch_size))
syllable_location_co_predictions = data.reverse_predictions(syllable_location_co_predictions, input_words, vowels)
data = Data('sl', shuffle_all_inputs=False, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, fake_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letters_location_co_predictions = syllabled_letters_location_co_model.predict_generator(generator, len(x) / (batch_size))
syllabled_letters_location_co_predictions = data.reverse_predictions(syllabled_letters_location_co_predictions, input_words, vowels)
return np.mean(np.array([letter_location_predictions, syllable_location_predictions, syllabled_letters_location_predictions,
letter_location_co_predictions, syllable_location_co_predictions, syllabled_letters_location_co_predictions]), axis=0)
def count_syllables(self, word, vowels):
    """Return how many positions of ``word`` are vowels.

    Every index of ``word`` is passed to ``self._is_vowel(word, index, vowels)``
    and the truthy answers are counted. The count is used by
    ``reverse_predictions`` as the number of prediction slots a word occupies.

    :param word: sequence of characters to inspect.
    :param vowels: vowel collection forwarded unchanged to ``_is_vowel``.
    :return: number of indices for which ``_is_vowel`` is truthy (0 for an
        empty word).
    """
    # ``_is_vowel`` needs the index (not just the character), so iterate
    # over positions rather than over the characters themselves.
    return sum(1 for index in range(len(word)) if self._is_vowel(word, index, vowels))
def reverse_predictions(self, predictions, words, vowels):
    """Mirror each row of ``predictions`` over the syllables of its word.

    For row ``i`` the number of syllables ``n`` is obtained from
    ``self.count_syllables(words[i][0], vowels)``. Output slot ``k`` receives
    input slot ``n - 1 - k``; every other slot of the output stays ``0``.

    :param predictions: array with one row per word (``.shape`` is read, so
        a NumPy array is expected).
    :param words: sequence whose ``i``-th item has the word text at index 0.
    :param vowels: vowel collection forwarded to ``count_syllables``.
    :return: new ``float32`` array of the same shape as ``predictions``; the
        input is not modified.
    """
    new_predictions = np.zeros(predictions.shape, dtype='float32')
    for i in range(len(predictions)):
        word_len = self.count_syllables(words[i][0], vowels)
        width = len(predictions[i])
        # Slot k maps to slot word_len - 1 - k.  Both indices must exist in
        # the row, which restricts k to [word_len - width, width).  For words
        # that fit into the row this is simply [0, word_len); for words with
        # more syllables than slots it avoids the IndexError the unbounded
        # loop would raise while keeping the same mirror mapping.
        first = max(0, word_len - width)
        last = min(word_len, width)
        if first < last:
            # The valid source range is the same interval, read backwards.
            new_predictions[i][first:last] = predictions[i][first:last][::-1]
    return new_predictions
@staticmethod
def get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model, syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
batch_size = 16
y_array = np.asarray(location_y)
@ -1409,7 +1535,57 @@ class Data:
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_predictions = syllabled_letter_type_model.predict_generator(generator, accentuation_length / batch_size)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions]), axis=0)
############## CORRECT ORDER INPUT ##############
location_y = data.reverse_predictions(location_y, input_words, vowels)
data = Data('l', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(dictionary, max_word, max_num_vowels, input_words, vowels, accented_vowels,
feature_dictionary, 'who cares')
generator = data._letter_generator(x, x_other_features, location_y, batch_size, accented_vowels)
letter_type_co_predictions = letter_type_co_model.predict_generator(generator, accentuation_length / (batch_size))
data.reorder_correct_direction_inputs(letter_type_co_predictions, location_y)
data = Data('s', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
eye = np.eye(len(syllable_dictionary), dtype=int)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, eye, accented_vowels)
syllable_type_co_predictions = syllable_type_co_model.predict_generator(generator, accentuation_length / (batch_size))
data.reorder_correct_direction_inputs(syllable_type_co_predictions, location_y)
data = Data('sl', shuffle_all_inputs=False, accent_classification=True, convert_multext=False, reverse_inputs=False)
x, x_other_features, fake_y = data._generate_x_and_y(syllable_dictionary, max_word, max_num_vowels, input_words, vowels,
accented_vowels, feature_dictionary, 'who cares')
max_syllable = data._get_max_syllable(syllable_dictionary)
syllable_letters_translator = data._create_syllable_letters_translator(max_syllable, syllable_dictionary, dictionary, vowels)
generator = data._syllable_generator(x, x_other_features, location_y, batch_size, syllable_letters_translator, accented_vowels)
syllabled_letter_type_co_predictions = syllabled_letter_type_co_model.predict_generator(generator, accentuation_length / batch_size)
data.reorder_correct_direction_inputs(syllabled_letter_type_co_predictions, location_y)
return np.mean(np.array([letter_type_predictions, syllable_type_predictions, syllabled_letter_type_predictions,
letter_type_co_predictions, syllable_type_co_predictions, syllabled_letter_type_co_predictions]), axis=0)
def reorder_correct_direction_inputs(self, predictions, y):
    """Reverse, in place, each word's group of rows inside ``predictions``.

    ``y`` is walked row by row; the number of entries greater than zero in a
    row is taken as the number of consecutive rows of ``predictions`` that
    belong to that word. Each such group is reversed so that its rows appear
    in the opposite order. Groups of size 0 or 1 are left untouched.

    NOTE(review): this presumes ``predictions`` holds exactly one row per
    positive entry of ``y``, grouped by word in the order of ``y`` - confirm
    against the generators that produce it.

    :param predictions: mutable indexable of prediction rows; modified in
        place.
    :param y: iterable of per-word rows whose positive entries mark accented
        syllables.
    :return: ``None``.
    """
    group_start = 0
    for row in y:
        group_size = sum(1 for el in row if el > 0)
        low = group_start
        high = group_start + group_size - 1
        # Two-pointer reversal; the loop body never runs for groups with
        # fewer than two rows, so no separate size check is needed.
        while high > low:
            # Copy both rows before assigning: for NumPy arrays a row is a
            # view, and writing one side first would clobber the other.
            predictions[low], predictions[high] = copy(predictions[high]), copy(predictions[low])
            low += 1
            high -= 1
        group_start += group_size
def assign_location_stress(self, word, locations, vowels):
# word = list(word)
@ -1449,12 +1625,17 @@ class Data:
return ''.join(word_list)
def accentuate_word(self, input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
letter_type_model, syllable_type_model, syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary):
predictions = self.get_ensemble_location_predictions(input_words, letter_location_model, syllable_location_model,
syllabled_letters_location_model,
letter_location_co_model, syllable_location_co_model,
syllabled_letters_location_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)
#print(predictions)
if 'A' not in vowels:
vowels.extend(['A', 'E', 'I', 'O', 'U'])
location_accented_words = [self.assign_location_stress(input_words[i][0][::-1], self.decode_y(predictions[i]), vowels)[::-1] for i in
@ -1463,6 +1644,7 @@ class Data:
location_y = np.around(predictions)
type_predictions = self.get_ensemble_type_predictions(input_words, location_y, letter_type_model, syllable_type_model,
syllabled_letter_type_model,
letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary,
syllable_dictionary)

70
sloleks_accentuation2.py Normal file
View File

@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
from keras.models import load_model
import sys
import pickle
import time
from prepare_data import *
# Script: accentuate every entry of the Sloleks table with the ensemble of
# location and type models and append the result to a tab-separated file.
np.random.seed(7)

# Dictionaries are built from the lexicon the models were trained on.
data = Data('l', shuffle_all_inputs=False)
content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex')
dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content)
feature_dictionary = data._create_slovene_feature_dictionary()
syllable_dictionary = data._create_syllables_dictionary(content, vowels)
# Deliberately replaces the list returned by _create_dict above.
accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü']

data = Data('l', shuffle_all_inputs=False)
# Two sets of location models are loaded and passed separately: the second
# set fills the "*_co_model" arguments of accentuate_word.
letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_3/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_3/20_final_epoch.h5')

letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model = data.load_location_models(
    'cnn/word_accetuation/cnn_dictionary/v5_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllables/v3_2/20_final_epoch.h5',
    'cnn/word_accetuation/syllabled_letters/v3_2/20_final_epoch.h5')

letter_type_model, syllable_type_model, syllabled_letter_type_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_1/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_1/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_1/20_final_epoch.h5')

letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model = data.load_type_models(
    'cnn/accent_classification/letters/v3_0/20_final_epoch.h5',
    'cnn/accent_classification/syllables/v2_0/20_final_epoch.h5',
    'cnn/accent_classification/syllabled_letters/v2_0/20_final_epoch.h5')

data = Data('s', shuffle_all_inputs=False)
new_content = data._read_content('data/sloleks-sl_v1.2.tbl')

print('Commencing accentuator!')

# Number of lexicon entries accentuated and written per batch.
rate = 100000
start_timer = time.time()
# NOTE(review): the file is opened with the platform default encoding while
# the written words contain non-ASCII characters - confirm the environment
# defaults to UTF-8.
with open("data/new_sloleks/new_sloleks.tab", "a") as myfile:
    for index in range(0, len(new_content), rate):
        # Slice first, then build the model input only for this batch.
        # Slicing clamps at the end of the sequence, so the last (shorter)
        # batch needs no special case.
        batch = new_content[index:index + rate]
        words = [[el[0], '', el[2], el[0]] for el in batch]

        data = Data('l', shuffle_all_inputs=False)
        location_accented_words, accented_words = data.accentuate_word(
            words, letter_location_model, syllable_location_model, syllabled_letters_location_model,
            letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model,
            letter_type_model, syllable_type_model, syllabled_letter_type_model,
            letter_type_co_model, syllable_type_co_model, syllabled_letter_type_co_model,
            dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)

        # One output line per entry: the four original columns (the last one
        # without its final character) followed by the two accentuated forms.
        res = ''.join(
            el[0] + '\t' + el[1] + '\t' + el[2] + '\t' + el[3][:-1] + '\t'
            + location_accented_words[offset] + '\t' + accented_words[offset] + '\n'
            for offset, el in enumerate(batch))

        print('Writing data from ' + str(index) + ' onward.')
        end_timer = time.time()
        print("Elapsed time: " + "{0:.2f}".format((end_timer - start_timer)/60.0) + " minutes")
        myfile.write(res)

View File

@ -87,18 +87,6 @@
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"feature__en_dictionary = data._create_feature_dictionary()\n",
"feature__slo_dictionary = data._create_slovene_feature_dictionary()"
]
},
{
"cell_type": "code",
"execution_count": 3,
@ -120,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {
"collapsed": true
},
@ -131,14 +119,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%run prepare_data.py\n",
"data = Data('l', shuffle_all_inputs=False)\n",
"data = Data('s', shuffle_all_inputs=False)\n",
"location_accented_words, accented_words = data.accentuate_word(test_input, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)"
@ -146,15 +134,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['uradní', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n",
"['uradnî', 'podatkî', 'policíje', 'kažëjo', 'ná', 'precëj', 'napetë', 'razmerë', 'v', 'piranskëm', 'zalivú', 'jë', 'danës', 'poročála', 'oddajá', 'dó', 'danës', 'së', 'jë', 'zgodílo']\n"
"['uradni', 'podatkí', 'policíje', 'kažéjo', 'ná', 'precéj', 'napeté', 'razmeré', 'v', 'piranském', 'zalivú', 'jé', 'danés', 'poročála', 'oddajá', 'dó', 'danés', 'sé', 'jé', 'zgodílo']\n",
"['uradni', 'pödatki', 'polícije', 'kažëjo', 'ná', 'prëcej', 'nápete', 'räzmere', 'v', 'pîranskem', 'zálivu', 'jë', 'dánes', 'poróčala', 'öddaja', 'dó', 'dánes', 'së', 'jë', 'zgodílo']\n"
]
}
],
@ -225,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 9,
"metadata": {
"collapsed": true
},
@ -508,6 +496,134 @@
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'xml_words_generator' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-44b0367c6cbf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mgen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxml_words_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/Sloleks_v1.2_p2.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mword_glob_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mword_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'xml_words_generator' is not defined"
]
}
],
"source": [
"#Words proccesed: 650250\n",
"#Word indeks: 50023\n",
"#Word number: 50023\n",
"\n",
"from lxml import etree\n",
"import time\n",
"\n",
"gen = xml_words_generator('data/Sloleks_v1.2_p2.xml')\n",
"word_glob_num = 0\n",
"word_limit = 0\n",
"iter_num = 50000\n",
"word_index = 0\n",
"start_timer = time.time()\n",
"iter_index = 0\n",
"words = []\n",
"\n",
"lexical_entries_load_number = 0\n",
"lexical_entries_save_number = 0\n",
"\n",
"\n",
"# INSIDE\n",
"#word_glob_num = 1500686\n",
"word_glob_num = 1550705\n",
"\n",
"#word_limit = 1500686\n",
"word_limit = 1550705\n",
"\n",
"\n",
"iter_index = 31\n",
"\n",
"#done_lexical_entries = 33522\n",
"\n",
"with open(\"data/new_sloleks/new_sloleks.xml\", \"ab\") as myfile:\n",
" myfile2 = open('data/new_sloleks/pa' + str(iter_index) + '.xml', 'ab')\n",
" for event, element in etree.iterparse('data/Sloleks_v1.2_p2.xml', tag=\"LexicalEntry\", encoding=\"UTF-8\", remove_blank_text=True):\n",
" # LOAD NEW WORDS AND ACCENTUATE THEM\n",
" #print(\"HERE\")\n",
" \n",
"# if lexical_entries_save_number < done_lexical_entries:\n",
"# next(gen)\n",
"# #print(lexical_entries_save_number)\n",
"# lexical_entries_save_number += 1\n",
"# lexical_entries_load_number += 1\n",
"# continue\n",
" \n",
" if word_glob_num >= word_limit:\n",
" myfile2.close()\n",
" myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab')\n",
" iter_index += 1\n",
" print(\"Words proccesed: \" + str(word_glob_num))\n",
"\n",
" print(\"Word indeks: \" + str(word_index))\n",
" print(\"Word number: \" + str(len(words)))\n",
" \n",
" #print(\"lexical_entries_load_number: \" + str(lexical_entries_load_number))\n",
" #print(\"lexical_entries_save_number: \" + str(lexical_entries_save_number))\n",
"\n",
" end_timer = time.time()\n",
" print(\"Elapsed time: \" + \"{0:.2f}\".format((end_timer - start_timer)/60.0) + \" minutes\")\n",
"\n",
"\n",
" word_index = 0\n",
" words = []\n",
"\n",
" while len(words) < iter_num:\n",
" try:\n",
" words.extend(next(gen))\n",
" lexical_entries_load_number += 1\n",
" except:\n",
" break\n",
" #if word_glob_num > 1:\n",
" # break\n",
"\n",
" #problem_words = words\n",
" #break\n",
" data = Data('l', shuffle_all_inputs=False)\n",
" location_accented_words, accented_words = data.accentuate_word(words, letter_location_model, syllable_location_model, syllabled_letters_location_model,\n",
" letter_type_model, syllable_type_model, syllabled_letter_type_model,\n",
" dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\n",
"\n",
" word_limit += len(words)\n",
" \n",
" \n",
" # READ DATA\n",
" for child in element:\n",
" if child.tag == 'WordForm':\n",
" msd = None\n",
" word = None\n",
" for wf in child:\n",
" if wf.tag == 'FormRepresentation':\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglasna_mesta_oblike'\n",
" new_element.attrib['val']=location_accented_words[word_index]\n",
" wf.append(new_element)\n",
"\n",
" new_element = etree.Element('feat')\n",
" new_element.attrib['att']='naglašena_oblika'\n",
" new_element.attrib['val']=accented_words[word_index]\n",
" wf.append(new_element)\n",
" word_glob_num += 1\n",
" word_index += 1\n",
"\n",
" # print(etree.tostring(element, encoding=\"UTF-8\"))\n",
" myfile2.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" myfile.write(etree.tostring(element, encoding=\"UTF-8\", pretty_print=True))\n",
" element.clear()\n",
" lexical_entries_save_number += 1\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,

263
sloleks_accetuation2.ipynb Normal file

File diff suppressed because one or more lines are too long