From 4ca872dc63310596255c0442de07e4b066acaf03 Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 17 Aug 2018 10:20:55 +0200 Subject: [PATCH] Formatted computer + added correct forms of accentuation --- .gitignore | 1 + LICENSE | 0 README.md | 0 __init__.py | 0 accentuate.py | 0 accentuate_connected_text.py | 0 hyphenation | 0 learn_location_weights.py | 0 notes | 0 prepare_data.py | 0 preprocessed_data/environment.pkl | Bin requirements.txt | 0 run_multiple_files.py | 0 sloleks_accentuation.py | 0 sloleks_accentuation2.py | 2 +- sloleks_accentuation2_tab2xml.py | 3 ++- sloleks_accetuation.ipynb | 0 sloleks_accetuation2.ipynb | 4 ++-- sloleks_xml_checker.py | 0 test_data/accented_connected_text | 0 test_data/accented_data | 0 test_data/original_connected_text | 0 test_data/unaccented_dictionary | 0 tex_hyphenation.py | 0 text2SAMPA.py | 6 ++++++ workbench.py | 0 workbench.sh | 0 workbench.xrsl | 0 28 files changed, 12 insertions(+), 4 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 __init__.py mode change 100644 => 100755 accentuate.py mode change 100644 => 100755 accentuate_connected_text.py mode change 100644 => 100755 hyphenation mode change 100644 => 100755 learn_location_weights.py mode change 100644 => 100755 notes mode change 100644 => 100755 prepare_data.py mode change 100644 => 100755 preprocessed_data/environment.pkl mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 run_multiple_files.py mode change 100644 => 100755 sloleks_accentuation.py mode change 100644 => 100755 sloleks_accentuation2.py mode change 100644 => 100755 sloleks_accentuation2_tab2xml.py mode change 100644 => 100755 sloleks_accetuation.ipynb mode change 100644 => 100755 sloleks_accetuation2.ipynb mode change 100644 => 100755 sloleks_xml_checker.py mode change 100644 => 100755 test_data/accented_connected_text mode change 100644 => 100755 test_data/accented_data mode change 100644 => 100755 test_data/original_connected_text mode change 100644 => 100755 test_data/unaccented_dictionary mode change 100644 => 100755 tex_hyphenation.py mode change 100644 => 100755 text2SAMPA.py mode change 100644 => 100755 workbench.py mode change 100644 => 100755 workbench.sh mode change 100644 => 100755 workbench.xrsl diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index b7f581f..1fe4c96 --- a/.gitignore +++ b/.gitignore @@ -98,3 +98,4 @@ grid_results/ .idea/ cnn/word_accetuation/svm/data/ data_merge.ipynb +data_merge.py diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/__init__.py b/__init__.py old mode 100644 new mode 100755 diff --git a/accentuate.py b/accentuate.py old mode 100644 new mode 100755 diff --git a/accentuate_connected_text.py b/accentuate_connected_text.py old mode 100644 new mode 100755 diff --git a/hyphenation b/hyphenation old mode 100644 new mode 100755 diff --git a/learn_location_weights.py b/learn_location_weights.py old mode 100644 new mode 100755 diff --git a/notes b/notes old mode 100644 new mode 100755 diff --git a/prepare_data.py b/prepare_data.py old mode 100644 new mode 100755 diff --git a/preprocessed_data/environment.pkl b/preprocessed_data/environment.pkl old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/run_multiple_files.py b/run_multiple_files.py old mode 100644 new mode 100755 diff --git a/sloleks_accentuation.py b/sloleks_accentuation.py old mode 100644 new mode 100755 diff --git a/sloleks_accentuation2.py b/sloleks_accentuation2.py old mode 100644 new mode 100755 index 89277a3..09f22f1 --- a/sloleks_accentuation2.py +++ b/sloleks_accentuation2.py @@ -16,7 +16,7 @@ content = data._read_content('data/SlovarIJS_BESEDE_utf8.lex') dictionary, max_word, max_num_vowels, vowels, accented_vowels = data._create_dict(content) feature_dictionary = data._create_slovene_feature_dictionary() syllable_dictionary = data._create_syllables_dictionary(content, vowels) -accented_vowels = ['ŕ', 'á', 'ä', 'é', 'ë', 'ě', 'í', 'î', 'ó', 'ô', 'ö', 'ú', 'ü'] +accented_vowels = ['ŕ', 'á', 'à', 'é', 'è', 'ê', 'í', 'ì', 'ó', 'ô', 'ò', 'ú', 'ù'] data = Data('l', shuffle_all_inputs=False) letter_location_model, syllable_location_model, syllabled_letters_location_model = data.load_location_models( diff --git a/sloleks_accentuation2_tab2xml.py b/sloleks_accentuation2_tab2xml.py old mode 100644 new mode 100755 index d2bf890..c139982 --- a/sloleks_accentuation2_tab2xml.py +++ b/sloleks_accentuation2_tab2xml.py @@ -59,7 +59,8 @@ start_timer = time.time() print('Copy initialization complete') with open("data/new_sloleks/final_sloleks.xml", "ab") as myfile: # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') - for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): + for event, element in etree.iterparse('data/new_sloleks/final_sloleks_read.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): + # for event, element in etree.iterparse('data/Sloleks_v1.2.xml', tag="LexicalEntry", encoding="UTF-8", remove_blank_text=True): # if word_glob_num >= word_limit: # myfile2.close() # myfile2 = open('data/new_sloleks/p' + str(iter_index) + '.xml', 'ab') diff --git a/sloleks_accetuation.ipynb b/sloleks_accetuation.ipynb old mode 100644 new mode 100755 diff --git a/sloleks_accetuation2.ipynb b/sloleks_accetuation2.ipynb old mode 100644 new mode 100755 index cb0e7fd..a9ebaf8 --- a/sloleks_accetuation2.ipynb +++ b/sloleks_accetuation2.ipynb @@ -219,7 +219,6 @@ { "ename": "IndexError", "evalue": "index 10 is out of bounds for axis 0 with size 10", - "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", @@ -228,7 +227,8 @@ "\u001b[0;32m~/Developement/accetuation/prepare_data.py\u001b[0m in \u001b[0;36mget_ensemble_location_predictions\u001b[0;34m(input_words, letter_location_model, syllable_location_model, syllabled_letters_location_model, letter_location_co_model, syllable_location_co_model, syllabled_letters_location_co_model, dictionary, max_word, max_num_vowels, vowels, accented_vowels, feature_dictionary, syllable_dictionary)\u001b[0m\n\u001b[1;32m 1465\u001b[0m \u001b[0mletter_location_co_predictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_location_co_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1467\u001b[0;31m \u001b[0mletter_location_co_predictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse_predictions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mletter_location_co_predictions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_words\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvowels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1469\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m's'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle_all_inputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconvert_multext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreverse_inputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Developement/accetuation/prepare_data.py\u001b[0m in \u001b[0;36mreverse_predictions\u001b[0;34m(self, predictions, words, vowels)\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1504\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1505\u001b[0;31m \u001b[0mnew_predictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword_len\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1506\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1507\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnew_predictions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: index 10 is out of bounds for axis 0 with size 10" - ] + ], + "output_type": "error" } ], "source": [ diff --git a/sloleks_xml_checker.py b/sloleks_xml_checker.py old mode 100644 new mode 100755 diff --git a/test_data/accented_connected_text b/test_data/accented_connected_text old mode 100644 new mode 100755 diff --git a/test_data/accented_data b/test_data/accented_data old mode 100644 new mode 100755 diff --git a/test_data/original_connected_text b/test_data/original_connected_text old mode 100644 new mode 100755 diff --git a/test_data/unaccented_dictionary b/test_data/unaccented_dictionary old mode 100644 new mode 100755 diff --git a/tex_hyphenation.py b/tex_hyphenation.py old mode 100644 new mode 100755 diff --git a/text2SAMPA.py b/text2SAMPA.py old mode 100644 new mode 100755 index 895b559..97d0313 --- a/text2SAMPA.py +++ b/text2SAMPA.py @@ -86,6 +86,7 @@ def create_syllables(word, vowels): def convert_to_SAMPA(word): + word = word.lower() syllables = create_syllables(word, vowels) letters_in_stressed_syllable = [False] * len(word) # print(syllables) @@ -152,6 +153,11 @@ def convert_to_SAMPA(word): word = list(''.join(word)) + test_word = ''.join(word) + test_word = test_word.replace('"', '').replace(':', '') + if len(test_word) <= 1: + return ''.join(word) + previous_letter_i = -1 letter_i = 0 next_letter_i = 1 diff --git a/workbench.py b/workbench.py old mode 100644 new mode 100755 diff --git a/workbench.sh b/workbench.sh old mode 100644 new mode 100755 diff --git a/workbench.xrsl b/workbench.xrsl old mode 100644 new mode 100755