[MAJOR UPDATE] Changed additional features to version 4, removed unnecessary input letters (unused vowels), and split the validation data into test data and validation data
This commit is contained in:
@@ -0,0 +1,101 @@
|
||||
import sys

# Make the repository root importable so that prepare_data can be found when
# this script is run from its own directory.
sys.path.insert(0, '../../../')

from prepare_data import *

# create_dict() and create_feature_dictionary() come from prepare_data (star
# import above).  Only `content` and `vowels` are used further down in this
# script; the remaining names are kept because other code may rely on them.
dictionary, max_word, max_num_vowels, content, vowels, accetuated_vowels = create_dict()
feature_dictionary = create_feature_dictionary(content)
|
||||
|
||||
|
||||
def read_hyphenation_pattern(path='../../../hyphenation', encoding='utf-8'):
    """Read the hyphenation pattern file and return its lines.

    Args:
        path: Location of the pattern file, one pattern per line.  Defaults
            to the path the script has always used.
        encoding: Text encoding of the file.  The patterns contain non-ASCII
            letters, so the encoding is stated explicitly instead of relying
            on the platform default.

    Returns:
        A list with one string per line of the file, without the line
        terminator.
    """
    with open(path, encoding=encoding) as f:
        # Strip only the newline: slicing off the last character would eat a
        # real pattern character when the final line has no trailing newline.
        return [line.rstrip('\n') for line in f]
|
||||
|
||||
|
||||
def find_hyphenation_patterns_in_text(text, pattern):
    """Return the start index of every occurrence of ``pattern`` in ``text``.

    Occurrences may overlap: after a match the search resumes one character
    further, not after the end of the match.

    Args:
        text: String to search in.
        pattern: Substring to look for.

    Returns:
        A list of ascending start indices; empty when there is no match.
    """
    positions = []
    start = 0
    while start < len(text):
        found = text.find(pattern, start)
        if found == -1:
            break
        positions.append(found)
        # Advance by one (not by len(pattern)) so overlapping matches such as
        # 'aa' in 'aaa' are all reported.
        start = found + 1
    return positions
|
||||
|
||||
|
||||
def create_hyphenation_dictionary(hyphenation_pattern):
    """Parse raw hyphenation patterns into letters plus positional weights.

    Every digit in a raw pattern is removed from the text and recorded
    together with the number of non-digit characters that precede it.

    Args:
        hyphenation_pattern: Iterable of raw pattern strings, e.g. ``'2z1z2'``.

    Returns:
        A list with one ``[letters, weights]`` entry per pattern, where
        ``letters`` is the pattern without digits and ``weights`` is a list of
        ``[offset, digit]`` pairs.  ``'2z1z2'`` becomes
        ``['zz', [[0, 2], [1, 1], [2, 2]]]``.
    """
    parsed = []
    for raw_pattern in hyphenation_pattern:
        letters = []
        weights = []
        for char in raw_pattern:
            if char.isdigit():
                # The offset is the count of letters seen so far, i.e. the
                # gap in front of the next letter.
                weights.append([len(letters), int(char)])
            else:
                letters.append(char)
        parsed.append([''.join(letters), weights])
    return parsed
|
||||
|
||||
|
||||
def split_hyphenated_word(split, word):
    """Cut a word into parts at every gap that carries an odd weight.

    Args:
        split: Gap weights for the delimited word, ``len(word) + 1`` entries.
            The first two and last two entries are dropped, leaving only the
            gaps between letters of the bare word.
        word: The word wrapped in one boundary character on each side
            (e.g. ``'.izziv.'``); the boundary characters are dropped.

    Returns:
        The list of word parts in order.  An empty word yields ``[]``.
    """
    gaps = split[2:-2]
    letters = word[1:-1]
    parts = []
    current = ''
    for position, letter in enumerate(letters):
        current += letter
        # Close the current part after the last letter, or whenever the gap
        # that follows this letter has an odd weight.
        if position == len(gaps) or gaps[position] % 2 == 1:
            parts.append(current)
            current = ''
    return parts
|
||||
|
||||
|
||||
def hyphenate_word(word, hyphenation_dictionary):
    """Hyphenate ``word`` using parsed hyphenation patterns.

    Args:
        word: The word to hyphenate.
        hyphenation_dictionary: Entries of the form
            ``[letters, [[offset, weight], ...]]`` as produced by
            ``create_hyphenation_dictionary``.

    Returns:
        The list of word parts returned by ``split_hyphenated_word``.
    """
    # NOTE(review): 'è' looks like a mis-decoded 'č' in the input data;
    # confirm against the data source.
    word = word.replace('è', 'č')
    # Wrap the word in '.' so patterns anchored at a word boundary can match.
    word = '.' + word + '.'
    # One weight per gap, including the gaps before and after the whole word.
    split = [0] * (len(word) + 1)
    for entry in hyphenation_dictionary:
        letters, weights = entry[0], entry[1]
        for start in find_hyphenation_patterns_in_text(word, letters):
            for anomaly in weights:
                position = start + anomaly[0]
                # Competing patterns are resolved by keeping the highest
                # weight seen for each gap.
                if split[position] < anomaly[1]:
                    split[position] = anomaly[1]
    return split_hyphenated_word(split, word)
|
||||
|
||||
|
||||
# Load the raw patterns and parse them once for the whole run.
hyphenation_pattern = read_hyphenation_pattern()
# Each parsed entry is [letters, [[offset, weight], ...]]; for example the raw
# pattern '2z1z2' becomes ['zz', [[0, 2], [1, 1], [2, 2]]].
hyphenation_dictionary = create_hyphenation_dictionary(hyphenation_pattern)

# Quick visual check on a single word before processing everything.
separated_word = hyphenate_word('izziv', hyphenation_dictionary)
print(separated_word)
|
||||
|
||||
# Hyphenate the first field of every entry in `content`, keeping the original
# word next to its parts: [word, [part, part, ...]].
all_words = []
total_words = len(content)
for i, el in enumerate(content):
    separated_word = hyphenate_word(el[0], hyphenation_dictionary)
    all_words.append([el[0], separated_word])
    # Progress report every 10000 words.
    if i % 10000 == 0:
        print(str(i)+'/'+str(total_words))
|
||||
|
||||
# Check every hyphenated part for exactly one vowel.
#   errors  - [word, parts] entries, appended once per part that fails.
#   errors2 - words that have a vowel-less part containing 'r', appended once
#             per 'r' in such a part.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation - confirm against the original.
errors = []
errors2 = []
for word in all_words:
    for hyphenated_part in word[1]:
        num_vowels = sum(1 for let in hyphenated_part if let in vowels)
        if num_vowels == 0:
            # With no vowel in the part, every 'r' is counted in its place
            # (presumably as a syllabic 'r' - verify).
            r_count = hyphenated_part.count('r')
            errors2.extend([word[0]] * r_count)
            num_vowels = r_count
        if num_vowels != 1:
            errors.append(word)
|
||||
Reference in New Issue
Block a user