# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
#
# 368 lines
# 9.7 KiB

import logging
import os
import pickle
import random
import string
from itertools import product
from time import time

import nltk
from polyglot.text import Word
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Shared NLTK Snowball stemmer for English; stem_eng() delegates to it.
sno = nltk.stem.SnowballStemmer("english")
def dict_safe_key(dic, key):
    """Look up ``key`` in ``dic`` and always hand back a list.

    Args:
        dic: Mapping to read from; may be ``None``.
        key: Key to look up.

    Returns:
        ``[]`` when ``dic`` is ``None`` or does not contain ``key``;
        the stored value itself when it already is a list;
        otherwise a one-element list wrapping the stored value.
    """
    if dic is None or key not in dic:
        return []
    value = dic[key]
    if isinstance(value, list):
        return value
    # A single element is wrapped so callers can always iterate.
    return [value]
def pickle_dump(data, path):
    """Pickle ``data`` into the file at ``path`` (overwriting it).

    Args:
        data: Any picklable object.
        path: Destination file path.

    Returns:
        ``True`` once the data has been written.
    """
    with open(path, "wb") as file:
        pickle.dump(data, file)
    # Log only after the file has been closed, i.e. fully flushed.
    log.info("Dumped data to {}.".format(path))
    return True
def pickle_load(path):
    """Load a pickled object from ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code; only call this on
    files that come from a trusted source.

    Args:
        path: File path to read.

    Returns:
        The unpickled object, or ``None`` when ``path`` is not an existing
        regular file.
    """
    if not os.path.isfile(path):
        return None  # Returns None in case of failure.
    with open(path, "rb") as file:
        ret = pickle.load(file)
    log.info("Loaded data from {}.".format(path))
    return ret
# Bucket sort for sorting Slovenian words alphabetically.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
    """Build the letter -> rank table of the Slovenian alphabet.

    Returns:
        dict mapping each lowercase letter to its 1-based position in the
        alphabet.  Rank 0 is left free for "no letter / unknown character".
    """
    alphabet = "abcčdefghijklmnoprsštuvzž"
    return {letter: rank for rank, letter in enumerate(alphabet, start=1)}


# Computed once at import time; slo_bucket_sort() reads it on every call.
slo_bucket_sort_alphabet = gen_sbs_alphabet()


def slo_bucket_sort(words, key=None):
    """Sort ``words`` by Slovenian alphabetical order (stable radix sort).

    The sort runs one bucket pass per character position, from the last
    position to the first.  Every pass is stable, so the final order is
    lexicographic with respect to ``slo_bucket_sort_alphabet``.

    Characters that are not in the alphabet table (uppercase letters,
    digits, punctuation, spaces, ...) share bucket 0 with "word is shorter
    than this position", so they sort before every known letter.

    Args:
        words: Iterable of items to sort.
        key: Optional function mapping an item to the string that is
            compared.  Defaults to the item itself.

    Returns:
        A new list with the items in sorted order.
    """
    if key is None:
        def key(x):
            return x

    words = list(words)
    n_bins = len(slo_bucket_sort_alphabet) + 1  # +1 for the "0" bucket

    def alph_score(word, idx):
        kword = key(word)
        if idx >= len(kword):
            return 0
        return slo_bucket_sort_alphabet.get(kword[idx], 0)

    max_len = max((len(key(w)) for w in words), default=0)
    # Least-significant position first; stability carries earlier passes.
    for idx in range(max_len - 1, -1, -1):
        bins = [[] for _ in range(n_bins)]
        for word in words:
            bins[alph_score(word, idx)].append(word)
        words = [word for bucket in bins for word in bucket]
    return words
# Bucket sort <<<<<<<<<<<<<<<<<<<<
def stem_slo(x):
    """Crudely stem a Slovenian word by dropping its final morpheme.

    The word is segmented with polyglot's ``Word(...).morphemes`` and all
    segments except the last one are joined back together.  A word that
    is segmented into a single morpheme therefore stems to ``""``.

    Args:
        x: The word to stem.

    Returns:
        The word without its last morpheme.
    """
    morphemes = Word(x, language="sl").morphemes
    return "".join(morphemes[:-1])
def stem_eng(x):
    """Return the English Snowball stem of ``x`` (uses the shared ``sno``)."""
    return sno.stem(x)
def tokenize(sentence, min_token_len=3, stem=None):
    """Split text into lowercased (optionally stemmed) word tokens.

    The text is first split into sentences and then into words with NLTK.
    Tokens made up solely of punctuation characters are discarded, the
    remaining ones are lowercased and passed through ``stem``.

    Args:
        sentence: Input text (may contain several sentences).
        min_token_len: Minimum length a token must have *after* stemming
            to be kept.  ``None`` disables the length filter.
        stem: Optional function ``str -> str`` applied to every lowercased
            token.  Defaults to the identity.

    Returns:
        List of token strings, in order of appearance.
    """
    if stem is None:
        def stem(x):
            return x
    if min_token_len is None:
        # Comparing an int with None raises on Python 3.
        min_token_len = 0

    punctuation = set(string.punctuation)
    res = []
    for sent in nltk.sent_tokenize(sentence):
        for token in nltk.word_tokenize(sent):
            # Character-wise test so that multi-character punctuation
            # tokens ("...", "''", ",,,") are dropped too; a substring
            # test against string.punctuation lets those through.
            if all(ch in punctuation for ch in token):
                continue
            stemmed = stem(token.lower())
            if len(stemmed) >= min_token_len:
                res.append(stemmed)
    return res
def tokenize_multiple(str_list, min_token_len=3, stem=None):
    """Tokenize every string in ``str_list`` and concatenate the results.

    Args:
        str_list: Iterable of text strings.
        min_token_len: Forwarded to ``tokenize``.
        stem: Forwarded to ``tokenize``.

    Returns:
        One flat list with the tokens of all strings, in input order.
    """
    res = []
    for sentence in str_list:
        res.extend(tokenize(sentence, min_token_len, stem))
    return res
def t_tokenize():
    """Manual check: tokenize a sample text without a length filter."""
    teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    # 0 keeps every token; comparing a length with None fails on Python 3.
    res = tokenize(teststring, min_token_len=0)
    print(res)
def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):
    """Enumerate every way of picking one element from each row of ``list2d``.

    A path is a list of ``(row_index, column_index)`` tuples, one per row.
    Paths are produced in depth-first order: the choice in the first row
    changes slowest, the choice in the last row fastest.

    The number of paths is the product of the row lengths, so the output
    grows very quickly for long inputs; no pruning is applied.

    Args:
        list2d: List of rows (each row a sequence of choices).
        x: Index of the last row already covered by ``current_path``;
            enumeration continues at row ``x + 1``.  Defaults to ``-1``
            (start at the first row).
        y: Unused; kept for backward compatibility.
        paths: Optional list to append the found paths to (modified in
            place).  Defaults to a fresh list.
        current_path: Path prefix prepended to every generated path.

    Returns:
        The ``paths`` list.  If any remaining row is empty, nothing is
        appended; if no rows remain, ``current_path`` alone is appended.
    """
    if x is None:
        x = -1
    if paths is None:
        paths = []
    if current_path is None:
        current_path = []

    start = x + 1
    remaining = list2d[start:]
    # product() over zero iterables yields one empty tuple, which matches
    # the "no rows left -> emit the prefix as-is" base case.
    for columns in product(*(range(len(row)) for row in remaining)):
        tail = [(start + offset, col) for offset, col in enumerate(columns)]
        paths.append(current_path + tail)
    return paths
def t_permute_paths():
    """Manual check: print every phrase combination of a small 2-D list."""
    list2d = [
        ["through", "around"],
        ["North Korea", "kindergarten"],
        ["with", "without"],
        ["bag of", "abundance of"],
        ["bolts", "janitors"],
    ]
    paths = permute_paths(list2d=list2d)
    for path in paths:
        # Each path entry is a (row, column) pair into list2d.
        print([list2d[row][col] for row, col in path])
def find_overlaps(list_a, list_b):
    """Return the distinct maximal contiguous runs shared by two sequences.

    For every pair of positions ``(i, j)`` with ``list_a[i] == list_b[j]``
    the match is extended as far as possible to the right and to the left;
    the resulting run is reported once, no matter how many position pairs
    produce it.

    Elements must be hashable, and the elements common to both inputs
    must be mutually orderable (they are visited in sorted order, which
    fixes the order of the result).

    Args:
        list_a: First sequence.
        list_b: Second sequence.

    Returns:
        List of runs (each a list of elements taken from ``list_a``).
        Empty when the sequences share no element.
    """
    def index_positions(seq):
        # element -> ascending list of indices where it occurs
        positions = {}
        for idx, el in enumerate(seq):
            positions.setdefault(el, []).append(idx)
        return positions

    pos_a = index_positions(list_a)
    pos_b = index_positions(list_b)
    len_a = len(list_a)
    len_b = len(list_b)

    seen = set()
    res = []
    for el in sorted(pos_a.keys() & pos_b.keys()):
        for ia in pos_a[el]:
            for ib in pos_b[el]:
                # Extend to the right of the seed match.
                fwd = 1
                while (
                    ia + fwd < len_a and
                    ib + fwd < len_b and
                    list_a[ia + fwd] == list_b[ib + fwd]
                ):
                    fwd += 1
                # Extend to the left of the seed match.
                back = 0
                while (
                    ia - back - 1 >= 0 and
                    ib - back - 1 >= 0 and
                    list_a[ia - back - 1] == list_b[ib - back - 1]
                ):
                    back += 1
                run = list(list_a[ia - back:ia + fwd])
                fingerprint = tuple(run)
                if fingerprint not in seen:
                    seen.add(fingerprint)
                    res.append(run)
    return res
def find_overlaps_str(tokens_a, tokens_b, max_n=4):
    """Find shared token n-grams, keeping only the longest ones.

    All n-grams of length 1 to ``max_n`` that occur in both token lists are
    collected.  Going from the longest to the shortest, an n-gram is kept
    unless it already occurs (as a contiguous token sequence) inside an
    n-gram that was kept before it.

    Containment is checked on tokens rather than on space-joined strings,
    so a token such as "a" is not mistaken for a part of "ab", and the
    result order does not depend on set iteration order.

    Args:
        tokens_a: First list of string tokens.
        tokens_b: Second list of string tokens.
        max_n: Longest n-gram length considered (default 4).

    Returns:
        List of overlaps, each a list of tokens; longer overlaps first,
        equal lengths in order of first appearance in ``tokens_a``.
    """
    def ngrams(tokens, n):
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def contains(haystack, needle):
        span = len(needle)
        return any(
            haystack[i:i + span] == needle
            for i in range(len(haystack) - span + 1)
        )

    overlaps = []
    for n in range(1, max_n + 1):
        grams_b = set(ngrams(tokens_b, n))
        already = set()
        for gram in ngrams(tokens_a, n):
            if gram in grams_b and gram not in already:
                already.add(gram)
                overlaps.append(gram)

    kept = []
    # sorted() is stable, so ties keep their order of first appearance.
    for ovl in sorted(overlaps, key=len, reverse=True):
        if not any(contains(longer, ovl) for longer in kept):
            kept.append(ovl)
    return [list(ovl) for ovl in kept]
def t_find_overlaps():
    """Manual check: time find_overlaps on random data, then print a sample.

    First measures the wall-clock time of ``find_overlaps`` for random
    integer lists of growing size, then prints the overlaps of two small
    fixed lists.
    """
    timings = []
    input_len = [10, 100, 1000, 10000]
    for ll in input_len:
        # Each list is between 1.8x and 2x the nominal input size.
        alen = ll + int(ll * random.uniform(0.8, 1))
        blen = ll + int(ll * random.uniform(0.8, 1))
        a = [random.randint(0, 100) for _ in range(alen)]
        b = [random.randint(0, 100) for _ in range(blen)]
        tstart = time()
        find_overlaps(a, b)
        timings.append({
            "time": time() - tstart,
            "input_size": ll
        })
    for timing in timings:
        print(timing)

    list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5]
    list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5]
    res = find_overlaps(list_a, list_b)
    for r in res:
        print(r)
def t1_find_overlaps():
    """Manual check: print the overlaps of two tokenized English texts.

    Runs both ``find_overlaps`` and ``find_overlaps_str`` on the same
    token lists so their outputs can be compared by eye.
    """
    t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    t2 = "this is a seconde sentence. I hope my stuff works."
    res = find_overlaps(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)
    res = find_overlaps_str(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)
def t_find_overlaps_str():
    """Manual check: print the overlaps of two Slovenian token lists.

    Prints the result of ``find_overlaps`` followed by the result of
    ``find_overlaps_str`` for the same input.
    """
    t1 = [
        'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',
        'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
        'ulico', 'zapomnil', 'hišno', 'številko'
    ]
    t2 = [
        'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
        'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
        'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
        'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'
    ]
    res = find_overlaps(t1, t2)
    print(res)
    # The helper is named after find_overlaps_str, so exercise it as well.
    res = find_overlaps_str(t1, t2)
    print(res)
def t_slo_bucket_sort():
    """Manual check: sorting with and without ``key`` must agree.

    Reads one word per line from ``./tests/m_besede2.txt``, sorts the plain
    words and ``(word, random_int)`` tuples keyed on the word, prints both
    side by side and finally prints whether the word order was identical.
    """
    a1 = []
    a2 = []
    # The word list contains Slovenian letters; do not rely on the
    # platform default encoding.
    with open("./tests/m_besede2.txt", encoding="utf-8") as f:
        for line in f:
            word = line.split("\n")[0]
            a1.append(word)
            a2.append((word, random.randint(0, 9)))
    a1 = slo_bucket_sort(a1)
    a2 = slo_bucket_sort(a2, key=lambda x: x[0])
    check = True
    for plain, tagged in zip(a1, a2):
        check &= (plain == tagged[0])
        print("{:<10}{:>10}".format(str(plain), str(tagged)))
    print(check)
def t1_slo_bucket_sort():
    """Manual check: print a small word list in bucket-sorted order.

    The sample includes words with punctuation, an underscore and spaces
    to show where characters outside the alphabet table end up.
    """
    words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
    words.append("test space")
    words.append("test srrrr")
    words.append("test saaa")
    for w in slo_bucket_sort(words):
        print(w)
# Script entry point. The manual test helpers listed here are commented
# out; uncomment the one you want to run.
if __name__ == "__main__":
# t_find_overlaps()
# t1_find_overlaps()
# t_tokenize()
# t_find_overlaps_str()