import os
import pickle
import logging
import random
import string
from time import time

import nltk
from polyglot.text import Word

log = logging.getLogger(__name__)

sno = nltk.stem.SnowballStemmer("english")


def dict_safe_key(dic, key):
    # Always returns a list:
    # a single element is wrapped in a list,
    # a missing key (or a None dict) yields an empty list.
    if dic is None or key not in dic:
        return []
    subdic = dic[key]
    if not isinstance(subdic, list):
        return [subdic]
    return subdic


def pickle_dump(data, path):
    with open(path, "wb") as file:
        pickle.dump(data, file)
    log.info("Dumped data to {}.".format(path))
    return True


def pickle_load(path):
    # Returns None in case of failure.
    ret = None
    if os.path.isfile(path):
        with open(path, "rb") as file:
            ret = pickle.load(file)
        log.info("Loaded data from {}.".format(path))
    return ret


# Bucket (radix) sort for alphabetically sorting Slovenian words.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
    alphabet = "abcčdefghijklmnoprsštuvzž"
    return {letter: (idx + 1) for idx, letter in enumerate(alphabet)}


slo_bucket_sort_alphabet = gen_sbs_alphabet()


def slo_bucket_sort(words, key=None):
    # LSD radix sort: repeatedly distribute into buckets by character
    # position, starting from the last position. Characters outside the
    # Slovenian alphabet (and positions past the end of a word) score 0,
    # so they sort before "a".
    if key is None:
        def key(x):
            return x

    def alph_score(word, idx):
        kword = key(word)
        if idx >= len(kword):
            return 0
        return slo_bucket_sort_alphabet.get(kword[idx]) or 0

    def list_to_bins(words, idx):
        bins = [[] for _ in range(len(slo_bucket_sort_alphabet) + 1)]
        for word in words:
            bins[alph_score(word, idx)].append(word)
        return bins

    def bins_to_list(bins):
        lst = []
        for b in bins:
            for el in b:
                lst.append(el)
        return lst

    max_len = 0
    for w in words:
        if len(key(w)) > max_len:
            max_len = len(key(w))
    max_idx = max_len - 1

    for idx in range(max_idx, -1, -1):
        bins = list_to_bins(words, idx)
        words = bins_to_list(bins)
    return words
# Bucket sort <<<<<<<<<<<<<<<<<<<<


def stem_slo(x):
    # Simplified stemming: drop the last morpheme.
    w = Word(x, language="sl").morphemes
    ret = "".join(w[:-1])
    return ret


def stem_eng(x):
    return sno.stem(x)


def tokenize(sentence, min_token_len=3, stem=None):
    # input: sentence string
    # output: list of token strings
    if min_token_len is None:
        min_token_len = 0
    if stem is None:
        def stem(x):
            return x

    all_tokens = []
    sent_txt = nltk.sent_tokenize(sentence)
    for sent in sent_txt:
        tokens = nltk.word_tokenize(sent)
        all_tokens.extend(tokens)

    res = []
    for x in all_tokens:
        if x in string.punctuation:
            continue
        stemmed = stem(x.lower())
        if len(stemmed) >= min_token_len:
            res.append(stemmed)
    return res


def tokenize_multiple(str_list, min_token_len=3, stem=None):
    # tstart = time()
    res = []
    for sentence in str_list:
        res.extend(tokenize(sentence, min_token_len, stem))
    # log.debug("tokenize_multiple: {:.2f}s".format(time() - tstart))
    return res
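
# Usage sketch for tokenize() with the English stemmer; the sentence and the
# t0_ name are illustrative, following the t_* convention used below. Assumes
# the NLTK "punkt" models are installed (nltk.download("punkt")); the exact
# tokens depend on the NLTK version.
def t0_tokenize_stemmed():
    sentence = "Stemming normalizes related words. It helps matching."
    print(tokenize(sentence, min_token_len=3, stem=stem_eng))
    # e.g. ['stem', 'normal', 'relat', 'word', 'help', 'match']
    # ("it" is dropped because its stem is shorter than min_token_len)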
asdf ,,,;" print(teststring) res = tokenize(teststring, min_token_len=None) print(res) def permute_paths(list2d, x=None, y=None, paths=None, current_path=None): # python stuff if x is None: x = -1 if paths is None: paths = [] if current_path is None: current_path = [] if x >= len(list2d) - 1: paths.append(current_path) return paths for i in range(len(list2d[x + 1])): tmp_path = current_path + [(x + 1, i)] # Computational complexity peoblem (prune long lists) # len == 12 -> 30%, len == 5 -> 100% # if random.randint(0, 100) <= (100 - 10 * (len(list2d) - 5)): if True: paths = permute_paths( list2d, x + 1, i, paths, tmp_path ) return paths def t_permute_paths(): list2d = [ ["Greta"], ["backflips"], ["through", "around"], ["North Korea", "kindergarten"], ["with", "without"], ["a"], ["bag of", "abundance of"], ["bolts", "janitors"] ] print(list2d) paths = permute_paths(list2d=list2d) for path in paths: print([list2d[p[0]][p[1]] for p in path]) def find_overlaps(list_a, list_b): # Input: two lists. # Output: lists of overlapping elements. dict_a = {} dict_b = {} lists = [list_a, list_b] dicts = [dict_a, dict_b] for lidx in range(len(lists)): for elidx in range(len(lists[lidx])): el = lists[lidx][elidx] if el not in dicts[lidx]: dicts[lidx][el] = [] dicts[lidx][el].append(elidx) substrings = [] sda = sorted(dict_a.keys()) sdb = sorted(dict_b.keys()) i_sda = 0 i_sdb = 0 while ((i_sda < len(sda) and i_sdb < len(sdb))): if sda[i_sda] == sdb[i_sdb]: lia = dict_a[sda[i_sda]] lib = dict_b[sdb[i_sdb]] for llia in lia: for llib in lib: tmp_substr = [] ii = 0 while ( (llia + ii < len(list_a)) and (llib + ii < len(list_b)) and (list_a[llia + ii] == list_b[llib + ii]) ): tmp_substr.append(list_a[llia + ii]) ii += 1 ii = 1 while ( (llia - ii >= 0) and (llib - ii >= 0) and (list_a[llia - ii] == list_b[llib - ii]) ): tmp_substr.insert(0, list_a[llia - ii]) ii += 1 substrings.append(tmp_substr) if sda[i_sda] < sdb[i_sdb]: i_sda += 1 else: i_sdb += 1 uniques = set() res = [] for ss in substrings: if str(ss) not in uniques: uniques.add(str(ss)) res.append(ss) return res def find_overlaps_str(tokens_a, tokens_b): # Strings only. overlaps = [] for N in range(1, 5): ngrams_a = [] for i in range(len(tokens_a)): if i + N <= len(tokens_a): ngrams_a.append(tuple(tokens_a[i:i + N])) ngrams_b = [] for i in range(len(tokens_b)): if i + N <= len(tokens_b): ngrams_b.append(tuple(tokens_b[i:i + N])) overlaps.extend(list(set(ngrams_a).intersection(set(ngrams_b)))) res = [] for ovl in sorted(overlaps, key=lambda x: len(x), reverse=True): oovl = " ".join(ovl) for r in res: if oovl in r: break else: res.append(oovl) res[:] = [x.split(" ") for x in res] return res def t_find_overlaps(): res = [] input_len = [10, 100, 1000, 10000] for ll in input_len: alen = ll + int(ll * random.uniform(0.8, 1)) blen = ll + int(ll * random.uniform(0.8, 1)) a = [random.randint(0, 100) for x in range(alen)] b = [random.randint(0, 100) for x in range(blen)] tstart = time() find_overlaps(a, b) res.append({ "time": time() - tstart, "input_size": ll }) """ list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5] list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5] res = find_overlaps(list_a, list_b) """ for r in res: print(r) def t1_find_overlaps(): t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;" t2 = "this is a seconde sentence. I hope my stuff works." 
def t1_find_overlaps():
    t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    t2 = "this is a second sentence. I hope my stuff works."
    print(t1)
    print(t2)
    res = find_overlaps(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)
    print()
    res = find_overlaps_str(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)


def t_find_overlaps_str():
    t1 = [
        'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje', 'v',
        'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
        'ulico', 'zapomnil', 'hišno', 'številko'
    ]
    t2 = [
        'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
        'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
        'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
        'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'
    ]
    res = find_overlaps(t1, t2)
    print(res)


def t_slo_bucket_sort():
    a1 = []
    a2 = []
    with open("./tests/m_besede2.txt") as f:
        for line in f:
            a1.append(line.split("\n")[0])
            a2.append((line.split("\n")[0], random.randint(0, 9)))
    a1 = slo_bucket_sort(a1)
    a2 = slo_bucket_sort(a2, key=lambda x: x[0])
    check = True
    for i in range(len(a1)):
        check &= (a1[i] == a2[i][0])
        print("{:<10}{:>10}".format(str(a1[i]), str(a2[i])))
    print(check)


def t1_slo_bucket_sort():
    words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
    words.append("test space")
    words.append("test srrrr")
    words.append("test saaa")
    for w in slo_bucket_sort(words):
        print(w)


if __name__ == "__main__":
    # t_find_overlaps()
    # t1_find_overlaps()
    # t_tokenize()
    # t_find_overlaps_str()
    t1_slo_bucket_sort()
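

# Round-trip sketch for the pickle helpers above; the temp-file path and the
# t0_ name are ours. Defined below the __main__ guard, so either call it from
# an importing module or move it above the guard before wiring it into
# __main__.
def t0_pickle_roundtrip():
    import tempfile
    path = os.path.join(tempfile.gettempdir(), "util_demo.pickle")
    pickle_dump({"words": ["zebra", "bober"]}, path)
    print(pickle_load(path))  # expected: {'words': ['zebra', 'bober']}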