368 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			368 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import pickle
 | |
| import nltk
 | |
| import random
 | |
| from time import time
 | |
| import string
 | |
| from polyglot.text import Word
 | |
| import logging
 | |
| 
 | |
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Shared Snowball stemmer configured for English; stem_eng() delegates to it.
sno = nltk.stem.SnowballStemmer("english")
 | |
| 
 | |
| 
 | |
def dict_safe_key(dic, key):
    """Look up ``key`` in ``dic`` and always hand back a list.

    Args:
        dic: Mapping to read from; may be ``None``.
        key: Key to look up.

    Returns:
        ``[]`` when ``dic`` is ``None`` or does not contain ``key``; the stored
        value itself (the same object, not a copy) when it already is a list;
        otherwise a new one-element list wrapping the stored value.
    """
    if dic is None or key not in dic:
        return []
    value = dic[key]
    return value if isinstance(value, list) else [value]
 | |
| 
 | |
| 
 | |
def pickle_dump(data, path):
    """Pickle ``data`` into the file at ``path`` (opened in binary write mode).

    Logs an info message once the data has been written.

    Returns:
        Always ``True``; errors from opening the file or pickling propagate
        as exceptions.
    """
    with open(path, "wb") as handle:
        pickle.dump(data, handle)
        log.info("Dumped data to {}.".format(path))
    return True
 | |
| 
 | |
| 
 | |
def pickle_load(path):
    """Unpickle and return the object stored in the file at ``path``.

    Returns:
        The loaded object, or ``None`` when ``path`` is not an existing
        regular file. Errors raised while opening or unpickling propagate.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    files this project produced itself.
    """
    if not os.path.isfile(path):
        return None
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
        log.info("Loaded data from {}.".format(path))
    return loaded
 | |
| 
 | |
| 
 | |
# Bucket (least-significant-character radix) sort that orders words by the
# Slovenian alphabet.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
    """Map every letter of the Slovenian alphabet to its 1-based rank."""
    alphabet = "abcčdefghijklmnoprsštuvzž"
    return {letter: rank for rank, letter in enumerate(alphabet, start=1)}


slo_bucket_sort_alphabet = gen_sbs_alphabet()


def slo_bucket_sort(words, key=None):
    """Return the items of ``words`` sorted by the Slovenian alphabet.

    The sort distributes the items into one bin per letter, starting at the
    last character position and working towards the first. Each pass keeps
    the relative order of items that land in the same bin, so the overall
    sort is stable.

    Any character that is not in ``slo_bucket_sort_alphabet`` (punctuation,
    spaces, upper-case letters, q/w/x/y, ...) gets rank 0, the same rank as
    "no character at this position". Such characters therefore sort before
    "a", and a word sorts before every longer word it is a prefix of.

    Args:
        words: Iterable of items to sort. It is consumed exactly once, so
            generators are supported.
        key: Optional function mapping an item to the string to sort by.
            Defaults to the item itself. It is called once per item.

    Returns:
        A new list with the items in sorted order; the input is not modified.
    """
    if key is None:
        def key(x):
            return x

    # Pair each item with its sort string once: this avoids re-calling key()
    # on every pass and makes one-shot iterables safe to sort.
    pairs = [(key(word), word) for word in words]
    # One extra bin (index 0) for "unknown character / past the end".
    n_bins = len(slo_bucket_sort_alphabet) + 1
    max_len = max((len(kword) for kword, _ in pairs), default=0)

    for idx in range(max_len - 1, -1, -1):
        bins = [[] for _ in range(n_bins)]
        for pair in pairs:
            kword = pair[0]
            if idx < len(kword):
                score = slo_bucket_sort_alphabet.get(kword[idx], 0)
            else:
                score = 0
            bins[score].append(pair)
        pairs = [pair for one_bin in bins for pair in one_bin]

    return [word for _, word in pairs]
# Bucket sort <<<<<<<<<<<<<<<<<<<<
 | |
| 
 | |
| 
 | |
def stem_slo(x):
    """Crude Slovenian stemmer: drop the final morpheme of ``x``.

    The word is split into morphemes with polyglot's ``Word`` (language
    ``"sl"``) and everything except the last morpheme is joined back together.
    A word that yields one morpheme or none therefore stems to ``""``.
    """
    morphemes = Word(x, language="sl").morphemes
    return "".join(morphemes[:-1])
 | |
| 
 | |
| 
 | |
def stem_eng(x):
    """Return the stem of ``x`` produced by the module-level English Snowball stemmer ``sno``."""
    return sno.stem(x)
 | |
| 
 | |
| 
 | |
def tokenize(sentence, min_token_len=3, stem=None):
    """Split text into lower-cased, optionally stemmed word tokens.

    The text is first split into sentences and then into words with NLTK.
    Tokens made up solely of punctuation characters are dropped, the rest are
    lower-cased and passed through ``stem``.

    Args:
        sentence: Input text (may contain several sentences).
        min_token_len: Minimum length a stemmed token must have to be kept.
            ``None`` is treated as 0 (keep every length).
        stem: Optional function mapping a lower-cased token to its stem.
            Defaults to the identity function.

    Returns:
        List of token strings in order of appearance.
    """
    if stem is None:
        def stem(x):
            return x
    if min_token_len is None:
        # Comparing an int with None raises TypeError on Python 3.
        min_token_len = 0

    punctuation = set(string.punctuation)
    res = []
    for sent in nltk.sent_tokenize(sentence):
        for token in nltk.word_tokenize(sent):
            # ``token in string.punctuation`` would be a substring test and
            # miss runs such as "..." or ",,,"; check every character instead.
            if all(char in punctuation for char in token):
                continue
            stemmed = stem(token.lower())
            if len(stemmed) >= min_token_len:
                res.append(stemmed)
    return res
 | |
| 
 | |
| 
 | |
def tokenize_multiple(str_list, min_token_len=3, stem=None):
    """Tokenize every string in ``str_list`` and return one flat token list.

    Each string is passed to ``tokenize`` with the same ``min_token_len`` and
    ``stem``; the resulting tokens are concatenated in input order.
    """
    return [
        token
        for sentence in str_list
        for token in tokenize(sentence, min_token_len, stem)
    ]
 | |
| 
 | |
| 
 | |
def t_tokenize():
    """Manual smoke test: print a sample text followed by its tokens."""
    teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    print(teststring)
    # An explicit integer 0 disables the length filter while keeping the
    # length comparison inside tokenize() int-vs-int.
    res = tokenize(teststring, min_token_len=0)
    print(res)
 | |
| 
 | |
| 
 | |
def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):
    """Enumerate every way of picking one element from each row of ``list2d``.

    A path is a list of ``(row_index, column_index)`` tuples, one per row.
    Paths are produced in lexicographic order of their column indices, with
    the last row varying fastest.

    Args:
        list2d: Sequence of rows (each a sequence).
        x: Index of the last row already covered by ``current_path``;
            defaults to -1 (nothing chosen yet).
        y: Accepted for interface compatibility; not used.
        paths: List to append the finished paths to; a new list by default.
        current_path: Prefix shared by all generated paths; empty by default.

    Returns:
        ``paths`` with the generated paths appended. The number of paths is
        the product of the remaining row lengths, so it grows quickly.
    """
    if x is None:
        x = -1
    if paths is None:
        paths = []
    if current_path is None:
        current_path = []

    # Grow all partial paths one row at a time instead of recursing.
    partial = [current_path]
    for row in range(x + 1, len(list2d)):
        partial = [
            prefix + [(row, col)]
            for prefix in partial
            for col in range(len(list2d[row]))
        ]
    paths.extend(partial)
    return paths
 | |
| 
 | |
| 
 | |
def t_permute_paths():
    """Manual smoke test: print every sentence that can be built from the grid."""
    grid = [
        ["Greta"],
        ["backflips"],
        ["through", "around"],
        ["North Korea", "kindergarten"],
        ["with", "without"],
        ["a"],
        ["bag of", "abundance of"],
        ["bolts", "janitors"]
    ]

    print(grid)
    for path in permute_paths(list2d=grid):
        # Translate (row, column) index pairs back into the chosen words.
        chosen = []
        for row, col in path:
            chosen.append(grid[row][col])
        print(chosen)
 | |
| 
 | |
| 
 | |
def find_overlaps(list_a, list_b):
    """Find the maximal runs of consecutive elements shared by two sequences.

    For every pair of positions holding equal elements, the match is extended
    as far as possible to the right and to the left; the resulting run is
    recorded. Duplicate runs (same ``str()`` representation) are reported once,
    in order of first discovery. Discovery follows the sorted order of the
    shared elements, so the elements must be hashable and mutually orderable.

    Args:
        list_a: First sequence.
        list_b: Second sequence.

    Returns:
        List of lists, each one a shared run of elements.
    """
    # Element -> list of positions at which it occurs, per input.
    positions_a = {}
    for pos, item in enumerate(list_a):
        positions_a.setdefault(item, []).append(pos)
    positions_b = {}
    for pos, item in enumerate(list_b):
        positions_b.setdefault(item, []).append(pos)

    keys_a = sorted(positions_a)
    keys_b = sorted(positions_b)
    len_a = len(list_a)
    len_b = len(list_b)

    found = []
    ia = 0
    ib = 0
    # Merge-style walk over both sorted key lists to visit the shared keys.
    while ia < len(keys_a) and ib < len(keys_b):
        key_a = keys_a[ia]
        key_b = keys_b[ib]
        if key_a == key_b:
            for start_a in positions_a[key_a]:
                for start_b in positions_b[key_b]:
                    # Count matching elements from the anchor to the right.
                    fwd = 0
                    while (
                        start_a + fwd < len_a and
                        start_b + fwd < len_b and
                        list_a[start_a + fwd] == list_b[start_b + fwd]
                    ):
                        fwd += 1
                    # Count matching elements to the left of the anchor.
                    back = 0
                    while (
                        start_a - back - 1 >= 0 and
                        start_b - back - 1 >= 0 and
                        list_a[start_a - back - 1] == list_b[start_b - back - 1]
                    ):
                        back += 1
                    found.append(list(list_a[start_a - back:start_a + fwd]))
        if key_a < key_b:
            ia += 1
        else:
            ib += 1

    # Drop repeated runs while keeping first-seen order.
    seen = set()
    res = []
    for run in found:
        marker = str(run)
        if marker in seen:
            continue
        seen.add(marker)
        res.append(run)
    return res
 | |
| 
 | |
| 
 | |
def find_overlaps_str(tokens_a, tokens_b):
    """Find shared token n-grams (n = 1..4) of two token sequences.

    Shared n-grams are collected for each n, then filtered longest-first so
    that an n-gram is dropped when it occurs as a contiguous token run inside
    an already accepted longer one.

    Containment is checked token by token, so a token is never discarded
    merely because it is a substring of another token (e.g. "se" vs. "sem").
    Results are ordered by decreasing length and, within one length, by first
    occurrence in ``tokens_a``, which makes the output deterministic.

    Args:
        tokens_a: First sequence of string tokens.
        tokens_b: Second sequence of string tokens.

    Returns:
        List of token lists, one per retained shared n-gram.
    """
    def ngrams(tokens, n):
        # All contiguous windows of n tokens; empty when tokens is too short.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def contains(haystack, needle):
        # True when needle occurs as a contiguous token run in haystack.
        width = len(needle)
        return any(
            haystack[i:i + width] == needle
            for i in range(len(haystack) - width + 1)
        )

    overlaps = []
    for n in range(1, 5):
        in_b = set(ngrams(tokens_b, n))
        seen = set()
        for gram in ngrams(tokens_a, n):
            if gram in in_b and gram not in seen:
                seen.add(gram)
                overlaps.append(gram)

    res = []
    # sorted() is stable, so equal-length n-grams keep their tokens_a order.
    for gram in sorted(overlaps, key=len, reverse=True):
        if not any(contains(kept, gram) for kept in res):
            res.append(gram)
    return [list(gram) for gram in res]
 | |
| 
 | |
| 
 | |
def t_find_overlaps():
    """Manual benchmark: time find_overlaps on random integer lists.

    For each base size two lists of random integers in 0..100 are generated,
    each between 1.8x and 2x the base size long, and the wall-clock time of one
    find_overlaps call is printed together with the base size.
    """
    # Small hand-made example that can be used instead of the random input:
    #   list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5]
    #   list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5]
    #   res = find_overlaps(list_a, list_b)
    timings = []
    for size in (10, 100, 1000, 10000):
        len_a = size + int(size * random.uniform(0.8, 1))
        len_b = size + int(size * random.uniform(0.8, 1))
        seq_a = [random.randint(0, 100) for _ in range(len_a)]
        seq_b = [random.randint(0, 100) for _ in range(len_b)]
        started = time()
        find_overlaps(seq_a, seq_b)
        elapsed = time() - started
        timings.append({"time": elapsed, "input_size": size})

    for entry in timings:
        print(entry)
 | |
| 
 | |
| 
 | |
def t1_find_overlaps():
    """Manual smoke test: compare find_overlaps and find_overlaps_str output.

    Prints both sample texts, then the overlaps reported by find_overlaps,
    a blank line, and the overlaps reported by find_overlaps_str.
    """
    t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    t2 = "this is a seconde sentence. I hope my stuff works."
    print(t1)
    print(t2)

    for overlap in find_overlaps(tokenize(t1), tokenize(t2)):
        print(overlap)

    print()

    for overlap in find_overlaps_str(tokenize(t1), tokenize(t2)):
        print(overlap)
 | |
| 
 | |
| 
 | |
def t_find_overlaps_str():
    """Manual smoke test on Slovenian tokens for both overlap finders.

    Prints the result of find_overlaps first and then the result of
    find_overlaps_str, the function this test is named after.
    """
    t1 = [
        'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',
        'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
        'ulico', 'zapomnil', 'hišno', 'številko'
    ]
    t2 = [
        'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
        'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
        'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
        'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'
    ]
    res = find_overlaps(t1, t2)
    print(res)
    res_str = find_overlaps_str(t1, t2)
    print(res_str)
 | |
| 
 | |
| 
 | |
def t_slo_bucket_sort():
    """Manual test: sorting plain words and keyed tuples must agree.

    Reads one word per line from ./tests/m_besede2.txt, sorts the words
    directly and as (word, random digit) tuples keyed on the word, prints both
    results side by side and finally whether the word order was identical.
    """
    plain = []
    tagged = []
    with open("./tests/m_besede2.txt") as f:
        for line in f:
            word = line.split("\n")[0]  # strip the trailing newline
            plain.append(word)
            tagged.append((word, random.randint(0, 9)))

    plain = slo_bucket_sort(plain)
    tagged = slo_bucket_sort(tagged, key=lambda pair: pair[0])

    check = True
    for word, pair in zip(plain, tagged):
        if word != pair[0]:
            check = False
        print("{:<10}{:>10}".format(str(word), str(pair)))
    print(check)
 | |
| 
 | |
| 
 | |
def t1_slo_bucket_sort():
    """Manual smoke test: print a small word list in slo_bucket_sort order.

    The sample includes words with punctuation, spaces and letters outside
    the Slovenian alphabet.
    """
    words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
    words.extend(["test space", "test srrrr", "test saaa"])
    for word in slo_bucket_sort(words):
        print(word)
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Manual smoke tests; uncomment the one to run when this file is
    # executed directly.
    # t_find_overlaps()
    # t1_find_overlaps()
    # t_tokenize()
    # t_find_overlaps_str()
    t1_slo_bucket_sort()
 |