cjvt-valency/dip_src/valency/k_utils.py

import os
import pickle
import nltk
import random
from time import time
import string
from polyglot.text import Word
import logging

log = logging.getLogger(__name__)
sno = nltk.stem.SnowballStemmer("english")


def dict_safe_key(dic, key):
    """Look up ``key`` in ``dic`` and always hand back a list.

    Args:
        dic: Mapping to read from; may be ``None``.
        key: Key to look up.

    Returns:
        ``[]`` when ``dic`` is ``None`` or does not contain ``key``;
        the stored value itself when it already is a list (the same
        object, not a copy); otherwise a one-element list wrapping the
        stored value.
    """
    if dic is not None and key in dic:
        value = dic[key]
        return value if isinstance(value, list) else [value]
    return []


def pickle_dump(data, path):
    """Pickle ``data`` into the file at ``path`` and log the destination.

    Args:
        data: Any picklable object.
        path: Target file path; the file is opened in binary write mode,
            so existing content is replaced.

    Returns:
        Always ``True``; errors from ``open`` or ``pickle.dump`` propagate
        as exceptions.
    """
    with open(path, "wb") as sink:
        pickle.dump(data, sink)
        message = "Dumped data to {}.".format(path)
        log.info(message)
    return True


def pickle_load(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so this must only be
    pointed at files from a trusted source.

    Args:
        path: File to read.

    Returns:
        The unpickled object, or ``None`` when ``path`` is not an existing
        regular file. Errors raised while reading or unpickling propagate.
    """
    if not os.path.isfile(path):
        return None  # Missing file is reported as None, not as an error.
    with open(path, "rb") as source:
        loaded = pickle.load(source)
        log.info("Loaded data from {}.".format(path))
    return loaded


# Implemented bucket sort for alphabetically sorting slovenian words.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
    """Build the letter -> rank table for the Slovenian alphabet.

    Returns:
        A dict mapping each of the 25 lowercase letters to its 1-based
        position in alphabetical order ("a" -> 1, ..., "ž" -> 25).
        Rank 0 is deliberately left unused here so callers can reserve it.
    """
    alphabet = "abcčdefghijklmnoprsštuvzž"
    return dict(zip(alphabet, range(1, len(alphabet) + 1)))


# Rank table built once at import time and shared by the bucket sort.
slo_bucket_sort_alphabet = gen_sbs_alphabet()


def slo_bucket_sort(words, key=None):
    """Sort items by Slovenian alphabetical order using bucket passes.

    The sort runs one stable distribution pass per character position,
    from the last position of the longest key down to position 0, so the
    first character ends up being the most significant one.

    Args:
        words: Items to sort.
        key: Optional function returning the string to sort an item by;
            defaults to the item itself.

    Returns:
        A new list in sorted order. When no key has any characters (for
        example an empty input) no pass runs and ``words`` itself is
        returned.

    Characters missing from ``slo_bucket_sort_alphabet`` (uppercase
    letters, digits, punctuation, spaces, ...) and positions past the end
    of a key all fall into bucket 0, i.e. they sort before every letter.
    """
    if key is None:
        def key(item):
            return item

    def bucket_of(item, pos):
        # Bucket 0 doubles as "key too short" and "unknown character".
        text = key(item)
        if pos < len(text):
            return slo_bucket_sort_alphabet.get(text[pos]) or 0
        return 0

    longest = max((len(key(item)) for item in words), default=0)

    for pos in range(longest - 1, -1, -1):
        buckets = [[] for _ in range(len(slo_bucket_sort_alphabet) + 1)]
        for item in words:
            buckets[bucket_of(item, pos)].append(item)
        # Concatenate buckets in rank order; order inside a bucket is kept.
        words = [item for bucket in buckets for item in bucket]
    return words
# Bucket sort <<<<<<<<<<<<<<<<<<<<


def stem_slo(x):
    """Crude Slovenian stemmer: drop the last morpheme of ``x``.

    The word is split with polyglot's ``Word(...).morphemes`` and every
    piece except the final one is glued back together. A word that yields
    a single morpheme therefore stems to the empty string.

    Args:
        x: Word to stem.

    Returns:
        The concatenation of all morphemes but the last.
    """
    # NOTE(review): assumes ``morphemes`` is a sliceable sequence of str.
    pieces = Word(x, language="sl").morphemes
    return "".join(pieces[:-1])


def stem_eng(x):
    """Stem an English word with the module-level Snowball stemmer ``sno``.

    Args:
        x: Word to stem.

    Returns:
        Whatever ``sno.stem`` returns for ``x``.
    """
    stemmed = sno.stem(x)
    return stemmed


def tokenize(sentence, min_token_len=3, stem=None):
    """Split text into lowercase, optionally stemmed word tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``.

    Args:
        sentence: Input text (may contain several sentences).
        min_token_len: Minimum length a token must have *after* stemming
            to be kept. ``None`` is accepted and means "no minimum".
        stem: Optional function applied to each lowercased token; defaults
            to the identity.

    Returns:
        List of token strings in their original order.
    """
    if stem is None:
        def stem(x):
            return x
    if min_token_len is None:
        # A None limit cannot be compared with an int on Python 3;
        # treat it as "keep tokens of any length".
        min_token_len = 0

    punctuation = set(string.punctuation)
    res = []
    for sent in nltk.sent_tokenize(sentence):
        for token in nltk.word_tokenize(sent):
            # Skip tokens made up purely of punctuation characters. A plain
            # ``token in string.punctuation`` is a substring test and lets
            # multi-character tokens such as "..." or ",,," slip through.
            if all(ch in punctuation for ch in token):
                continue
            stemmed = stem(token.lower())
            if len(stemmed) >= min_token_len:
                res.append(stemmed)
    return res


def tokenize_multiple(str_list, min_token_len=3, stem=None):
    """Tokenize several texts and return all tokens as one flat list.

    Args:
        str_list: Iterable of texts; each is passed to ``tokenize``.
        min_token_len: Forwarded to ``tokenize``.
        stem: Forwarded to ``tokenize``.

    Returns:
        The tokens of every text, concatenated in input order.
    """
    return [
        token
        for text in str_list
        for token in tokenize(text, min_token_len, stem)
    ]


def t_tokenize():
    """Manual check: print a sample text and the tokens produced for it."""
    teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    print(teststring)
    # 0 rather than None: a None limit cannot be compared against a token
    # length on Python 3, while 0 keeps tokens of every length.
    res = tokenize(teststring, min_token_len=0)
    print(res)


def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):
    """List every way of choosing one column from each row of ``list2d``.

    A path is a list of ``(row_index, column_index)`` pairs, one per row.
    Paths come out in lexicographic order, the first row being the most
    significant. If any remaining row is empty, no path is produced.

    Args:
        list2d: Sequence of rows (each a sized sequence of alternatives).
        x: Index of the last row already fixed; ``None`` means start
            before the first row.
        y: Unused; kept for signature compatibility.
        paths: Accumulator list; new paths are appended to it in place.
            A fresh list is used when ``None``.
        current_path: Pairs already chosen for rows up to ``x``; a fresh
            empty list is used when ``None``.

    Returns:
        The accumulator list (``paths`` itself when one was supplied).
    """
    first_row = 0 if x is None else x + 1
    collected = [] if paths is None else paths
    prefix = [] if current_path is None else current_path

    # Grow all partial paths one row at a time instead of recursing.
    partial = [prefix]
    for row in range(first_row, len(list2d)):
        partial = [
            head + [(row, col)]
            for head in partial
            for col in range(len(list2d[row]))
        ]
    collected.extend(partial)
    return collected


def t_permute_paths():
    """Manual demo: print every sentence obtainable from the word grid."""
    options = [
        ["Greta"],
        ["backflips"],
        ["through", "around"],
        ["North Korea", "kindergarten"],
        ["with", "without"],
        ["a"],
        ["bag of", "abundance of"],
        ["bolts", "janitors"],
    ]

    print(options)
    for path in permute_paths(list2d=options):
        # Translate (row, column) index pairs back into the chosen words.
        print([options[row][col] for row, col in path])


def find_overlaps(list_a, list_b):
    """Find the contiguous runs of elements that both sequences share.

    For every pair of positions holding an equal element, the match is
    extended forwards and backwards for as long as the neighbouring
    elements keep agreeing. Runs whose ``str()`` form was already seen
    are dropped.

    Args:
        list_a: First sequence; elements must be hashable and sortable.
        list_b: Second sequence; same requirements.

    Returns:
        List of runs (each a list of elements taken from ``list_a``),
        ordered by the sorted shared element that produced them, then by
        position in ``list_a``, then by position in ``list_b``.
    """
    def index_positions(seq):
        # element -> list of indices at which it occurs, in ascending order
        table = {}
        for pos, item in enumerate(seq):
            table.setdefault(item, []).append(pos)
        return table

    def grow_match(pos_a, pos_b):
        # Count how far the two sequences agree ahead of / behind the seed.
        ahead = 0
        while (
            pos_a + ahead < len(list_a)
            and pos_b + ahead < len(list_b)
            and list_a[pos_a + ahead] == list_b[pos_b + ahead]
        ):
            ahead += 1
        behind = 0
        while (
            pos_a - behind - 1 >= 0
            and pos_b - behind - 1 >= 0
            and list_a[pos_a - behind - 1] == list_b[pos_b - behind - 1]
        ):
            behind += 1
        return [list_a[k] for k in range(pos_a - behind, pos_a + ahead)]

    where_a = index_positions(list_a)
    where_b = index_positions(list_b)
    keys_a = sorted(where_a)
    keys_b = sorted(where_b)

    matches = []
    seen = set()
    ia = 0
    ib = 0
    # Walk both sorted key lists in step to visit the shared elements.
    while ia < len(keys_a) and ib < len(keys_b):
        key_a = keys_a[ia]
        key_b = keys_b[ib]
        if key_a == key_b:
            for pos_a in where_a[key_a]:
                for pos_b in where_b[key_b]:
                    candidate = grow_match(pos_a, pos_b)
                    fingerprint = str(candidate)
                    if fingerprint not in seen:
                        seen.add(fingerprint)
                        matches.append(candidate)
        if key_a < key_b:
            ia += 1
        else:
            ib += 1
    return matches


def find_overlaps_str(tokens_a, tokens_b, max_n=4):
    """Find shared token n-grams, keeping only the maximal ones.

    All n-grams of length ``1 .. max_n`` that occur in both token lists
    are collected. Going from longest to shortest, an n-gram is kept only
    if it is not a contiguous token sub-sequence of an n-gram that was
    already kept.

    Containment is checked on whole tokens. Comparing the space-joined
    strings instead would wrongly discard a token such as "is" just
    because a kept token such as "this" contains those characters, and
    would break tokens that themselves contain spaces.

    Args:
        tokens_a: First token sequence (sliceable, hashable tokens).
        tokens_b: Second token sequence.
        max_n: Longest n-gram length considered; defaults to 4.

    Returns:
        List of overlaps, each a list of tokens, ordered by decreasing
        length and, within one length, by first occurrence in
        ``tokens_a`` (so the result is deterministic).
    """
    def ngrams(tokens, n):
        # range() is empty when the sequence is shorter than n.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def contains(longer, shorter):
        span = len(shorter)
        return any(
            longer[i:i + span] == shorter
            for i in range(len(longer) - span + 1)
        )

    kept = []
    for n in range(max_n, 0, -1):
        grams_in_b = set(ngrams(tokens_b, n))
        seen = set()
        for gram in ngrams(tokens_a, n):
            if gram in seen or gram not in grams_in_b:
                continue
            seen.add(gram)
            if not any(contains(longer, gram) for longer in kept):
                kept.append(gram)
    return [list(gram) for gram in kept]


def t_find_overlaps():
    """Rough timing harness for ``find_overlaps`` on random integer lists.

    For each nominal size, two lists of random integers in ``0..100`` are
    generated (each between 1.8x and 2x the nominal size), the call is
    timed, and the timings are printed at the end.
    """
    timings = []
    for size in (10, 100, 1000, 10000):
        len_a = size + int(size * random.uniform(0.8, 1))
        len_b = size + int(size * random.uniform(0.8, 1))
        seq_a = [random.randint(0, 100) for _ in range(len_a)]
        seq_b = [random.randint(0, 100) for _ in range(len_b)]
        started = time()
        find_overlaps(seq_a, seq_b)
        elapsed = time() - started
        timings.append({"time": elapsed, "input_size": size})
    for entry in timings:
        print(entry)


def t1_find_overlaps():
    """Manual demo: compare both overlap finders on two English texts."""
    first = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    second = "this is a seconde sentence. I hope my stuff works."
    print(first)
    print(second)

    # Element-wise run matching.
    for overlap in find_overlaps(tokenize(first), tokenize(second)):
        print(overlap)

    print()

    # N-gram based matching.
    for overlap in find_overlaps_str(tokenize(first), tokenize(second)):
        print(overlap)


def t_find_overlaps_str():
    """Manual demo: print the overlaps of two Slovenian token lists.

    NOTE(review): despite its name this helper calls ``find_overlaps``,
    not ``find_overlaps_str`` -- confirm which one was intended.
    """
    tokens_first = [
        'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',
        'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
        'ulico', 'zapomnil', 'hišno', 'številko',
    ]
    tokens_second = [
        'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
        'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
        'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
        'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore',
    ]
    print(find_overlaps(tokens_first, tokens_second))


def t_slo_bucket_sort():
    """Manual check that sorting with and without ``key`` agrees.

    Reads one word per line from ``./tests/m_besede2.txt``, sorts the bare
    words and ``(word, random_digit)`` pairs keyed on the word, prints
    both side by side, and finally prints whether every position matched.
    """
    plain = []
    tagged = []
    with open("./tests/m_besede2.txt") as handle:
        for line in handle:
            word = line.split("\n")[0]  # strip the trailing newline
            plain.append(word)
            tagged.append((word, random.randint(0, 9)))

    plain = slo_bucket_sort(plain)
    tagged = slo_bucket_sort(tagged, key=lambda pair: pair[0])

    all_equal = True
    for word, pair in zip(plain, tagged):
        all_equal &= (word == pair[0])
        print("{:<10}{:>10}".format(str(word), str(pair)))
    print(all_equal)


def t1_slo_bucket_sort():
    """Manual demo: sort a small word list and print it one item per line.

    The sample includes entries with punctuation, an underscore and
    embedded spaces, i.e. characters outside the sort alphabet.
    """
    samples = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
    samples.extend(["test space", "test srrrr", "test saaa"])
    for item in slo_bucket_sort(samples):
        print(item)


# Manual entry point: running this module directly executes one of the t_*
# demo helpers defined above. Uncomment a different call to run another one.
if __name__ == "__main__":
    # t_find_overlaps()
    # t1_find_overlaps()
    # t_tokenize()
    # t_find_overlaps_str()
    t1_slo_bucket_sort()
old files from diploma's poc 2019-03-07 08:00:01 +00:00			`import os`
			`import pickle`
			`import nltk`
			`import random`
			`from time import time`
			`import string`
			`from polyglot.text import Word`
			`import logging`

			`log = logging.getLogger(__name__)`
			`sno = nltk.stem.SnowballStemmer("english")`


			`def dict_safe_key(dic, key):`
			`# Returns a list, no matter what.`
			`# Transform 1 element into a list.`
			`# Return key not found as empty list.`
			`if (`
			`dic is None or`
			`key not in dic`
			`):`
			`return []`
			`subdic = dic[key]`
			`if not isinstance(subdic, list):`
			`return [subdic]`
			`return subdic`


			`def pickle_dump(data, path):`
			`with open(path, "wb") as file:`
			`pickle.dump(data, file)`
			`log.info("Dumped data to {}.".format(path))`
			`return True`


			`def pickle_load(path):`
			`ret = None`
			`if os.path.isfile(path):`
			`with open(path, "rb") as file:`
			`ret = pickle.load(file)`
			`log.info("Loaded data from {}.".format(path))`
			`return ret # Returns None in case of failure.`


			`# Implemented bucket sort for alphabetically sorting slovenian words.`
			`# Bucket sort >>>>>>>>>>>>>>>>>>>>`
			`def gen_sbs_alphabet():`
			`alphabet = "abcčdefghijklmnoprsštuvzž"`
			`return {letter: (idx + 1) for idx, letter in enumerate(alphabet)}`


			`slo_bucket_sort_alphabet = gen_sbs_alphabet()`


			`def slo_bucket_sort(words, key=None):`
			`if key is None:`
			`def key(x):`
			`return x`

			`def alph_score(word, idx):`
			`kword = key(word)`
			`if idx >= len(kword):`
			`return 0`
			`return slo_bucket_sort_alphabet.get(kword[idx]) or 0`

			`def list_to_bins(words, idx):`
			`bins = [[] for i in range(len(slo_bucket_sort_alphabet.keys()) + 1)]`
			`for word in words:`
			`bins[alph_score(word, idx)].append(word)`
			`return bins`

			`def bins_to_list(bins):`
			`lst = []`
			`for b in bins:`
			`for el in b:`
			`lst.append(el)`
			`return lst`

			`maxLen = 0`
			`for w in words:`
			`if len(key(w)) > maxLen:`
			`maxLen = len(key(w))`
			`maxIdx = maxLen - 1`
			`for idx in range(maxIdx, -1, -1):`
			`bins = list_to_bins(words, idx)`
			`words = bins_to_list(bins)`
			`"""`
			`print(idx)`
			`def get_letter(idx, word):`
			`kword = key(word)`
			`if idx < len(kword):`
			`return(kword[idx])`
			`return "#"`
			`print([(word, get_letter(idx, word)) for word in words])`
			`"""`
			`return words`
			`# Bucket sort <<<<<<<<<<<<<<<<<<<<`


			`def stem_slo(x):`
			`# Simplified;`
			`# Remove the last syllable.`
			`w = Word(x, language="sl").morphemes`
			`ret = "".join(w[:-1])`
			`return ret`


			`def stem_eng(x):`
			`return sno.stem(x)`


			`def tokenize(sentence, min_token_len=3, stem=None):`
			`# input: sentence string`
			`# output: list of token strings`
			`if stem is None:`
			`def stem(x):`
			`return x`
			`all_tokens = []`
			`sent_txt = nltk.sent_tokenize(sentence)`
			`for sent in sent_txt:`
			`tokens = nltk.word_tokenize(sent)`
			`all_tokens.extend(tokens)`
			`res = []`
			`for x in all_tokens:`
			`if x in string.punctuation:`
			`continue`
			`stemmed = stem(x.lower())`
			`if len(stemmed) >= min_token_len:`
			`res.append(stemmed)`
			`return res`


			`def tokenize_multiple(str_list, min_token_len=3, stem=None):`
			`# tstart = time()`
			`res = []`
			`for sentence in str_list:`
			`res.extend(tokenize(sentence, min_token_len, stem))`
			`# log.debug("tokenize_multiple: {:.2f}s".format(time() - tstart))`
			`return res`


			`def t_tokenize():`
			`teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"`
			`print(teststring)`
			`res = tokenize(teststring, min_token_len=None)`
			`print(res)`


			`def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):`
			`# python stuff`
			`if x is None:`
			`x = -1`
			`if paths is None:`
			`paths = []`
			`if current_path is None:`
			`current_path = []`

			`if x >= len(list2d) - 1:`
			`paths.append(current_path)`
			`return paths`
			`for i in range(len(list2d[x + 1])):`
			`tmp_path = current_path + [(x + 1, i)]`
			`# Computational complexity peoblem (prune long lists)`
			`# len == 12 -> 30%, len == 5 -> 100%`
			`# if random.randint(0, 100) <= (100 - 10 * (len(list2d) - 5)):`
			`if True:`
			`paths = permute_paths(`
			`list2d,`
			`x + 1,`
			`i,`
			`paths,`
			`tmp_path`
			`)`
			`return paths`


			`def t_permute_paths():`
			`list2d = [`
			`["Greta"],`
			`["backflips"],`
			`["through", "around"],`
			`["North Korea", "kindergarten"],`
			`["with", "without"],`
			`["a"],`
			`["bag of", "abundance of"],`
			`["bolts", "janitors"]`
			`]`

			`print(list2d)`
			`paths = permute_paths(list2d=list2d)`
			`for path in paths:`
			`print([list2d[p[0]][p[1]] for p in path])`


			`def find_overlaps(list_a, list_b):`
			`# Input: two lists.`
			`# Output: lists of overlapping elements.`
			`dict_a = {}`
			`dict_b = {}`
			`lists = [list_a, list_b]`
			`dicts = [dict_a, dict_b]`
			`for lidx in range(len(lists)):`
			`for elidx in range(len(lists[lidx])):`
			`el = lists[lidx][elidx]`
			`if el not in dicts[lidx]:`
			`dicts[lidx][el] = []`
			`dicts[lidx][el].append(elidx)`

			`substrings = []`

			`sda = sorted(dict_a.keys())`
			`sdb = sorted(dict_b.keys())`

			`i_sda = 0`
			`i_sdb = 0`
			`while ((i_sda < len(sda) and i_sdb < len(sdb))):`
			`if sda[i_sda] == sdb[i_sdb]:`
			`lia = dict_a[sda[i_sda]]`
			`lib = dict_b[sdb[i_sdb]]`
			`for llia in lia:`
			`for llib in lib:`
			`tmp_substr = []`
			`ii = 0`
			`while (`
			`(llia + ii < len(list_a)) and`
			`(llib + ii < len(list_b)) and`
			`(list_a[llia + ii] == list_b[llib + ii])`
			`):`
			`tmp_substr.append(list_a[llia + ii])`
			`ii += 1`
			`ii = 1`
			`while (`
			`(llia - ii >= 0) and`
			`(llib - ii >= 0) and`
			`(list_a[llia - ii] == list_b[llib - ii])`
			`):`
			`tmp_substr.insert(0, list_a[llia - ii])`
			`ii += 1`
			`substrings.append(tmp_substr)`
			`if sda[i_sda] < sdb[i_sdb]:`
			`i_sda += 1`
			`else:`
			`i_sdb += 1`

			`uniques = set()`
			`res = []`
			`for ss in substrings:`
			`if str(ss) not in uniques:`
			`uniques.add(str(ss))`
			`res.append(ss)`
			`return res`


			`def find_overlaps_str(tokens_a, tokens_b):`
			`# Strings only.`
			`overlaps = []`
			`for N in range(1, 5):`
			`ngrams_a = []`
			`for i in range(len(tokens_a)):`
			`if i + N <= len(tokens_a):`
			`ngrams_a.append(tuple(tokens_a[i:i + N]))`
			`ngrams_b = []`
			`for i in range(len(tokens_b)):`
			`if i + N <= len(tokens_b):`
			`ngrams_b.append(tuple(tokens_b[i:i + N]))`
			`overlaps.extend(list(set(ngrams_a).intersection(set(ngrams_b))))`

			`res = []`
			`for ovl in sorted(overlaps, key=lambda x: len(x), reverse=True):`
			`oovl = " ".join(ovl)`
			`for r in res:`
			`if oovl in r:`
			`break`
			`else:`
			`res.append(oovl)`
			`res[:] = [x.split(" ") for x in res]`
			`return res`


			`def t_find_overlaps():`
			`res = []`
			`input_len = [10, 100, 1000, 10000]`
			`for ll in input_len:`
			`alen = ll + int(ll * random.uniform(0.8, 1))`
			`blen = ll + int(ll * random.uniform(0.8, 1))`
			`a = [random.randint(0, 100) for x in range(alen)]`
			`b = [random.randint(0, 100) for x in range(blen)]`
			`tstart = time()`
			`find_overlaps(a, b)`
			`res.append({`
			`"time": time() - tstart,`
			`"input_size": ll`
			`})`
			`"""`
			`list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5]`
			`list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5]`
			`res = find_overlaps(list_a, list_b)`
			`"""`
			`for r in res:`
			`print(r)`


			`def t1_find_overlaps():`
			`t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"`
			`t2 = "this is a seconde sentence. I hope my stuff works."`
			`print(t1)`
			`print(t2)`
			`res = find_overlaps(tokenize(t1), tokenize(t2))`
			`for r in res:`
			`print(r)`

			`print()`

			`res = find_overlaps_str(tokenize(t1), tokenize(t2))`
			`for r in res:`
			`print(r)`


			`def t_find_overlaps_str():`
			`t1 = [`
			`'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',`
			`'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',`
			`'ulico', 'zapomnil', 'hišno', 'številko'`
			`]`
			`t2 = [`
			`'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',`
			`'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',`
			`'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',`
			`'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'`
			`]`
			`res = find_overlaps(t1, t2)`
			`print(res)`


			`def t_slo_bucket_sort():`
			`a1 = []`
			`a2 = []`
			`with open("./tests/m_besede2.txt") as f:`
			`for line in f:`
			`a1.append(line.split("\n")[0])`
			`a2.append((line.split("\n")[0], random.randint(0, 9)))`

			`a1 = slo_bucket_sort(a1)`
			`a2 = slo_bucket_sort(a2, key=lambda x: x[0])`

			`check = True`
			`for i in range(len(a1)):`
			`check &= (a1[i] == a2[i][0])`
			`print("{:<10}{:>10}".format(str(a1[i]), str(a2[i])))`
			`print(check)`


			`def t1_slo_bucket_sort():`
			`words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")`
			`words.append("test space")`
			`words.append("test srrrr")`
			`words.append("test saaa")`
			`for w in slo_bucket_sort(words):`
			`print(w)`


			`if __name__ == "__main__":`
			`# t_find_overlaps()`
			`# t1_find_overlaps()`
			`# t_tokenize()`
			`# t_find_overlaps_str()`
			`t1_slo_bucket_sort()`