frontend_devops fix

2019-03-20 17:49:34 +01:00
parent fbe9eb7b0f
commit aab075a291
96 changed files with 4 additions and 4 deletions

dip_src/valency/.gitignore vendored Normal file
@@ -0,0 +1 @@
*.pickle

dip_src/valency/dictionary_interface.py Normal file
@@ -0,0 +1,386 @@
from valency import k_utils
import logging
from time import time
from valency.k_utils import dict_safe_key as dsk
from copy import deepcopy as DC
log = logging.getLogger(__name__)
# Upper limit for how many senses a lemma can have.
GUL = 20
SLOWNET_CACHE = "slownet_glosses_cache"
class DictionaryInterface:
def __init__(self, vallex, dictionary):
self.vallex = vallex
self.dictionary = "interface"
def find(self, lemma):
return []
    def contains(self, lemma, upper_limit=GUL):
        # NOTE: find() alone is not quite right; this should really check
        # that sense_glosses() returns a non-empty list.
        res = self.find(lemma)
        if upper_limit is not None and len(res) > upper_limit:
            return False
        return len(res) != 0
def cached_glosses(self, lemma):
        # preprocessed sense_glosses (not used)
res = list(self.vallex.db.cached_glosses.find(
{"lemma": lemma, "dictionary": self.dictionary}))
if len(res) == 0:
return []
return res[0]["glosses"]
def sense_glosses(self, lemma):
        # returns a list with one gloss per sense
        # gloss: {"gloss": ["<sense>", ...], "def": ["<sense>", ...]}
return "dictionary_interface.py: not_yet_implemented"
    # Recursively pull strings out of a dictionary,
    # based on a list of keys.
    # Uses self.recursion_buffer.
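    # e.g. pull_strings_wrapper({"a": {"b": "x"}, "c": "y"}, ["a", "b"])
    # returns ["x"]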
def pull_strings_wrapper(self, element, keys):
if element is None:
return []
self.recursion_buffer = []
self.pull_strings(element, keys)
return self.recursion_buffer[:]
def pull_strings(self, element, keys):
# Recursively pull values out of a dict.
# correct key + element as string or list of strings
for k, e in element.items():
if k not in keys:
continue
if isinstance(e, dict):
self.pull_strings(e, keys)
elif isinstance(e, str):
self.recursion_buffer.append(e)
elif isinstance(e, list):
for ee in e:
if isinstance(ee, dict):
self.pull_strings(ee, keys)
elif isinstance(ee, str):
self.recursion_buffer.append(ee)
class Sskj(DictionaryInterface):
def __init__(self, vallex):
super().__init__(vallex, "sskj")
def find(self, lemma):
res = list(self.vallex.db.sskj.find(
{"ns0:entry.ns0:form.ns0:orth": lemma}
))
return res
def sense_glosses(self, lemma, upper_limit=GUL):
entries = self.find(lemma)
if upper_limit is not None and len(entries) > upper_limit:
log.info("sense_glosses({}): too many sense entries".format(lemma))
return []
senses = []
if len(entries) == 0:
return []
for e in entries:
senses.extend(dsk(
e["ns0:entry"], "ns0:sense"))
keys = [
"ns0:def", "ns0:cit", "ns0:quote",
"ns0:gloss", "ns0:sense", "ns0:orth",
"ns0:form", "#text"
]
glosses = []
for s in senses:
gloss = self.pull_strings_wrapper(s, keys)
if len(gloss) == 0:
continue
glosses.append({
"gloss": gloss,
"def": self.pull_strings_wrapper(s, ["ns0:sense", "ns0:def"])
})
return glosses
class SloWnet(DictionaryInterface):
def __init__(self, vallex):
super().__init__(vallex, "slownet")
self.hypernym_buffer = []
def slo_to_eng(self, lemma):
def helper_get_eng_lemmas(r):
res = []
for literal in dsk(r, "SYNONYM"):
if literal["@xml:lang"] == "en":
for lt in dsk(literal, "LITERAL"):
res.append(lt["#text"])
return res
        # takes a Slovene lemma, returns a list of its English counterparts
results = self.find(lemma)
eng_lemmas = []
for r in results:
eng_lemmas.extend(helper_get_eng_lemmas(r))
return eng_lemmas
def helper_get_hypernyms(self, entry):
res = []
dd = dsk(entry, "ILR")
for d in dd:
if d["@type"] == "hypernym":
res.append(d["#text"])
return res
def helper_get_en_literals(self, entry):
res = []
synonyms = dsk(entry, "SYNONYM")
for syn in synonyms:
if syn["@xml:lang"] == "en":
literals = dsk(syn, "LITERAL")
for lit in literals:
res.append(lit["#text"])
return res
def rek_root_chain(self, slownet_id):
entry = self.find_by_id(slownet_id)
if entry is None:
return []
res = self.helper_get_en_literals(entry)
        for hypernym_id in self.helper_get_hypernyms(entry):
res.extend(self.rek_root_chain(hypernym_id))
return res
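    # root_chain: English literals for the lemma plus the literals of every
    # hypernym up to the root; results are cached in db.cached_root_chains.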
def root_chain(self, lemma):
cached = list(self.vallex.db.cached_root_chains.find({
"lemma": lemma
}))
if cached:
return cached[0]["data"]
res = self.slo_to_eng(lemma)
entries = self.find(lemma)
start_hypernym_ids = []
for ent in entries:
start_hypernym_ids.extend(self.helper_get_hypernyms(ent))
for shi in start_hypernym_ids:
res.extend(self.rek_root_chain(shi))
self.vallex.db.cached_root_chains.insert({
"lemma": lemma,
"data": res
})
return res
def find_by_id(self, slownet_id):
res = list(self.vallex.db.slownet.find({"ID": slownet_id}))
if len(res) == 0:
log.error("ID: {} not in db.slownet.".format(slownet_id))
return None
return res[0]
def find(self, lemma):
return list(self.vallex.db.slownet.find({"slo_lemma": lemma}))
"""
# elemMatch for array query
res = list(self.vallex.db.slownet.find({
"SYNONYM": {'$elemMatch': {
"LITERAL": {'$elemMatch': {"#text": lemma}}
}}
}))
"""
def hypernyms(self, slownet_id, level):
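        # recursive hypernym walk over ILR links; recursion stops at level 3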
if level == 3:
return
elements = list(self.vallex.db.slownet.find({"ID": slownet_id}))
if len(elements) == 0:
return
for e in elements:
ei = self.extract_element_info(e)
self.hypernym_buffer.append({
"def": ei["domain"] + ei["def"],
"gloss": ei["domain"] + ei["def"] + ei["usage"]
})
for ilr in ei["ilr"]:
self.hypernyms(ilr, level + 1)
def extract_element_info(self, e):
domain = []
dd = dsk(e, "DOMAIN")
for d in dd:
domain.append(d)
definition = []
dd = dsk(e, "DEF")
for d in dd:
if d["@xml:lang"] == "en":
definition.append(d["#text"])
ilr = []
dd = dsk(e, "ILR")
for d in dd:
if d["@type"] == "hypernym":
ilr.append(d["#text"])
usage = []
dd = dsk(e, "USAGE")
for d in dd:
if d["@xml:lang"] == "en":
usage.append(d["#text"])
return {
"domain": domain,
"def": definition,
"ilr": ilr,
"usage": usage,
}
def sense_glosses(self, lemma, upper_limit=GUL):
# stime = time()
# caching
db_key = {
"lemma": lemma,
"upper_limit": upper_limit
}
cache = list(self.vallex.db[SLOWNET_CACHE].find(db_key))
if len(cache) > 0:
return cache[0]["data"]
entries = self.find(lemma)
if upper_limit is not None and len(entries) > upper_limit:
# log.info("sense_glosses({}): too many senses".format(lemma))
return []
ret_glosses = []
for e in entries:
defs = []
glosses = []
self.hypernym_buffer = []
ei = self.extract_element_info(e)
self.hypernym_buffer.append({
"def": ei["domain"] + ei["def"],
"gloss": ei["domain"] + ei["def"] + ei["usage"]
})
for ilr in ei["ilr"]:
self.hypernyms(ilr, 1)
[defs.extend(x["def"]) for x in self.hypernym_buffer]
[glosses.extend(x["gloss"]) for x in self.hypernym_buffer]
ret_glosses.append({
"def": defs,
"gloss": glosses,
})
# log.debug("slownet.sense_glosses({}): {:.2f}s".format(
# lemma, time() - stime))
# caching
db_entry = {
"lemma": db_key["lemma"],
"upper_limit": db_key["upper_limit"],
"data": ret_glosses
}
        # write to the same collection the cache is read from above
        self.vallex.db[SLOWNET_CACHE].update(
            db_key, db_entry, upsert=True
        )
return ret_glosses
class Sskj2(DictionaryInterface):
def __init__(self, vallex):
super().__init__(vallex, "sskj")
def find(self, lemma):
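        # adjective headwords carry a trailing "_" (added by seqparser);
        # everything else is looked up as a verb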
pos = "glagol"
if lemma[-1] == "_":
pos = "pridevnik"
res = list(self.vallex.db.sskj.find({
"izt_clean": lemma,
"pos": pos
}))
return res
def count_senses(self, lemma):
entries = self.find(lemma)
if len(entries) == 0:
return 0
ol = dsk(entries[0], "ol")
if len(ol) == 0:
return 1
return len(ol[0]["li"])
def sense_glosses(self, lemma, upper_limit=GUL):
def helper_dict_safe_add(dic, key, el):
if key not in dic:
dic[key] = []
dic[key].append(el)
def helper_pull_rec(el_lst, res_dct):
for el in el_lst:
if isinstance(el, dict):
if ("@title" in el) and ("#text" in el):
helper_dict_safe_add(
res_dct, el["@title"], el["#text"])
if "span" in el:
helper_pull_rec(dsk(el, "span"), res_dct)
if ("ol" in el) and ("li" in el["ol"]):
helper_pull_rec(el["ol"]["li"], res_dct)
if "li" in el:
helper_pull_rec(el["li"], res_dct)
entries = self.find(lemma)
if len(entries) == 0:
return []
if len(entries) > 1:
log.warning("{} entries for {} in sskj2.".format(
len(entries), lemma))
glosses_per_entry = []
for idx, entry in enumerate(entries):
res_dict = {}
if "span" in entry:
helper_pull_rec(dsk(entry, "span"), res_dict)
# senses
res_dict["senses"] = []
if ("ol" in entry) and ("li" in entry["ol"]):
for el in dsk(entry["ol"], "li"):
tmp = {"sskj_sense_id": el["span"][0]}
helper_pull_rec(dsk(el, "span"), tmp)
helper_pull_rec(dsk(el, "ol"), tmp)
res_dict["senses"].append(DC(tmp))
def helper_create_gloss(dct):
keys = ["Razlaga", "Zgled", "Stranska razlaga", "Sopomenka"]
ret = []
for k in keys:
ret.extend(dsk(dct, k))
return ret
glosses = []
n_senses = len(res_dict["senses"])
            if n_senses == 0:
                glosses.append({
                    "sskj_sense_id": "1-1",
                    "gloss": helper_create_gloss(res_dict),
                    "def": dsk(res_dict, "Razlaga")
                })
                # keep processing remaining entries instead of returning early
                glosses_per_entry.append(glosses)
                continue
for sense in res_dict["senses"]:
glosses.append({
"sskj_sense_id": "{}-{}".format(
sense["sskj_sense_id"], n_senses),
"gloss": helper_create_gloss(sense),
"def": dsk(sense, "Razlaga")
})
glosses_per_entry.append(glosses)
        # prefix each sense id with its entry id:
        # entry_id-sskj_sense_id-n_senses
all_glosses = []
for idx, glosses in enumerate(glosses_per_entry):
entry_id = idx + 1 # start with 1
for gloss in glosses:
gloss["sskj_sense_id"] = "{}-{}".format(
entry_id, gloss["sskj_sense_id"])
all_glosses.append(gloss)
return all_glosses

dip_src/valency/frame.py Normal file
@@ -0,0 +1,96 @@
import logging
log = logging.getLogger(__name__)
class Frame():
def __init__(self, tids, deep_links=None, slots=None, hw=None):
self.hw = hw
self.tids = tids # list of tokens with the same hw_lemma
# Each tid = "S123.t123";
# you can get sentence with vallex.get_sentence(S123)
self.slots = []
if slots is None:
self.slots = self.init_slots(deep_links)
else:
self.slots = slots
self.sense_info = {}
self.sentences = None # Used for passing to view in app.py, get_frames
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
def to_json(self):
ret = {
"hw": self.hw,
"tids": self.tids,
"slots": [slot.to_json() for slot in self.slots],
"sentences": self.sentences,
"aggr_sent": self.aggr_sent,
"sense_info": self.sense_info
}
return ret
def init_slots(self, deep):
slots = []
for link in deep:
slots.append(Slot(
functor=link["functor"],
tids=[link["to"]]
))
return slots
def sort_slots(self):
# ACT, PAT, alphabetically
srt1 = [
x for x in self.slots
if (x.functor == "ACT" or
x.functor == "PAT")
]
srt1 = sorted(srt1, key=lambda x: x.functor)
srt2 = [
x for x in self.slots
if (x.functor != "ACT" and
x.functor != "PAT")
]
srt2 = sorted(srt2, key=lambda x: x.functor)
self.slots = (srt1 + srt2)
def to_string(self):
ret = "Frame:\n"
ret += "sense_info: {}\n".format(str(self.sense_info))
ret += "tids: ["
for t in self.tids:
ret += (str(t) + ", ")
ret += "]\n"
if self.slots is not None:
ret += "slots:\n"
for sl in self.slots:
ret += (sl.to_string() + "\n")
return ret
class Slot():
# Each slot is identified by its functor (ACT, PAT, ...)
# It consists of different tokens.
def __init__(self, functor, tids=None, count=None):
self.functor = functor
self.tids = tids or []
self.count = count or 1
def to_string(self):
ret = "---- Slot:\n"
ret += "functor: {}\n".format(self.functor)
ret += "tids: ["
for t in self.tids:
ret += (str(t) + ", ")
ret += "]\n"
ret += "]\n"
ret += "----\n"
return ret
def to_json(self):
ret = {
"functor": self.functor,
"tids": self.tids,
"count": self.count
}
return ret

dip_src/valency/k_utils.py Normal file
@@ -0,0 +1,367 @@
import os
import pickle
import nltk
import random
from time import time
import string
from polyglot.text import Word
import logging
log = logging.getLogger(__name__)
sno = nltk.stem.SnowballStemmer("english")
def dict_safe_key(dic, key):
    # Always returns a list:
    # a single element is wrapped in a list;
    # a missing key (or a None dict) yields an empty list.
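    # e.g. dict_safe_key({"a": 1}, "a") -> [1]; dict_safe_key({}, "a") -> []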
if (
dic is None or
key not in dic
):
return []
subdic = dic[key]
if not isinstance(subdic, list):
return [subdic]
return subdic
def pickle_dump(data, path):
with open(path, "wb") as file:
pickle.dump(data, file)
log.info("Dumped data to {}.".format(path))
return True
def pickle_load(path):
ret = None
if os.path.isfile(path):
with open(path, "rb") as file:
ret = pickle.load(file)
log.info("Loaded data from {}.".format(path))
return ret # Returns None in case of failure.
# Bucket sort for alphabetically sorting Slovenian words.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
alphabet = "abcčdefghijklmnoprsštuvzž"
return {letter: (idx + 1) for idx, letter in enumerate(alphabet)}
slo_bucket_sort_alphabet = gen_sbs_alphabet()
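# slo_bucket_sort is an LSD radix sort: stable bucket passes run from the
# last character towards the first; characters outside the alphabet (and
# positions past a word's end) fall into bucket 0.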
def slo_bucket_sort(words, key=None):
if key is None:
def key(x):
return x
def alph_score(word, idx):
kword = key(word)
if idx >= len(kword):
return 0
return slo_bucket_sort_alphabet.get(kword[idx]) or 0
def list_to_bins(words, idx):
bins = [[] for i in range(len(slo_bucket_sort_alphabet.keys()) + 1)]
for word in words:
bins[alph_score(word, idx)].append(word)
return bins
def bins_to_list(bins):
lst = []
for b in bins:
for el in b:
lst.append(el)
return lst
maxLen = 0
for w in words:
if len(key(w)) > maxLen:
maxLen = len(key(w))
maxIdx = maxLen - 1
for idx in range(maxIdx, -1, -1):
bins = list_to_bins(words, idx)
words = bins_to_list(bins)
"""
print(idx)
def get_letter(idx, word):
kword = key(word)
if idx < len(kword):
return(kword[idx])
return "#"
print([(word, get_letter(idx, word)) for word in words])
"""
return words
# Bucket sort <<<<<<<<<<<<<<<<<<<<
def stem_slo(x):
    # Simplified stemming:
    # drop the last morpheme (polyglot's morphological split).
w = Word(x, language="sl").morphemes
ret = "".join(w[:-1])
return ret
def stem_eng(x):
return sno.stem(x)
def tokenize(sentence, min_token_len=3, stem=None):
# input: sentence string
# output: list of token strings
if stem is None:
def stem(x):
return x
all_tokens = []
sent_txt = nltk.sent_tokenize(sentence)
for sent in sent_txt:
tokens = nltk.word_tokenize(sent)
all_tokens.extend(tokens)
res = []
for x in all_tokens:
if x in string.punctuation:
continue
stemmed = stem(x.lower())
if len(stemmed) >= min_token_len:
res.append(stemmed)
return res
def tokenize_multiple(str_list, min_token_len=3, stem=None):
# tstart = time()
res = []
for sentence in str_list:
res.extend(tokenize(sentence, min_token_len, stem))
# log.debug("tokenize_multiple: {:.2f}s".format(time() - tstart))
return res
def t_tokenize():
teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
print(teststring)
    res = tokenize(teststring, min_token_len=1)  # None would break the length check
print(res)
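# permute_paths enumerates every path through list2d: the cartesian product
# of row indices, one (row, column) pair per row.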
def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):
# python stuff
if x is None:
x = -1
if paths is None:
paths = []
if current_path is None:
current_path = []
if x >= len(list2d) - 1:
paths.append(current_path)
return paths
for i in range(len(list2d[x + 1])):
tmp_path = current_path + [(x + 1, i)]
        # Computational complexity problem (prune long lists)
# len == 12 -> 30%, len == 5 -> 100%
# if random.randint(0, 100) <= (100 - 10 * (len(list2d) - 5)):
if True:
paths = permute_paths(
list2d,
x + 1,
i,
paths,
tmp_path
)
return paths
def t_permute_paths():
list2d = [
["Greta"],
["backflips"],
["through", "around"],
["North Korea", "kindergarten"],
["with", "without"],
["a"],
["bag of", "abundance of"],
["bolts", "janitors"]
]
print(list2d)
paths = permute_paths(list2d=list2d)
for path in paths:
print([list2d[p[0]][p[1]] for p in path])
def find_overlaps(list_a, list_b):
# Input: two lists.
# Output: lists of overlapping elements.
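    # Method: index element positions in both lists, then extend each
    # matching pair forwards and backwards into a maximal common run.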
dict_a = {}
dict_b = {}
lists = [list_a, list_b]
dicts = [dict_a, dict_b]
for lidx in range(len(lists)):
for elidx in range(len(lists[lidx])):
el = lists[lidx][elidx]
if el not in dicts[lidx]:
dicts[lidx][el] = []
dicts[lidx][el].append(elidx)
substrings = []
sda = sorted(dict_a.keys())
sdb = sorted(dict_b.keys())
i_sda = 0
i_sdb = 0
while ((i_sda < len(sda) and i_sdb < len(sdb))):
if sda[i_sda] == sdb[i_sdb]:
lia = dict_a[sda[i_sda]]
lib = dict_b[sdb[i_sdb]]
for llia in lia:
for llib in lib:
tmp_substr = []
ii = 0
while (
(llia + ii < len(list_a)) and
(llib + ii < len(list_b)) and
(list_a[llia + ii] == list_b[llib + ii])
):
tmp_substr.append(list_a[llia + ii])
ii += 1
ii = 1
while (
(llia - ii >= 0) and
(llib - ii >= 0) and
(list_a[llia - ii] == list_b[llib - ii])
):
tmp_substr.insert(0, list_a[llia - ii])
ii += 1
substrings.append(tmp_substr)
if sda[i_sda] < sdb[i_sdb]:
i_sda += 1
else:
i_sdb += 1
uniques = set()
res = []
for ss in substrings:
if str(ss) not in uniques:
uniques.add(str(ss))
res.append(ss)
return res
def find_overlaps_str(tokens_a, tokens_b):
# Strings only.
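    # Intersects n-grams (n = 1..4) of both token lists; longer overlaps
    # suppress any shorter overlap they already contain.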
overlaps = []
for N in range(1, 5):
ngrams_a = []
for i in range(len(tokens_a)):
if i + N <= len(tokens_a):
ngrams_a.append(tuple(tokens_a[i:i + N]))
ngrams_b = []
for i in range(len(tokens_b)):
if i + N <= len(tokens_b):
ngrams_b.append(tuple(tokens_b[i:i + N]))
overlaps.extend(list(set(ngrams_a).intersection(set(ngrams_b))))
res = []
for ovl in sorted(overlaps, key=lambda x: len(x), reverse=True):
oovl = " ".join(ovl)
for r in res:
if oovl in r:
break
else:
res.append(oovl)
res[:] = [x.split(" ") for x in res]
return res
def t_find_overlaps():
res = []
input_len = [10, 100, 1000, 10000]
for ll in input_len:
alen = ll + int(ll * random.uniform(0.8, 1))
blen = ll + int(ll * random.uniform(0.8, 1))
a = [random.randint(0, 100) for x in range(alen)]
b = [random.randint(0, 100) for x in range(blen)]
tstart = time()
find_overlaps(a, b)
res.append({
"time": time() - tstart,
"input_size": ll
})
"""
list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5]
list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5]
res = find_overlaps(list_a, list_b)
"""
for r in res:
print(r)
def t1_find_overlaps():
t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
t2 = "this is a seconde sentence. I hope my stuff works."
print(t1)
print(t2)
res = find_overlaps(tokenize(t1), tokenize(t2))
for r in res:
print(r)
print()
res = find_overlaps_str(tokenize(t1), tokenize(t2))
for r in res:
print(r)
def t_find_overlaps_str():
t1 = [
'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',
'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
'ulico', 'zapomnil', 'hišno', 'številko'
]
t2 = [
'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'
]
res = find_overlaps(t1, t2)
print(res)
def t_slo_bucket_sort():
a1 = []
a2 = []
with open("./tests/m_besede2.txt") as f:
for line in f:
a1.append(line.split("\n")[0])
a2.append((line.split("\n")[0], random.randint(0, 9)))
a1 = slo_bucket_sort(a1)
a2 = slo_bucket_sort(a2, key=lambda x: x[0])
check = True
for i in range(len(a1)):
check &= (a1[i] == a2[i][0])
print("{:<10}{:>10}".format(str(a1[i]), str(a2[i])))
print(check)
def t1_slo_bucket_sort():
words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
words.append("test space")
words.append("test srrrr")
words.append("test saaa")
for w in slo_bucket_sort(words):
print(w)
if __name__ == "__main__":
# t_find_overlaps()
# t1_find_overlaps()
# t_tokenize()
# t_find_overlaps_str()
t1_slo_bucket_sort()

dip_src/valency/mongo_tools.py Normal file
@@ -0,0 +1,247 @@
import pymongo
import xmltodict
import xml.etree.ElementTree as ET
from time import time
import json
from valency.sskj_scraper import SskjScraper
from bs4 import BeautifulSoup
# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)
def mongo_test():
client = pymongo.MongoClient(
"mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
)
db = client.texts
coll = db.test
print(coll.find_one())
def basic_connection(ip_addr=None, port=None):
if ip_addr is None:
ip_addr = "127.0.0.1"
if port is None:
port = 26644
client = pymongo.MongoClient(
"mongodb://{}:{}@{}:{}/texts".format(
"kristjan", "simple567", ip_addr, str(port))
)
err_msg = "OK"
try:
client.server_info()
except pymongo.errors.ServerSelectionTimeoutError as err:
err_msg = err
return (None, err_msg)
db = client.texts
return (db, err_msg)
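# NOTE: basic_connection() returns a (db, err_msg) tuple; on failure db is
# None and callers must unpack accordingly.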
def check_collections(db, coll_names):
collections = db.collection_names()
for cn in coll_names:
if cn not in collections:
db.create_collection(cn)
def prepare_user_tokens(db):
CNAME = "v2_user_tokens"
db[CNAME].drop()
db.create_collection(CNAME)
    EXPIRE = 151200  # 42 hours
# EXPIRE = 10 # 10 sec
db[CNAME].ensure_index("date", expireAfterSeconds=EXPIRE)
    # use this: utc_timestamp = datetime.datetime.utcnow()
# user_tokens.insert({
# '_id': 'utc_session', "date": utc_timestamp,
# "session": "test session"})
def sskj_to_mongo(sskj_path):
# Deprecated, use sskj2_to_mongo
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
ts = time()
sskj = ET.parse(sskj_path).getroot()
    db, err_msg = basic_connection()
col_names = ["sskj"]
for cn in col_names:
if cn in db.collection_names():
db[cn].drop()
text = sskj.find("tei:text", ns)
body = text.find("tei:body", ns)
n_ent = 0
for entry in body.findall("tei:entry", ns):
n_ent += 1
tmpstr = ET.tostring(entry)
datachunk = xmltodict.parse(tmpstr)
dictchunk = json.loads(json.dumps(datachunk))
"""
pp = pprint.PrettyPrinter()
pp.pprint(dictchunk)
"""
db.sskj.insert(dictchunk)
# iskanje: db.sskj.find({'ns0:entry.ns0:form.ns0:orth':"kaplanček"})
print("sskj to mongo: {} entries in {:.2f}s".format(n_ent, time() - ts))
def slownet_to_mongo(slw_path):
    # db.slownet holds the parsed .xml database;
    # a top-level field "slo_lemma" is added for faster querying
ts = time()
slownet = ET.parse(slw_path).getroot()
    db, err_msg = basic_connection()
col_names = ["slownet_map", "slownet"]
for cn in col_names:
if cn in db.collection_names():
db[cn].drop()
slo_to_id = {}
for synset in slownet.findall("SYNSET"):
tmpstr = ET.tostring(synset)
datachunk = xmltodict.parse(tmpstr)
dictchunk = json.loads(json.dumps(datachunk))
dictchunk = dictchunk["SYNSET"]
        # pp.pprint(dictchunk)
        # collect Slovenian lemmas into the top-level "slo_lemma" field
if "SYNONYM" in dictchunk:
synonyms = dictchunk["SYNONYM"]
if not isinstance(synonyms, list):
synonyms = [synonyms]
for syn in synonyms:
if syn["@xml:lang"] == "sl":
if "LITERAL" in syn:
literals = syn["LITERAL"]
if not isinstance(literals, list):
literals = [literals]
for lit in literals:
slo_keyword = lit["#text"]
if "." in slo_keyword:
continue
if "slo_lemma" not in dictchunk:
dictchunk["slo_lemma"] = []
dictchunk["slo_lemma"].append(slo_keyword)
db.slownet.insert(dictchunk)
# pp.pprint(slo_to_id)
db.slownet.ensure_index([("id", pymongo.ASCENDING)])
db.slo_to_id.insert(slo_to_id)
print("sloWNet to mongo in {:.2f}s".format(time() - ts))
def scrape_sskj():
# Deprecated!
# Walk through keys in slo_to_id and scrape sskj data.
client = pymongo.MongoClient(
"mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
)
db = client.texts
words_list = sorted(db.slo_to_id.find_one())
print(len(words_list))
sscraper = SskjScraper()
last_word = "nogometaš"
db.scraped_sskj.remove({"word": last_word})
lock = True
for word in words_list:
if word == last_word:
lock = False
if not lock:
res = sscraper.scrape(word)
if len(res) > 0:
db.scraped_sskj.insert({"word": word, "bag": res})
def sskj2_to_mongo(sskj2_path):
tstart = time()
    db, err_msg = basic_connection()
col_names = ["sskj2"]
for cn in col_names:
if cn in db.collection_names():
db[cn].drop()
with open(sskj2_path) as f:
soup = BeautifulSoup(f.read(), "html.parser")
divs = soup.find_all("div")
for i, div in enumerate(divs):
        if i % 100 == 0:
print("{}/{}".format(i, len(divs)))
datachunk = xmltodict.parse(str(div))
datachunk = datachunk["div"]
        # pos ("besedna vrsta" = part of speech)
pos_keywords = {
"samostalnik": 0,
"pridevnik": 0,
"glagol": 0,
"prislov": 0,
"predlog": 0,
"členek": 0,
"veznik": 0,
"medmet": 0,
"povedkovnik": 0
}
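        # count POS keyword hits in span attributes; the most frequent wins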
for span in div.find_all("span"):
attrs = [e for k, e in span.attrs.items()]
for attr in attrs:
for ak in attr.split(" "):
akl = ak.lower()
if akl in pos_keywords:
pos_keywords[akl] += 1
pos = "unknonw"
pos_max = 0
for k, e in pos_keywords.items():
if e > pos_max:
pos = k
pos_max = e
datachunk["pos"] = pos
        # izt_clean: normalized headword ("iztočnica")
izts = div.find_all("span", {"title": "Iztočnica"})
if len(izts) == 0:
print("Entry {} has no Iztočnica.".format(i))
continue
izt = ((izts[0].text).translate(transtab)).lower()
ispl = izt.split(" ")
has_se = False
if len(ispl) and ispl[-1] == "se":
izt = " ".join(ispl[:-1])
has_se = True
datachunk["izt_clean"] = izt
datachunk["has_se"] = has_se
dictchunk = json.loads(json.dumps(datachunk))
db.sskj.insert(dictchunk)
db.sskj.create_index([("izt_clean", pymongo.TEXT)])
print("sskj2 to mongo: {} entries in {:.2f}s".format(
len(divs), time() - tstart))
return None
if __name__ == "__main__":
# slownet_path = "../../data/slownet/slownet-2015-05-07.xml"
# slownet_to_mongo(slownet_path)
# scrape_sskj()
# sskj_path = "../../data/sskj/sskj.p5.xml"
# sskj_to_mongo(sskj_path)
    # use the first file for testing; the original file takes up most of the RAM
# sskj2_path = "../../data/sskj/sskj2_200.html"
# sskj2_path = "../../data/sskj/sskj2_v1.html"
# sskj2_to_mongo(sskj2_path)
print("nothing here")

dip_src/valency/reduce_functions.py Normal file
@@ -0,0 +1,239 @@
# Reduction function for frames.
# Input: list of Frame objects, output: list of Frame objects.
# App uses reduce_0, 1 and 5
from valency.frame import Frame, Slot
from copy import deepcopy as DC
import logging
log = logging.getLogger(__name__)
SENSE_UNDEFINED = "nedefinirano"
def sorted_by_len_tids(frames):
return sorted(
frames,
key=lambda x: len(x.tids),
reverse=True
)
def reduce_0(frames, vallex=None):
    # Frames sorted by their functor list; essentially reduce_1,
    # except each sentence gets its own frame.
    r1_frames = reduce_1(frames)
r1_frames = reduce_1(frames)
sorting_strings = []
separated_frames = []
for frame in r1_frames:
for tid in frame.tids:
tmp_frame = DC(frame)
tmp_frame.tids = [tid]
separated_frames.append(tmp_frame)
sorting_strings.append("".join(
[slot.functor for slot in tmp_frame.slots]
))
permutation = [x for _, x in sorted(
zip(sorting_strings, range(len(sorting_strings))))]
sorted_sep_frames = [separated_frames[i] for i in permutation]
return sorted_sep_frames
def reduce_1(frames, vallex=None):
# Combine frames with the same set of functors.
# The order of functors is not important.
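    # e.g. frames with slot functors [ACT, PAT] and [PAT, ACT] merge into one.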
frame_sets = [] # [set of functors, list of frames]
for frame in frames:
functors = [slot.functor for slot in frame.slots]
for fs in frame_sets:
if set(functors) == set(fs[0]):
fs[1].append(frame)
break
        else:
            # Python for/else: runs only if the inner loop ended without break.
            frame_sets.append([functors, [frame]])
ret_frames = []
for fs in frame_sets:
tids = []
slots = {}
# All possible slots in this frame.
for functor in fs[0]:
slots[functor] = Slot(functor=functor)
# Reduce slots from all frames. (Merge ACT from all frames, ...)
for frame in fs[1]:
tids += frame.tids
for sl in frame.slots:
slots[sl.functor].tids += sl.tids
slots_list = []
for k, e in slots.items():
slots_list.append(e)
rf = Frame(tids=tids, slots=slots_list)
rf.sort_slots()
ret_frames.append(rf)
return sorted_by_len_tids(ret_frames)
def reduce_3(raw_frames, vallex):
# sskj simple lesk ids
ssj_ids = [frame.tids[0] for frame in raw_frames]
db_results = list(vallex.db.sskj_simple_lesk.find(
{"ssj_id": {"$in": ssj_ids}}))
id_map = {}
for entry in db_results:
id_map.update({entry["ssj_id"]: {
"sense_id": entry.get("sense_id"),
"sense_desc": entry.get("sense_desc")
}})
return frames_from_sense_ids(raw_frames, id_map)
def reduce_4(raw_frames, vallex):
# kmeans ids
ssj_ids = [frame.tids[0] for frame in raw_frames]
db_results = list(vallex.db.kmeans.find(
{"ssj_id": {"$in": ssj_ids}}))
id_map = {}
for entry in db_results:
id_map.update({entry["ssj_id"]: {
"sense_id": entry["sense_id"]
}})
return frames_from_sense_ids(raw_frames, id_map)
def reduce_5(raw_frames, vallex):
USER_SENSE_COLL = "v2_sense_map"
headword = raw_frames[0].hw
ssj_ids_full = [frame.tids[0] for frame in raw_frames]
# v2_sense_map stores only sentence half of ssj_id
ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full]
db_results = list(vallex.db[USER_SENSE_COLL].find({
"ssj_id": {"$in": ssj_ids},
"hw": headword,
}))
id_map = {}
for entry in db_results:
id_map[entry["ssj_id"]] = entry["sense_id"]
ret_frames = frames_from_sense_ids(raw_frames, id_map)
# sort: frames with senses to top
senses_undefined = []
senses_defined = []
for frame in ret_frames:
if frame.sense_info["sense_id"] == SENSE_UNDEFINED:
senses_undefined.append(frame)
else:
senses_defined.append(frame)
ret_frames = senses_defined + senses_undefined
return ret_frames
def frames_from_sense_ids(raw_frames, id_map):
# id map = dict {
# ssj_id: sense_id
# }
# id_dict = dict {
# sense_id: [frame, ...]
# }
id_dict = {}
for frame in raw_frames:
# long version ssj_id (S123.t12)
frame_ssj_id = frame.tids[0]
frame_sense_id = id_map.get(frame_ssj_id)
if frame_sense_id is None:
# try short version ssj_id (S123)
frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1])
frame_sense_id = id_map.get(frame_ssj_id)
# set default if sense_id not found
if frame_sense_id is None:
frame_sense_id = SENSE_UNDEFINED
"""
sense_id = id_map.get(frame.tids[0])
if sense_id is not None:
sense_id = sense_id.get("sense_id")
else:
sense_id = "nedefinirano"
"""
if frame_sense_id not in id_dict:
id_dict[frame_sense_id] = []
id_dict[frame_sense_id].append(DC(frame))
ret_frames = []
for sense_id, frames in id_dict.items():
tids = []
reduced_slots = []
for frame in frames:
tids.extend(frame.tids)
for slot in frame.slots:
# if functor not in reduced slots,
# add new slot; else increase count
for rslot in reduced_slots:
if slot.functor == rslot.functor:
rslot.count += 1
rslot.tids.extend(slot.tids)
break
else:
# in case for loop didn't match a slot
reduced_slots.append(Slot(
functor=slot.functor,
tids=slot.tids,
count=1
))
reduced_frame = Frame(tids, slots=reduced_slots)
id_map_entry = (
id_map.get(tids[0]) or
id_map.get(".".join(tids[0].split(".")[:-1]))
)
if id_map_entry is None:
reduced_frame.sense_info = {
"sense_id": SENSE_UNDEFINED,
}
else:
reduced_frame.sense_info = {
"sense_id": id_map_entry
}
reduced_frame.sort_slots()
ret_frames.append(reduced_frame)
return ret_frames
reduce_functions = {
"reduce_0": {
"f": reduce_0,
"desc":
"Vsaka pojavitev glagola dobi svoj stavčni vzorec.",
"simple_name": "posamezni stavki"
},
"reduce_1": {
"f": reduce_1,
"desc":
"Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog.",
"simple_name": "združeni stavki"
},
"reduce_3": {
"f": reduce_3,
"desc":
"Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. "
"Pomeni so dodeljeni s pomočjo algoritma Simple Lesk.",
"simple_name": "SSKJ_pomeni"
},
"reduce_4": {
"f": reduce_4,
"desc":
"Združevanje stavčnih vzorcev na osnovi pomenov povedi "
"s pomočjo algoritma K-Means. Število predvidenih pomenov "
"podano na osnovi SSKJ.",
"simple_name": "KMeans_pomeni"
},
"reduce_5": {
"f": reduce_5,
"desc":
"Uporabniško dodeljeni pomeni povedi.",
"simple_name": "po meri"
}
}

dip_src/valency/seqparser/seqparser.py Normal file
@@ -0,0 +1,284 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC
# Match sense ordinals (1., 2., ...).
rord = re.compile(r"^ *[0-9]+\. *$")
# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)
class Seqparser:
def __init__(self):
pass
# main functions
def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
print("entries len: " + str(len(entries)))
with open(raw_pickle_filepath, "wb") as f:
tmpstr = json.dumps(dict(entries))
pickle.dump(tmpstr, f)
# debugging
def raw_pickle_to_parsed_pickle(
self, raw_pickle_filepath, parsed_pickle_filepath,
se_list_filepath
):
data = self.load_raw_pickle(raw_pickle_filepath)
print("raw_pickle data len: " + str(len(data)))
se_list = self.gen_se_list(data)
print("se_list len: " + str(len(se_list)))
with open(se_list_filepath, "wb") as f:
pickle.dump(se_list, f)
data1 = self.remove_se(data)
data2 = self.reorganize(data1, se_list)
print("data2 len: " + str(len(data2.keys())))
with open(parsed_pickle_filepath, "wb") as f:
pickle.dump(data2, f)
# helper html reading functions
def parse_file(self, path, f_parse_line):
tstart = time()
entries = defaultdict(list)
with open(path, "r") as f:
for line in f:
data = f_parse_line(line)
if data is not None:
entries[data["izt_clean"]].append(data)
print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
return entries
def parse_line(self, line):
def helper_bv_set(g_or_p):
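            # bv ("besedna vrsta") = part of speech:
            # "G" = verb (glagol), "P" = adjective (pridevnik)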
if g_or_p not in ["G", "P"]:
print("Err g_or_p.")
exit(1)
if data.get("bv") is not None:
if data["bv"] != g_or_p:
print(str(line))
# exit(1)
data["bv"] = g_or_p
data = {
"izt": "",
"izt_clean": "",
"senses": defaultdict(list)
}
soup = BS(line, "html.parser")
current_sense_id = "0"
for span in soup.find_all("span"):
# sense id
if span.string is not None:
rmatch = rord.match(span.string)
if rmatch is not None:
current_sense_id = rmatch.group().strip()
title = span.attrs.get("title")
if title is not None:
title = title.lower()
# only verbs and adjectives
if "glagol" in title:
helper_bv_set("G")
data["bv_full"] = title
elif "pridevn" in title:
helper_bv_set("P")
data["bv_full"] = title
# žšč
if title == "iztočnica":
data["izt"] = span.string
data["izt_clean"] = span.string.translate(transtab).lower()
# sense description
if title == "razlaga" and span.string is not None:
data["senses"][current_sense_id].append(
("razl", span.string))
if "pridevnik od" in span.string:
helper_bv_set("P")
if title == "sopomenka":
subspan = span.find_all("a")[0]
if subspan.string is not None:
data["senses"][current_sense_id].append(
("sopo", subspan.string))
        # keep only verbs and adjectives
if (
("bv" not in data) or
(data["bv"] != "P" and data["bv"] != "G")
):
return None
# sanity check
if data["bv"] == "P" and " se" in data["izt_clean"]:
print(data)
exit(1)
# append _ to adjective keywords
if data["bv"] == "P":
data["izt_clean"] = data["izt_clean"] + "_"
# cleanup
if "bv" not in data:
print("Should not be here (no bv).")
exit(1)
del(data["bv"])
if "bv_full" in data:
del(data["bv_full"])
return data
# helper functions
def load_raw_pickle(self, raw_pickle_filepath):
with open(raw_pickle_filepath, "rb") as f:
tmpstr = pickle.load(f)
return json.loads(tmpstr)
def helper_loop(self, data, fnc):
for k, lst in data.items():
for el in lst:
fnc(el)
def gen_se_list(self, data):
def fnc1(el):
ic = el["izt_clean"]
if " se" in ic:
se_list.append(ic)
def fnc2(el):
ic = el["izt_clean"]
if ic in se_pruned:
se_pruned.remove(ic)
# hw entries that only exist with " se"
se_list = []
self.helper_loop(data, fnc1)
se_pruned = set([hw.split(" se")[0] for hw in se_list])
self.helper_loop(data, fnc2)
return sorted(list(se_pruned))
def remove_se(self, data):
def fnc1(el):
nel = DC(el)
ic = nel["izt_clean"]
if " se" in ic:
nic = ic.split(" se")[0]
nel["izt_clean"] = nic
data_new[nel["izt_clean"]].append(nel)
data_new = defaultdict(list)
self.helper_loop(data, fnc1)
return dict(data_new)
def reorganize(self, data, se_list):
# some hw entries have several headwords,
# some senses have subsenses
# index everything, make 1 object per hw
def helper_prune(sense_str):
# remove space padding
sense_str = sense_str.strip()
if len(sense_str) == 1:
return sense_str
# remove banned characters from string ending
banned = ": ; . , - ! ?".split(" ")
if sense_str[-1] in banned:
return sense_str[:-1]
return sense_str
data_new = {}
for k, lst in data.items():
new_el = {
"hw": k,
"has_se": k in se_list,
"senses": []
}
            # a single hw entry gets homonym_id 0; multiple entries start at 1
if len(lst) == 1:
homonym_id = -1
else:
homonym_id = 0
# loop homonyms
for el in lst:
homonym_id += 1
# loop top lvl sense ids
for sense_id, sens_lst in el["senses"].items():
# loop subsenses
for i, sens in enumerate(sens_lst):
nsid = sense_id.split(".")[0]
if len(sens_lst) == 1:
nsid += "-0"
else:
nsid += ("-" + str(i + 1))
new_sense = {
"homonym_id": homonym_id,
# sense_id: sense_id-subsense_id
"sense_id": nsid,
"sense_type": sens[0],
"sense_desc": helper_prune(sens[1]),
}
new_el["senses"].append(new_sense)
hw = new_el["hw"]
if hw in data_new:
print("Shouldn't be here.")
print(new_el)
exit(1)
data_new[hw] = DC(new_el)
# return data_new
# check
for hw, el in data_new.items():
for sens in el["senses"]:
if sens["sense_desc"] is None:
print(sens)
return data_new
def plst(lst):
for el in lst:
print(el)
if __name__ == "__main__":
datapath = "../../../data"
html_filepath = datapath + "/sskj/sskj2_v1.html"
raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"
p = Seqparser()
if True:
print("html_to_raw_pickle({}, {})".format(
html_filepath, raw_pickle_filepath))
print("Big file, this might take a while (2 min).")
tstart = time()
p.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
print("Finished in {:.2f}.".format(time() - tstart))
if True:
print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
tstart = time()
p.raw_pickle_to_parsed_pickle(
raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
print("Finished in {:.2f}.".format(time() - tstart))
print("Done.")

dip_src/valency/ssj_struct.py Normal file
@@ -0,0 +1,218 @@
import xml.etree.ElementTree as ET
from copy import deepcopy as DC
from time import time
import re
import logging
import sys
import pickle
log = logging.getLogger(__name__)
ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace")
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"
# |$ for a default empty match
re_int = re.compile(r"t\d+|$")
# For sorting a "s" section in ssj; returns key as integer.
# example: "S123.t34" --> 34
def re_lmbd(el):
s = re_int.findall(el)[0]
if len(s) == 0:
return 0
else:
return int(s[1:])
class SsjEntry:
def __init__(self, ssj_id, s, deep_links):
# See ssj xml structure.
self.id = ssj_id
self.s = DC(s)
self.deep_links = DC(deep_links)
class SsjDict:
def __init__(self):
self.entries = {}
"""
def read_xml(self, filepath):
# No data loss.
log.info("SsjDict.read_xml({})".format(filepath))
t_start = time()
tree = ET.parse(filepath)
root = tree.getroot()
stats = {
"skipped": [],
"duplicates": []
}
for s in root.iter("s"):
s_id = s.attrib[XML_ID]
tokens = {}
for token in s:
if token.tag == "linkGrp":
continue
if token.tag == "w":
tokens[token.attrib[XML_ID]] = {
"msd": token.attrib["msd"],
"lemma": token.attrib["lemma"],
"word": token.text
}
elif token.tag == "c":
tokens[token.attrib[XML_ID]] = {
"word": token.text
}
else:
# <S />
pass
linkGrps = s.findall("linkGrp")
if len(linkGrps) < 2:
# Take only entries with both deep and shallow
# syntactic annotation
stats["skipped"].append(s_id)
continue
linkG = {}
for el in linkGrps:
if el.attrib["type"] == "dep":
linkG["dep"] = el
elif el.attrib["type"] == "SRL":
linkG["SRL"] = el
else:
raise KeyError("Unknown linkGrp.")
if s_id in self.entries:
stats["duplicates"].append(s_id)
self.entries[s_id] = SsjEntry(
s_id,
s.attrib["n"],
tokens,
create_edge_dict(linkG["dep"]),
create_edge_dict(linkG["SRL"])
)
t_end = time()
log.info("Time: {}s.".format(t_end - t_start))
log.info(
"{} duplicates, skipped {} elements (missing linkGrp).".format(
len(stats["duplicates"]), len(stats["skipped"]))
)
"""
def read_xml_v2(self, filepath):
NS_DICT = {
"tei": "http://www.tei-c.org/ns/1.0",
"xml": "http://www.w3.org/XML/1998/namespace",
}
def ns_prefix(ns):
return "{" + NS_DICT[ns] + "}"
def helper_get_sentence(tree_s):
# all w and pc elements
ret = []
for el in tree_s.iter():
if (
el.tag == ns_prefix("tei") + "w" or
el.tag == ns_prefix("tei") + "pc"
):
ret.append(el)
return ret
def helper_get_functor_links(tree_s):
# links for SRL linkGrp
lg = None
for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT):
if linkGrp.attrib["type"] == "SRL":
lg = linkGrp
break
else:
return []
ret = []
for link in lg:
ret.append(link)
return ret
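        # Each SRL link looks like (values hypothetical):
        #   <link ana="srl:ACT" target="#S123.t1 #S123.t4"/>
        # target = "#from #to", ana = "srl:FUNCTOR".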
def helper_gen_deep_links(link_list):
deep_links = []
for link in link_list:
deep_links.append({
"from": link.attrib["target"].split(" ")[0][1:],
"to": link.attrib["target"].split(" ")[1][1:],
"functor": link.attrib["ana"].split(":")[1]
})
return deep_links
log.info("SsjDict.read_xml({})".format(filepath))
t_start = time()
stats = {
"total_count": 0,
"deep_roles_count": 0,
"duplicated_sid": 0,
}
tree = ET.parse(filepath)
root = tree.getroot()
for s in root.findall(".//tei:s", NS_DICT):
stats["total_count"] += 1
s_id = s.attrib[ns_prefix("xml") + "id"]
# get_functors (deep semantic roles)
functor_links = helper_get_functor_links(s)
if len(functor_links) == 0:
continue
stats["deep_roles_count"] += 1
# get_sentence
tokens = {}
for token in helper_get_sentence(s):
tid = token.attrib[ns_prefix("xml") + "id"]
if token.tag == ns_prefix("tei") + "w":
tokens[tid] = {
"msd": token.attrib["ana"].split(":")[1],
"lemma": token.attrib["lemma"],
"word": token.text
}
elif token.tag == ns_prefix("tei") + "pc":
tokens[tid] = {
"word": token.text
}
else:
log.warning("Unrecognized sentence element: " + token.tag)
exit(1)
if s_id in self.entries:
log.warning("duplicated sentence: " + s_id)
stats["duplicated_sid"] += 1
continue
self.entries[s_id] = SsjEntry(
s_id,
tokens,
helper_gen_deep_links(functor_links)
)
t_end = time()
log.info("Time: {}s.".format(t_end - t_start))
log.info(str(stats))
if __name__ == "__main__":
# testing
log.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
log.addHandler(ch)
# Load
fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
ssj = SsjDict()
ssj.read_xml_v2(fpath)
with open("ssj_test.pickle", "wb") as file:
pickle.dump(ssj, file)

dip_src/valency/sskj_scraper.py Normal file
@@ -0,0 +1,47 @@
# Deprecated!
import requests
from bs4 import BeautifulSoup
from time import time
from valency import k_utils
SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="
class SskjScraper:
def __init__(self):
self.base_url = SSKJ_BASE
def scrape(self, word):
# returns unique set of words
soup = BeautifulSoup(
requests.get(self.base_url + word).content,
"html.parser"
)
# Check for failure.
h2 = soup.find_all("h2")
if len(h2) >= 2:
# <h2>Zadetkov ni bilo: ...</h2>
return []
li_elements = soup.find_all('li', class_="nounderline")
if len(li_elements) == 0:
return []
li = li_elements[0]
        # It was horrible...
        # <li> ... <li> ... <li> ...</li></li></li>
        # Parse the sequence until you find a second <li>.
txts = []
for el in li.find_all():
if el.name == "li":
break
txts.append(el.get_text())
print("sskj scraped {}.".format(word))
        # txts is a list of strings, so use tokenize_multiple
        return k_utils.tokenize_multiple(txts)
if __name__ == "__main__":
sskjScr = SskjScraper()
word = "tek"
tp = sskjScr.scrape("čaj")
print(tp)

@@ -0,0 +1,40 @@
from valency.val_struct import *
from valency.ssj_struct import *
from valency import k_utils
from valency.lesk import Lesk
vallex_path = "../../data/vallex.xml"
vallex = k_utils.pickle_load(vallex_path)
if vallex is None:
ssj_path = "../../data/anno_final.conll.xml"
# ssj_path = "../../data/ssj500kv1_1-SRL_500_stavkov_2017-04-11.xml"
ssj = k_utils.pickle_load(ssj_path)
if ssj is None:
ssj = SsjDict()
        ssj.read_xml_v2(ssj_path)  # read_xml is disabled in ssj_struct.py
k_utils.pickle_dump(ssj, ssj_path)
vallex = Vallex()
vallex.read_ssj(ssj)
k_utils.pickle_dump(vallex, vallex_path)
vallex.process_after_read(False, False)
random_frame = None
lesk = Lesk()
successes = 0
for k, e in vallex.entries.items():
for rf in e.raw_frames:
random_frame = rf
break
print(rf.to_string())
print(vallex.get_token(random_frame.tids[0]))
print(vallex.get_sentence(random_frame.tids[0]))
tid = random_frame.tids[0]
token = vallex.get_token(tid)
context = vallex.get_context(tid)
sense = lesk.lesk(token, context)
if sense is not None:
successes += 1
if successes >= 10:
break

dip_src/valency/val_struct.py Normal file
@@ -0,0 +1,706 @@
from time import time
from copy import deepcopy as DC
from valency.frame import Frame
from valency.reduce_functions import *
from valency.lesk import *
from valency import mongo_tools
from valency import k_utils  # used by gen_sorted_words and test_context1
import random
import logging
from valency.evaluation import Evaluation
from valency.dictionary_interface import SloWnet, Sskj2
from valency.leskFour import LeskFour
from valency.k_kmeans import KmeansClass
from valency import k_kmeans  # the module itself is used in process_kmeans
from valency.ssj_struct import SsjDict, SsjEntry
from valency.seqparser.seqparser import Seqparser
import pickle
import sys
import hashlib
log = logging.getLogger(__name__)
def split_id(myid):
tmp = myid.split(".")
sid = ".".join(tmp[:-1])
tid = tmp[-1]
return (sid, tid)
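# e.g. split_id("S123.t1") -> ("S123", "t1")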
class ValEntry():
def __init__(self, hw, frame):
self.hw = hw
self.raw_frames = [frame]
self.has_senses = False
class Vallex():
# Main class
def __init__(self):
# database
self.db, err_msg = mongo_tools.basic_connection("127.0.0.1", 26633)
if self.db is None:
log.error((
"Database not connected:"
"{}".format(err_msg)
))
exit(1)
mongo_tools.check_collections(self.db, [
"v2_users", "v2_senses", "v2_sense_map", "v2_user_tokens"
])
mongo_tools.prepare_user_tokens(self.db)
# these 3 might be obsolete for the web app (used for ML)
self.db_senses_map = self.db.senses_map3
self.slownet_interface = SloWnet(self)
self.sskj_interface = Sskj2(self)
# self.tokens["s0][t0"] = {word, lemma, msd, ...}
self.tokens = {}
# key = verb / adjective headword
self.entries = {}
# For alphabetical indexing in web app.
self.sorted_words = {}
# words = { first_letter: [hw1, hw2, ... sorted] }
self.functors_index = {}
self.has_se = [] # list of verbs with "se" ("bati se")
# Used for ML (deprecated).
self.leskFour = LeskFour(self)
self.kmeans = KmeansClass(self)
self.evaluation = Evaluation(self)
self.test_samples = []
# run self.process_after_read() after initiating Vallex
def read_ssj(self, ssj):
# ssj: object generated with ssj_strict.py.
BANNED_HW = ["biti"]
stats = {
"P_count": 0,
"skipped": 0,
}
log.info("Vallex.read_ssj({}).".format(
ssj
))
t_start = time()
for ssj_id, entry in ssj.entries.items():
# Read tokens
skip_entry = False
tmp_tokens = {}
for ssj_tid, token in entry.s.items():
sid, tid = split_id(ssj_tid)
# safety checks
if tid != "t" and not tid[1:].isdigit():
log.warning("dropping SID={} - corrupted keys".format(k))
skip_entry = True
break
if tid in tmp_tokens:
log.error(
"Vallex.read_ssj(): Duplicated ssj_tid:" + ssj_tid)
exit(1)
tmp_tokens[tid] = DC(token)
if skip_entry:
continue # skip corrupted keys
if sid in self.tokens:
log.error("sid duplicate: " + sid)
exit(1)
self.tokens[sid] = DC(tmp_tokens)
# Read frame data (each deep link gets its own raw frame).
link_map = {}
# hw_id: { hw_lemma: lemma, deep: [{functor: fnct, to: to}]}
for deep_link in entry.deep_links:
hw_id = deep_link["from"]
hw_token = self.get_token(hw_id)
hw_lemma = hw_token["lemma"]
hw_bv = hw_token["msd"][0]
if (hw_bv != "G" and hw_bv != "P"):
stats["skipped"] += 1
log.info("Skipping {}: not a verb or adjective.".format(
hw_lemma))
continue
if hw_bv == "P":
hw_lemma = hw_lemma + "_"
stats["P_count"] += 1
if hw_id in link_map:
link_map[hw_id]["deep"].append(deep_link)
else:
link_map[hw_id] = {
"hw_lemma": hw_lemma,
"deep": [deep_link]
}
for hw_id, data in link_map.items():
hw_lemma = data["hw_lemma"]
raw_frame = Frame(
hw=hw_lemma,
tids=[hw_id],
deep_links=data["deep"],
slots=None,
)
if hw_lemma not in self.entries:
self.entries[hw_lemma] = ValEntry(hw_lemma, raw_frame)
else:
self.entries[hw_lemma].raw_frames.append(raw_frame)
# cleanup banned
for hw in BANNED_HW:
if hw in self.entries:
del(self.entries[hw])
t_end = time()
log.info("Finished build_from_ssj() in {:.2}s.".format(
t_end - t_start
))
log.info("Vallex has a total of {} key entries.".format(
len(self.entries.keys())
))
log.info("Number of adjectives: {}".format(stats["P_count"]))
log.info("Number of skipped (not a verb or adjective): {}".format(
stats["skipped"]))
# Frames per hw
"""
for k, e in self.entries.items():
print(k + "," + str(len(e.raw_frames)))
"""
def get_token(self, myid):
# id = S123.t1
sid, tid = split_id(myid)
return self.tokens[sid][tid]
def get_sentence(self, myid):
sid, tid = split_id(myid)
tmp = []
sentence = ""
for k, token in self.tokens[sid].items():
if (k != "t") and (token["word"] is not None):
tmp.append((k, token))
for token in sorted(tmp, key=lambda x: int(x[0][1:])):
sentence += (token[1]["word"] + " ")
return sentence
def get_tokenized_sentence(self, myid):
sid, tid = split_id(myid)
tmp = []
sentence = []
for k, token in self.tokens[sid].items():
if k != "t":
tmp.append((k, token))
for token in sorted(tmp, key=lambda x: int(x[0][1:])):
sentence.append((".".join([sid, token[0]]), token[1]))
# return [(ssj_id, {word: _, lemma: _, msd: _}), ...]
return sentence
def process_after_read(
self, sskj_senses_pickle_path, se_list_pickle_path,
reload_sskj_senses
):
tstart = time()
# web app: index by hw
self.sorted_words = {}
self.gen_sorted_words()
# web app: index by functor
self.functors_index = {}
self.gen_functors_index()
# fill db.v2_senses
self.has_se = []
self.read_seqparser_pickles(
sskj_senses_pickle_path, se_list_pickle_path, reload_sskj_senses)
log.debug(
"vallex.process_after_read(): {:.2f}s".format(time() - tstart))
def gen_sorted_words(self):
res = {}
for hw, e in self.entries.items():
letter = hw[0].lower()
n_sent = len(e.raw_frames)
if letter not in res:
res[letter] = []
res[letter].append((hw, n_sent))
# sort and add to vallex object
self.sorted_words = {}
for letter, lst in res.items():
self.sorted_words[letter] = k_utils.slo_bucket_sort(
lst, key=lambda x: x[0])
def gen_functors_index(self):
for hw, e in self.entries.items():
for frame in e.raw_frames:
for slot in frame.slots:
if slot.functor not in self.functors_index:
self.functors_index[slot.functor] = []
self.functors_index[slot.functor].append(frame)
def read_seqparser_pickles(
self, sskj_senses_pickle_path, se_list_pickle_path,
reload_sskj_senses
):
log.info("read_seqparser_pickles()")
log.info((
"Reading list of has_se verbs from {}."
"Sskj senses into db.v2_senses from {}."
).format(se_list_pickle_path, sskj_senses_pickle_path))
AUTHOR_SSKJ = "SSKJ"
        ERR_MSG = (
            "Need to generate .pickle files first. "
            "Use: "
            "$ python3 /script/valency/seqparser/seqparser.py "
            "Input is /data/sskj_v2.html."
        )
# has_se
with open(se_list_pickle_path, "rb") as f:
self.has_se = pickle.load(f)
if self.has_se is None:
log.error(ERR_MSG)
exit(1)
self.has_se = sorted(self.has_se)
log.info("Loaded self.has_se (len: {}) from {}.".format(
len(self.has_se), se_list_pickle_path))
# sskj senses
if reload_sskj_senses:
log.info("Reloading sskj_senses.")
reply = self.db.v2_senses.remove({"author": AUTHOR_SSKJ})
log.info(reply)
query = list(self.db.v2_senses.find({"author": AUTHOR_SSKJ}))
if len(query) > 0:
log.info("Sskj senses already in database.")
return
tstart = time()
data = None
with open(sskj_senses_pickle_path, "rb") as f:
data = pickle.load(f)
if data is None:
log.error(ERR_MSG)
exit(1)
for k, e in data.items():
for sense in e["senses"]:
db_entry = {
"hw": k,
"author": AUTHOR_SSKJ,
"desc": sense["sense_desc"],
# unique id for each sense
"sense_id": "{}-{}-{}-{}-{}".format(
AUTHOR_SSKJ,
sense["homonym_id"],
sense["sense_id"],
sense["sense_type"],
hashlib.sha256(
sense["sense_desc"].encode("utf-8")
).hexdigest()[:5]
)
}
self.db.v2_senses.insert(db_entry)
# print(db_entry)
log.info("db.v2_senses prepared in {:.2f}s".format(time() - tstart))
    # The functions below can be used interactively via flask_api.
def test_dev(self):
# self.prepare_sskj_senses()
hw = "dajati"
senses = self.sskj_interface.sense_glosses(hw)
return str(senses)
def calc_senses(self):
# self.calc_all_senses(self.leskFour.lesk_nltk)
# self.calc_all_senses(self.leskFour.lesk_sl)
        # self.calc_all_senses(self.leskFour.lesk_al)  # approx. 8 h!
# self.calc_all_senses(self.leskFour.lesk_ram)
self.calc_all_senses_kmeans(self.kmeans.bisection_kmeans)
self.calc_all_senses_kmeans(self.kmeans.normal_kmeans)
return "edit val_struct.py: calc_senses()"
# deprecated functions (used for machine learning experiments)
def prepare_sskj_senses(self):
# obsolete, using read_seqparser_pickles()
log.info("prepare_sskj_senses() (db.v2_senses)")
query = list(self.db.v2_senses.find({"author": "SSKJ2"}))
if len(query) > 0:
log.info("Sskj senses already in database.")
return
tstart = time()
log.info("Iterating over {} hw entries:".format(
len(self.entries.keys())))
for hw, e in self.entries.items():
senses = self.sskj_interface.sense_glosses(hw)
if len(senses) == 0:
continue
for sense in senses:
# create sense from each description
for i, de in enumerate(sense["def"]):
sense_def = sense["def"][i]
sense_def = sense_def[0].upper() + sense_def[1:]
if sense_def[-1] == ":" or sense_def[-1] == ";":
sense_def = sense_def[:-1] + "."
data = {
"hw": hw,
"author": "SSKJ2",
"desc": sense_def,
"sskj_id": sense["sskj_sense_id"],
"sskj_desc_id": i
}
self.db.v2_senses.insert(data)
log.info("sskj_senses prepared in {:.2f}s".format(time() - tstart))
def gen_sskj_sl(self):
# Takes about an hour.
tstart = time()
log.info("Generating new sskj_simple_lesk with Simple Lesk.")
for k, e in self.entries.items():
self.gen_sskj_sl_one(e.hw)
log.debug("gen_sskj_sl in {:.2f}s".format(time() - tstart))
def gen_sskj_sl_one(self, hw, update_db=True):
entry = None
ttstart = time()
e = self.entries.get(hw)
if e is None:
return
for frame in e.raw_frames:
tid = frame.tids[0]
sentence = self.get_sentence(tid)
res = self.lesk.simple_lesk_sskj(sentence, hw)
if res is None:
log.debug("headword {} not in sskj".format(hw))
continue
key = {"ssj_id": tid}
entry = {
"headword": hw,
"ssj_id": tid, # uniqe identifier
"sense_id": res[1],
# "sense_desc": k_utils.dict_safe_key(res[2], "ns0:def"),
"sense_desc": res[2]["def"]
}
# log.debug(str(res[2]))
# log.debug(entry["sense_id"])
# log.debug(entry["sense_desc"])
if update_db:
self.db.sskj_simple_lesk.update(key, entry, upsert=True)
log.debug("[*] sskj_ids for {} in {:.2f}s".format(
hw, time() - ttstart))
def get_context(self, myid, radius=None, min_lemma_size=None):
radius = radius or 5
min_lemma_size = min_lemma_size or 4
        # returns the token plus up to `radius` neighbors on each side
sentence = self.get_sentence(myid)
sentlen = len(sentence.split(" "))
sid, tid = split_id(myid)
idx = int(tid[1:])
tokens_after = []
i = idx
while i < sentlen - 1 and len(tokens_after) < radius:
i += 1
token = self.get_token(sid + ".t" + str(i))
if (
token is not None and "lemma" in token and
len(token["lemma"]) >= min_lemma_size and
token["lemma"] != "biti"
):
tokens_after.append(token)
tokens_before = []
i = idx
while i > 1 and len(tokens_before) < radius:
i -= 1
token = self.get_token(sid + ".t" + str(i))
if (
token is not None and "lemma" in token and
len(token["lemma"]) >= min_lemma_size and
token["lemma"] != "biti"
):
tokens_before.append(token)
tokens = tokens_before + [self.get_token(myid)] + tokens_after
# find position of original token:
mid_idx = len(tokens_before)
return (mid_idx, tokens)
def get_sense_ids(self, collname, hw, sense_group=None):
query = {"headword": hw}
if sense_group is not None:
query["sense_group"] = sense_group
result = list(self.db[collname].find(query))
sense_ids = {}
for r in result:
sense_ids[r["ssj_id"]] = r["sense_id"]
return sense_ids
def t_get_context(self):
ii = 10
for k, e in self.entries.items():
for frame in e.raw_frames:
if random.randint(0, 100) > 20:
continue
ii -= 1
if ii <= 0:
return
mytid = frame.tids[0]
print()
print(mytid)
print(self.get_token(mytid))
sent = self.get_context(mytid, radius=3, min_lemma_size=4)
print("mid: {}".format(sent[0]))
for ii in range(len(sent[1])):
print("{} -> {}".format(
ii, sent[1][ii]))
def t_simple_lesk_sskj(self):
ii = 10
for k, e in self.entries.items():
if random.randint(0, 100) > 20:
continue
for frame in e.raw_frames:
if random.randint(0, 100) > 20:
continue
if ii == 0:
return
ii -= 1
print("\nTest frame: {}.".format(frame.tids))
hw_token = self.get_token(frame.tids[0])
print(hw_token)
context_sentence = self.get_sentence(frame.tids[0])
print(context_sentence)
self.lesk.simple_lesk_sskj(
context_sentence=context_sentence,
word_lemma=hw_token["lemma"]
)
def process_kmeans(self):
        # Convert words to lemmas, cluster using k-means.
# Number of clusters from sskj.
tstart = time()
log.info("Processing senses using kmeans.")
for k, e in self.entries.items():
# Frame start
ttstart = time()
lemma = e.hw
tokenized_sentences = []
for frame in e.raw_frames:
tid = frame.tids[0]
tokenized_sentences.append(self.get_tokenized_sentence(tid))
lemmatized_sentences = []
for sent in tokenized_sentences:
lemmatized = ""
for token in sent:
if "lemma" in token[1]:
lemmatized += (token[1]["lemma"] + " ")
lemmatized_sentences.append(lemmatized)
lls = len(lemmatized_sentences)
# We got the sentences
sskj_entry = self.db.sskj.find_one(
{"ns0:entry.ns0:form.ns0:orth": lemma})
if sskj_entry is None:
log.debug("headword {} has no <sense> in sskj".format(lemma))
continue
n_clusters = 1
if "ns0:sense" in sskj_entry["ns0:entry"]:
# Guess number of senses based on sskj senses.
n_clusters = len(sskj_entry["ns0:entry"]["ns0:sense"])
if lls >= n_clusters and n_clusters > 1:
labels = k_kmeans.k_means(
sentences=lemmatized_sentences,
n_clusters=n_clusters
)
kmeans_ids = [str(x) + "-" + str(lls) for x in labels]
elif n_clusters == 1:
kmeans_ids = ["1-1" for x in lemmatized_sentences]
elif lls < n_clusters:
# Each sentence gets its own sense.
kmeans_ids = []
for i in range(lls):
kmeans_ids.append(str(i + 1) + "lt" + str(n_clusters))
else:
log.error("Shouldn't be here (val_struct: process_kmeans()")
exit(1)
            # Write the sense ids of the whole frame to the database.
for i in range(len(e.raw_frames)):
tid = e.raw_frames[i].tids[0]
key = {"ssj_id": tid}
entry = {
"headword": lemma,
"ssj_id": tid, # unique idenfitier
"sense_id": kmeans_ids[i],
}
self.db.kmeans.update(key, entry, upsert=True)
log.debug("[*] kemans_ids for {} in {:.2f}s".format(
lemma, time() - ttstart))
# Frame end
log.debug("process_kmeans in {:.2f}s".format(time() - tstart))
def get_context1(
self, mytid, collname, radius=None, min_token_len=3, get_glosses=None
):
# returns {
# "hw": headword lemma and its glosses
# "context": a list of lemmas and their glosses around the hw that
# have entries in collname dictionary (if get_glosses=True)
# }
# tstart = time()
if get_glosses is None:
get_glosses = False
if radius is None:
radius = 10000
if collname == "slownet":
dictionary_interface = self.slownet_interface
elif collname == "sskj":
dictionary_interface = self.sskj_interface
else:
log.error("argument error: get_context1(collname=<slownet/sskj>)")
return []
sentence = self.get_tokenized_sentence(mytid)
# return [(ssj_id, {word: _, lemma: _, msd: _}), ...]
hw_idx = -1
for i, e in enumerate(sentence):
if e[0] == mytid:
hw_idx = i
hw_lemma = e[1]["lemma"]
break
hw_glosses = dictionary_interface.sense_glosses(hw_lemma)
if len(hw_glosses) == 0:
log.info("hw: {} has 0 glosses".format(hw_lemma))
return {
"hw": None,
"err": "headword {} has no glosses in {}".format(
hw_lemma, collname)
}
tokens_before = []
ii = hw_idx - 1
while(ii >= 0 and len(tokens_before) < radius):
lemma = sentence[ii][1].get("lemma")
if (
lemma is not None and
len(lemma) >= min_token_len
):
if get_glosses:
glosses = dictionary_interface.sense_glosses(lemma)
else:
glosses = [{"def": "--none--", "gloss": "--none--"}]
if len(glosses) > 0:
tokens_before.insert(0, {
"lemma": lemma,
"glosses": glosses
})
ii -= 1
tokens_after = []
ii = hw_idx + 1
while(ii < len(sentence) and len(tokens_after) < radius):
lemma = sentence[ii][1].get("lemma")
if (
lemma is not None and
len(lemma) >= min_token_len
):
if get_glosses:
glosses = dictionary_interface.sense_glosses(lemma)
else:
glosses = [{"def": "--none--", "gloss": "--none--"}]
if len(glosses) > 0:
tokens_after.append({
"lemma": lemma,
"glosses": glosses
})
ii += 1
# log.debug("context1({}): {:.2f}".format(mytid, time() - tstart))
return {
"hw": {"lemma": hw_lemma, "glosses": hw_glosses},
"context": tokens_before + tokens_after
}
def test_context1(self, mytid, hw=""):
res = ""
context = self.get_context1(
mytid, collname="slownet", radius=2, get_glosses=True)
if context["hw"] is None:
return context["err"] + "<br><br>"
res = "hw: {}<br>sentence: {}<br>".format(
hw, self.get_sentence(mytid))
tfigf_input = []
glosses = [context["hw"]] + context["context"]
for e in glosses:
res += "--->lemma: {} ({} senses)<br>".format(
e["lemma"], len(e["glosses"]))
for g in e["glosses"]:
res += "{}<br>".format(str(g))
tfigf_input.append(" ".join(k_utils.tokenize_multiple(
g["gloss"],
min_token_len=3,
stem=k_utils.stem_eng
)))
res += "<br><br>"
return res
def calc_all_senses(self, lesk_algorithm):
allcount = 0
count = 0
for k, e in self.entries.items():
allcount += len(e.raw_frames)
for k, e in self.entries.items():
if k == "biti": # skip this huge bag of words
continue
for frame in e.raw_frames:
count += 1
if count % 10 == 0:
log.info("calc_all_senses: ({}/{})".format(
count, allcount))
lesk_algorithm(frame.tids[0])
return None
def calc_all_senses_kmeans(self, kmeans_algorithm):
tstart = time()
allcount = len(self.entries)
count = 0
avg_times = []
for key in self.entries:
count += 1
if key == "biti":
continue
# cluster frames of each entry
log.info("calc_all_senses_kmeans: ({}/{}) [{}]".format(
count, allcount, key))
kmeans_algorithm(key)
"""
try:
kmeans_algorithm(key)
except ValueError:
continue
"""
avg_times.append(1.0 * (time() - tstart) / count)
log.info("avg_time: {:.2f}s".format(avg_times[-1]))
log.info("calc_all_senses_kmeans in {:.2f}s.".format(time() - tstart))
return None
if __name__ == "__main__":
log.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
log.addHandler(ch)
# run ssj_struct to create a ssj_test.pickle file
with open("ssj_test.pickle", "rb") as file:
ssj = pickle.load(file)
vallex = Vallex()
vallex.read_ssj(ssj)
vallex.sorted_words = {}
vallex.gen_sorted_words()
vallex.functors_index = {}
vallex.gen_functors_index()