forked from kristjan/cjvt-valency
frontend_devops fix
1
dip_src/valency/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
*.pickle
0
dip_src/valency/__init__.py
Normal file
386
dip_src/valency/dictionary_interface.py
Normal file
@@ -0,0 +1,386 @@
from valency import k_utils
import logging
from time import time
from valency.k_utils import dict_safe_key as dsk
from copy import deepcopy as DC

log = logging.getLogger(__name__)

# Upper limit for how many senses a lemma can have.
GUL = 20
SLOWNET_CACHE = "slownet_glosses_cache"


class DictionaryInterface:
    def __init__(self, vallex, dictionary):
        self.vallex = vallex
        self.dictionary = dictionary

    def find(self, lemma):
        return []

    def contains(self, lemma, upper_limit=GUL):
        # Of limited use: callers should rather check whether
        # sense_glosses returns a non-empty list.
        res = self.find(lemma)
        if upper_limit is not None and len(res) > upper_limit:
            return False
        return (len(res) != 0)

    def cached_glosses(self, lemma):
        # preprocessed sense_glosses (not used)
        res = list(self.vallex.db.cached_glosses.find(
            {"lemma": lemma, "dictionary": self.dictionary}))
        if len(res) == 0:
            return []
        return res[0]["glosses"]

    def sense_glosses(self, lemma):
        # array: gloss for each sense
        # gloss: {"gloss": ["<sense>", ...], "def": ["<sense>"], ...}
        return "dictionary_interface.py: not_yet_implemented"

    # Recursively pull strings out of a dictionary,
    # based on a list of keys.
    # Uses self.recursion_buffer.
    def pull_strings_wrapper(self, element, keys):
        if element is None:
            return []
        self.recursion_buffer = []
        self.pull_strings(element, keys)
        return self.recursion_buffer[:]

    def pull_strings(self, element, keys):
        # Recursively pull values out of a dict.
        # A matching key may hold a string, a dict, or a list of either.
        for k, e in element.items():
            if k not in keys:
                continue
            if isinstance(e, dict):
                self.pull_strings(e, keys)
            elif isinstance(e, str):
                self.recursion_buffer.append(e)
            elif isinstance(e, list):
                for ee in e:
                    if isinstance(ee, dict):
                        self.pull_strings(ee, keys)
                    elif isinstance(ee, str):
                        self.recursion_buffer.append(ee)

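# Illustrative example on hypothetical data: for a nested entry such as
#   el = {"ns0:sense": {"ns0:def": "razlaga",
#                       "ns0:cit": [{"ns0:quote": "zgled"}]}}
# pull_strings_wrapper(el, ["ns0:sense", "ns0:def", "ns0:cit", "ns0:quote"])
# returns ["razlaga", "zgled"]: nested dicts and lists are traversed, and
# only string values stored under the listed keys are collected.
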
class Sskj(DictionaryInterface):
    def __init__(self, vallex):
        super().__init__(vallex, "sskj")

    def find(self, lemma):
        res = list(self.vallex.db.sskj.find(
            {"ns0:entry.ns0:form.ns0:orth": lemma}
        ))
        return res

    def sense_glosses(self, lemma, upper_limit=GUL):
        entries = self.find(lemma)
        if upper_limit is not None and len(entries) > upper_limit:
            log.info("sense_glosses({}): too many sense entries".format(lemma))
            return []
        senses = []
        if len(entries) == 0:
            return []
        for e in entries:
            senses.extend(dsk(
                e["ns0:entry"], "ns0:sense"))
        keys = [
            "ns0:def", "ns0:cit", "ns0:quote",
            "ns0:gloss", "ns0:sense", "ns0:orth",
            "ns0:form", "#text"
        ]
        glosses = []
        for s in senses:
            gloss = self.pull_strings_wrapper(s, keys)
            if len(gloss) == 0:
                continue
            glosses.append({
                "gloss": gloss,
                "def": self.pull_strings_wrapper(s, ["ns0:sense", "ns0:def"])
            })
        return glosses


class SloWnet(DictionaryInterface):
    def __init__(self, vallex):
        super().__init__(vallex, "slownet")
        self.hypernym_buffer = []

    def slo_to_eng(self, lemma):

        def helper_get_eng_lemmas(r):
            res = []
            for literal in dsk(r, "SYNONYM"):
                if literal["@xml:lang"] == "en":
                    for lt in dsk(literal, "LITERAL"):
                        res.append(lt["#text"])
            return res

        # takes a slo token, returns array of english counterparts
        results = self.find(lemma)
        eng_lemmas = []
        for r in results:
            eng_lemmas.extend(helper_get_eng_lemmas(r))
        return eng_lemmas

    def helper_get_hypernyms(self, entry):
        res = []
        dd = dsk(entry, "ILR")
        for d in dd:
            if d["@type"] == "hypernym":
                res.append(d["#text"])
        return res

    def helper_get_en_literals(self, entry):
        res = []
        synonyms = dsk(entry, "SYNONYM")
        for syn in synonyms:
            if syn["@xml:lang"] == "en":
                literals = dsk(syn, "LITERAL")
                for lit in literals:
                    res.append(lit["#text"])
        return res

    def rek_root_chain(self, slownet_id):
        entry = self.find_by_id(slownet_id)
        if entry is None:
            return []
        res = self.helper_get_en_literals(entry)
        for hypernym_id in self.helper_get_hypernyms(entry):
            res.extend(self.rek_root_chain(hypernym_id))
        return res

    def root_chain(self, lemma):
        cached = list(self.vallex.db.cached_root_chains.find({
            "lemma": lemma
        }))
        if cached:
            return cached[0]["data"]

        res = self.slo_to_eng(lemma)
        entries = self.find(lemma)
        start_hypernym_ids = []
        for ent in entries:
            start_hypernym_ids.extend(self.helper_get_hypernyms(ent))
        for shi in start_hypernym_ids:
            res.extend(self.rek_root_chain(shi))
        self.vallex.db.cached_root_chains.insert({
            "lemma": lemma,
            "data": res
        })
        return res

    def find_by_id(self, slownet_id):
        res = list(self.vallex.db.slownet.find({"ID": slownet_id}))
        if len(res) == 0:
            log.error("ID: {} not in db.slownet.".format(slownet_id))
            return None
        return res[0]

    def find(self, lemma):
        return list(self.vallex.db.slownet.find({"slo_lemma": lemma}))
        """
        # elemMatch for array query
        res = list(self.vallex.db.slownet.find({
            "SYNONYM": {'$elemMatch': {
                "LITERAL": {'$elemMatch': {"#text": lemma}}
            }}
        }))
        """

    def hypernyms(self, slownet_id, level):
        if level == 3:
            return
        elements = list(self.vallex.db.slownet.find({"ID": slownet_id}))
        if len(elements) == 0:
            return
        for e in elements:
            ei = self.extract_element_info(e)
            self.hypernym_buffer.append({
                "def": ei["domain"] + ei["def"],
                "gloss": ei["domain"] + ei["def"] + ei["usage"]
            })
            for ilr in ei["ilr"]:
                self.hypernyms(ilr, level + 1)

    def extract_element_info(self, e):
        domain = []
        dd = dsk(e, "DOMAIN")
        for d in dd:
            domain.append(d)
        definition = []
        dd = dsk(e, "DEF")
        for d in dd:
            if d["@xml:lang"] == "en":
                definition.append(d["#text"])
        ilr = []
        dd = dsk(e, "ILR")
        for d in dd:
            if d["@type"] == "hypernym":
                ilr.append(d["#text"])
        usage = []
        dd = dsk(e, "USAGE")
        for d in dd:
            if d["@xml:lang"] == "en":
                usage.append(d["#text"])
        return {
            "domain": domain,
            "def": definition,
            "ilr": ilr,
            "usage": usage,
        }

    def sense_glosses(self, lemma, upper_limit=GUL):
        # stime = time()

        # caching
        db_key = {
            "lemma": lemma,
            "upper_limit": upper_limit
        }
        cache = list(self.vallex.db[SLOWNET_CACHE].find(db_key))
        if len(cache) > 0:
            return cache[0]["data"]

        entries = self.find(lemma)
        if upper_limit is not None and len(entries) > upper_limit:
            # log.info("sense_glosses({}): too many senses".format(lemma))
            return []
        ret_glosses = []
        for e in entries:
            defs = []
            glosses = []
            self.hypernym_buffer = []
            ei = self.extract_element_info(e)
            self.hypernym_buffer.append({
                "def": ei["domain"] + ei["def"],
                "gloss": ei["domain"] + ei["def"] + ei["usage"]
            })
            for ilr in ei["ilr"]:
                self.hypernyms(ilr, 1)

            [defs.extend(x["def"]) for x in self.hypernym_buffer]
            [glosses.extend(x["gloss"]) for x in self.hypernym_buffer]
            ret_glosses.append({
                "def": defs,
                "gloss": glosses,
            })

        # log.debug("slownet.sense_glosses({}): {:.2f}s".format(
        #     lemma, time() - stime))

        # caching (write to the same collection the lookup above reads from)
        db_entry = {
            "lemma": db_key["lemma"],
            "upper_limit": db_key["upper_limit"],
            "data": ret_glosses
        }
        self.vallex.db[SLOWNET_CACHE].update(
            db_key, db_entry, upsert=True
        )
        return ret_glosses


class Sskj2(DictionaryInterface):
    def __init__(self, vallex):
        super().__init__(vallex, "sskj")

    def find(self, lemma):
        pos = "glagol"
        if lemma[-1] == "_":
            pos = "pridevnik"
        res = list(self.vallex.db.sskj.find({
            "izt_clean": lemma,
            "pos": pos
        }))
        return res

    def count_senses(self, lemma):
        entries = self.find(lemma)
        if len(entries) == 0:
            return 0
        ol = dsk(entries[0], "ol")
        if len(ol) == 0:
            return 1
        return len(ol[0]["li"])

    def sense_glosses(self, lemma, upper_limit=GUL):

        def helper_dict_safe_add(dic, key, el):
            if key not in dic:
                dic[key] = []
            dic[key].append(el)

        def helper_pull_rec(el_lst, res_dct):
            for el in el_lst:
                if isinstance(el, dict):
                    if ("@title" in el) and ("#text" in el):
                        helper_dict_safe_add(
                            res_dct, el["@title"], el["#text"])
                    if "span" in el:
                        helper_pull_rec(dsk(el, "span"), res_dct)
                    if ("ol" in el) and ("li" in el["ol"]):
                        helper_pull_rec(el["ol"]["li"], res_dct)
                    if "li" in el:
                        helper_pull_rec(el["li"], res_dct)

        entries = self.find(lemma)
        if len(entries) == 0:
            return []
        if len(entries) > 1:
            log.warning("{} entries for {} in sskj2.".format(
                len(entries), lemma))
        glosses_per_entry = []
        for idx, entry in enumerate(entries):
            res_dict = {}
            if "span" in entry:
                helper_pull_rec(dsk(entry, "span"), res_dict)
            # senses
            res_dict["senses"] = []
            if ("ol" in entry) and ("li" in entry["ol"]):
                for el in dsk(entry["ol"], "li"):
                    tmp = {"sskj_sense_id": el["span"][0]}
                    helper_pull_rec(dsk(el, "span"), tmp)
                    helper_pull_rec(dsk(el, "ol"), tmp)
                    res_dict["senses"].append(DC(tmp))

            def helper_create_gloss(dct):
                keys = ["Razlaga", "Zgled", "Stranska razlaga", "Sopomenka"]
                ret = []
                for k in keys:
                    ret.extend(dsk(dct, k))
                return ret

            glosses = []
            n_senses = len(res_dict["senses"])
            if n_senses == 0:
                glosses.append({
                    "sskj_sense_id": "1-1",
                    "gloss": helper_create_gloss(res_dict),
                    "def": dsk(res_dict, "Razlaga")
                })
                return glosses

            for sense in res_dict["senses"]:
                glosses.append({
                    "sskj_sense_id": "{}-{}".format(
                        sense["sskj_sense_id"], n_senses),
                    "gloss": helper_create_gloss(sense),
                    "def": dsk(sense, "Razlaga")
                })
            glosses_per_entry.append(glosses)

        # add entry_id before the sense id:
        # entry_id-sskj_sense_id-n_senses
        all_glosses = []
        for idx, glosses in enumerate(glosses_per_entry):
            entry_id = idx + 1  # start with 1
            for gloss in glosses:
                gloss["sskj_sense_id"] = "{}-{}".format(
                    entry_id, gloss["sskj_sense_id"])
                all_glosses.append(gloss)
        return all_glosses
96
dip_src/valency/frame.py
Normal file
@@ -0,0 +1,96 @@
import logging

log = logging.getLogger(__name__)


class Frame():
    def __init__(self, tids, deep_links=None, slots=None, hw=None):
        self.hw = hw
        self.tids = tids  # list of tokens with the same hw_lemma
        # Each tid = "S123.t123";
        # you can get the sentence with vallex.get_sentence(S123)
        self.slots = []
        if slots is None:
            self.slots = self.init_slots(deep_links)
        else:
            self.slots = slots
        self.sense_info = {}
        self.sentences = None  # Used for passing to view in app.py, get_frames
        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }

    def to_json(self):
        ret = {
            "hw": self.hw,
            "tids": self.tids,
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
        }
        return ret

    def init_slots(self, deep):
        slots = []
        for link in deep:
            slots.append(Slot(
                functor=link["functor"],
                tids=[link["to"]]
            ))
        return slots

    def sort_slots(self):
        # ACT, PAT, alphabetically
        srt1 = [
            x for x in self.slots
            if (x.functor == "ACT" or
                x.functor == "PAT")
        ]
        srt1 = sorted(srt1, key=lambda x: x.functor)
        srt2 = [
            x for x in self.slots
            if (x.functor != "ACT" and
                x.functor != "PAT")
        ]
        srt2 = sorted(srt2, key=lambda x: x.functor)
        self.slots = (srt1 + srt2)

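    # Illustrative example: slots with functors [TIME, PAT, ACT, LOC] are
    # reordered to [ACT, PAT, LOC, TIME]; ACT and PAT always lead, the
    # remaining functors follow alphabetically.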
    def to_string(self):
        ret = "Frame:\n"
        ret += "sense_info: {}\n".format(str(self.sense_info))
        ret += "tids: ["
        for t in self.tids:
            ret += (str(t) + ", ")
        ret += "]\n"
        if self.slots is not None:
            ret += "slots:\n"
            for sl in self.slots:
                ret += (sl.to_string() + "\n")
        return ret


class Slot():
    # Each slot is identified by its functor (ACT, PAT, ...)
    # It consists of different tokens.
    def __init__(self, functor, tids=None, count=None):
        self.functor = functor
        self.tids = tids or []
        self.count = count or 1

    def to_string(self):
        ret = "---- Slot:\n"
        ret += "functor: {}\n".format(self.functor)
        ret += "tids: ["
        for t in self.tids:
            ret += (str(t) + ", ")
        ret += "]\n"
        ret += "----\n"
        return ret

    def to_json(self):
        ret = {
            "functor": self.functor,
            "tids": self.tids,
            "count": self.count
        }
        return ret
367
dip_src/valency/k_utils.py
Normal file
@@ -0,0 +1,367 @@
import os
import pickle
import nltk
import random
from time import time
import string
from polyglot.text import Word
import logging

log = logging.getLogger(__name__)
sno = nltk.stem.SnowballStemmer("english")


def dict_safe_key(dic, key):
    # Returns a list, no matter what:
    # a single element is wrapped into a list,
    # a missing key yields an empty list.
    if (
        dic is None or
        key not in dic
    ):
        return []
    subdic = dic[key]
    if not isinstance(subdic, list):
        return [subdic]
    return subdic

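# Illustrative examples on hypothetical inputs:
#   dict_safe_key({"a": 1}, "a")       -> [1]
#   dict_safe_key({"a": [1, 2]}, "a")  -> [1, 2]
#   dict_safe_key({"a": 1}, "b")       -> []
#   dict_safe_key(None, "a")           -> []
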
def pickle_dump(data, path):
    with open(path, "wb") as file:
        pickle.dump(data, file)
    log.info("Dumped data to {}.".format(path))
    return True


def pickle_load(path):
    ret = None
    if os.path.isfile(path):
        with open(path, "rb") as file:
            ret = pickle.load(file)
        log.info("Loaded data from {}.".format(path))
    return ret  # Returns None in case of failure.


# Implemented bucket sort for alphabetically sorting Slovenian words.
# Bucket sort >>>>>>>>>>>>>>>>>>>>
def gen_sbs_alphabet():
    alphabet = "abcčdefghijklmnoprsštuvzž"
    return {letter: (idx + 1) for idx, letter in enumerate(alphabet)}


slo_bucket_sort_alphabet = gen_sbs_alphabet()


def slo_bucket_sort(words, key=None):
    if key is None:
        def key(x):
            return x

    def alph_score(word, idx):
        kword = key(word)
        if idx >= len(kword):
            return 0
        return slo_bucket_sort_alphabet.get(kword[idx]) or 0

    def list_to_bins(words, idx):
        bins = [[] for i in range(len(slo_bucket_sort_alphabet.keys()) + 1)]
        for word in words:
            bins[alph_score(word, idx)].append(word)
        return bins

    def bins_to_list(bins):
        lst = []
        for b in bins:
            for el in b:
                lst.append(el)
        return lst

    maxLen = 0
    for w in words:
        if len(key(w)) > maxLen:
            maxLen = len(key(w))
    maxIdx = maxLen - 1
    for idx in range(maxIdx, -1, -1):
        bins = list_to_bins(words, idx)
        words = bins_to_list(bins)
        """
        print(idx)
        def get_letter(idx, word):
            kword = key(word)
            if idx < len(kword):
                return(kword[idx])
            return "#"
        print([(word, get_letter(idx, word)) for word in words])
        """
    return words
# Bucket sort <<<<<<<<<<<<<<<<<<<<

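# Illustrative example on hypothetical input; the buckets follow the
# Slovenian alphabet, so "č" sorts after "c" and "ž" after "z":
#   slo_bucket_sort(["žaba", "cesta", "časa", "ananas"])
#   -> ["ananas", "cesta", "časa", "žaba"]
# Characters outside the alphabet score 0 and therefore sort to the front.
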
def stem_slo(x):
    # Simplified;
    # remove the last syllable.
    w = Word(x, language="sl").morphemes
    ret = "".join(w[:-1])
    return ret


def stem_eng(x):
    return sno.stem(x)


def tokenize(sentence, min_token_len=3, stem=None):
    # input: sentence string
    # output: list of token strings
    if stem is None:
        def stem(x):
            return x
    all_tokens = []
    sent_txt = nltk.sent_tokenize(sentence)
    for sent in sent_txt:
        tokens = nltk.word_tokenize(sent)
        all_tokens.extend(tokens)
    res = []
    for x in all_tokens:
        if x in string.punctuation:
            continue
        stemmed = stem(x.lower())
        if min_token_len is None or len(stemmed) >= min_token_len:
            res.append(stemmed)
    return res


def tokenize_multiple(str_list, min_token_len=3, stem=None):
    # tstart = time()
    res = []
    for sentence in str_list:
        res.extend(tokenize(sentence, min_token_len, stem))
    # log.debug("tokenize_multiple: {:.2f}s".format(time() - tstart))
    return res


def t_tokenize():
    teststring = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    print(teststring)
    res = tokenize(teststring, min_token_len=None)
    print(res)


def permute_paths(list2d, x=None, y=None, paths=None, current_path=None):
    # python stuff
    if x is None:
        x = -1
    if paths is None:
        paths = []
    if current_path is None:
        current_path = []

    if x >= len(list2d) - 1:
        paths.append(current_path)
        return paths
    for i in range(len(list2d[x + 1])):
        tmp_path = current_path + [(x + 1, i)]
        # Computational complexity problem (prune long lists)
        # len == 12 -> 30%, len == 5 -> 100%
        # if random.randint(0, 100) <= (100 - 10 * (len(list2d) - 5)):
        if True:
            paths = permute_paths(
                list2d,
                x + 1,
                i,
                paths,
                tmp_path
            )
    return paths


def t_permute_paths():
    list2d = [
        ["Greta"],
        ["backflips"],
        ["through", "around"],
        ["North Korea", "kindergarten"],
        ["with", "without"],
        ["a"],
        ["bag of", "abundance of"],
        ["bolts", "janitors"]
    ]

    print(list2d)
    paths = permute_paths(list2d=list2d)
    for path in paths:
        print([list2d[p[0]][p[1]] for p in path])


def find_overlaps(list_a, list_b):
    # Input: two lists.
    # Output: lists of overlapping elements.
    dict_a = {}
    dict_b = {}
    lists = [list_a, list_b]
    dicts = [dict_a, dict_b]
    for lidx in range(len(lists)):
        for elidx in range(len(lists[lidx])):
            el = lists[lidx][elidx]
            if el not in dicts[lidx]:
                dicts[lidx][el] = []
            dicts[lidx][el].append(elidx)

    substrings = []

    sda = sorted(dict_a.keys())
    sdb = sorted(dict_b.keys())

    i_sda = 0
    i_sdb = 0
    while ((i_sda < len(sda) and i_sdb < len(sdb))):
        if sda[i_sda] == sdb[i_sdb]:
            lia = dict_a[sda[i_sda]]
            lib = dict_b[sdb[i_sdb]]
            for llia in lia:
                for llib in lib:
                    tmp_substr = []
                    ii = 0
                    while (
                        (llia + ii < len(list_a)) and
                        (llib + ii < len(list_b)) and
                        (list_a[llia + ii] == list_b[llib + ii])
                    ):
                        tmp_substr.append(list_a[llia + ii])
                        ii += 1
                    ii = 1
                    while (
                        (llia - ii >= 0) and
                        (llib - ii >= 0) and
                        (list_a[llia - ii] == list_b[llib - ii])
                    ):
                        tmp_substr.insert(0, list_a[llia - ii])
                        ii += 1
                    substrings.append(tmp_substr)
        if sda[i_sda] < sdb[i_sdb]:
            i_sda += 1
        else:
            i_sdb += 1

    uniques = set()
    res = []
    for ss in substrings:
        if str(ss) not in uniques:
            uniques.add(str(ss))
            res.append(ss)
    return res


def find_overlaps_str(tokens_a, tokens_b):
    # Strings only.
    overlaps = []
    for N in range(1, 5):
        ngrams_a = []
        for i in range(len(tokens_a)):
            if i + N <= len(tokens_a):
                ngrams_a.append(tuple(tokens_a[i:i + N]))
        ngrams_b = []
        for i in range(len(tokens_b)):
            if i + N <= len(tokens_b):
                ngrams_b.append(tuple(tokens_b[i:i + N]))
        overlaps.extend(list(set(ngrams_a).intersection(set(ngrams_b))))

    res = []
    for ovl in sorted(overlaps, key=lambda x: len(x), reverse=True):
        oovl = " ".join(ovl)
        for r in res:
            if oovl in r:
                break
        else:
            res.append(oovl)
    res[:] = [x.split(" ") for x in res]
    return res


def t_find_overlaps():
    res = []
    input_len = [10, 100, 1000, 10000]
    for ll in input_len:
        alen = ll + int(ll * random.uniform(0.8, 1))
        blen = ll + int(ll * random.uniform(0.8, 1))
        a = [random.randint(0, 100) for x in range(alen)]
        b = [random.randint(0, 100) for x in range(blen)]
        tstart = time()
        find_overlaps(a, b)
        res.append({
            "time": time() - tstart,
            "input_size": ll
        })
    """
    list_a = [6, 6, 4, 8, 3, 2, 2, 5, 6, 3, 4, 7, 5]
    list_b = [5, 3, 6, 8, 6, 6, 5, 3, 2, 6, 7, 8, 3, 2, 3, 2, 2, 5]
    res = find_overlaps(list_a, list_b)
    """
    for r in res:
        print(r)


def t1_find_overlaps():
    t1 = "This is a test sentence. I hope it works. .. Asdf. asdf ,,,;"
    t2 = "this is a seconde sentence. I hope my stuff works."
    print(t1)
    print(t2)
    res = find_overlaps(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)

    print()

    res = find_overlaps_str(tokenize(t1), tokenize(t2))
    for r in res:
        print(r)


def t_find_overlaps_str():
    t1 = [
        'vsa', 'moja', 'možganska', 'beda', 'se', 'združuje',
        'v', 'dejstvu', 'da', 'sem', 'si', 'čeprav', 'sem', 'pozabil',
        'ulico', 'zapomnil', 'hišno', 'številko'
    ]
    t2 = [
        'narediti', 'doseči', 'da', 'se', 'kaj', 'aktivno', 'ohrani',
        'v', 'zavesti', 'zapomniti', 'si', 'imena', 'predstavljenih',
        'gostov', 'dobro', 'natančno', 'slabo', 'si', 'kaj', 'zapomniti',
        'takega', 'sem', 'si', 'zapomnil', 'zapomnite', 'te', 'prizore'
    ]
    res = find_overlaps(t1, t2)
    print(res)


def t_slo_bucket_sort():
    a1 = []
    a2 = []
    with open("./tests/m_besede2.txt") as f:
        for line in f:
            a1.append(line.split("\n")[0])
            a2.append((line.split("\n")[0], random.randint(0, 9)))

    a1 = slo_bucket_sort(a1)
    a2 = slo_bucket_sort(a2, key=lambda x: x[0])

    check = True
    for i in range(len(a1)):
        check &= (a1[i] == a2[i][0])
        print("{:<10}{:>10}".format(str(a1[i]), str(a2[i])))
    print(check)


def t1_slo_bucket_sort():
    words = "_xyz zebra. .bober raca bor borovnica antilopa".split(" ")
    words.append("test space")
    words.append("test srrrr")
    words.append("test saaa")
    for w in slo_bucket_sort(words):
        print(w)


if __name__ == "__main__":
    # t_find_overlaps()
    # t1_find_overlaps()
    # t_tokenize()
    # t_find_overlaps_str()
    t1_slo_bucket_sort()
247
dip_src/valency/mongo_tools.py
Normal file
@@ -0,0 +1,247 @@
import pymongo
import xmltodict
import xml.etree.ElementTree as ET
from time import time
import json
from valency.sskj_scraper import SskjScraper
from bs4 import BeautifulSoup

# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)

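# Illustrative examples:
#   "café".translate(transtab)  -> "cafe"
#   "Ŕáča".translate(transtab)  -> "Rača"
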
def mongo_test():
    client = pymongo.MongoClient(
        "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    )

    db = client.texts

    coll = db.test

    print(coll.find_one())


def basic_connection(ip_addr=None, port=None):
    if ip_addr is None:
        ip_addr = "127.0.0.1"
    if port is None:
        port = 26644
    client = pymongo.MongoClient(
        "mongodb://{}:{}@{}:{}/texts".format(
            "kristjan", "simple567", ip_addr, str(port))
    )
    err_msg = "OK"
    try:
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        err_msg = err
        return (None, err_msg)
    db = client.texts
    return (db, err_msg)


def check_collections(db, coll_names):
    collections = db.collection_names()
    for cn in coll_names:
        if cn not in collections:
            db.create_collection(cn)


def prepare_user_tokens(db):
    CNAME = "v2_user_tokens"
    db[CNAME].drop()
    db.create_collection(CNAME)
    EXPIRE = 151200  # 42 hours
    # EXPIRE = 10  # 10 sec
    db[CNAME].ensure_index("date", expireAfterSeconds=EXPIRE)

    # use this: utc_timestamp = datetime.datetime.utcnow()
    # user_tokens.insert({
    #     '_id': 'utc_session', "date": utc_timestamp,
    #     "session": "test session"})


def sskj_to_mongo(sskj_path):
    # Deprecated, use sskj2_to_mongo
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    ts = time()
    sskj = ET.parse(sskj_path).getroot()
    db, err_msg = basic_connection()
    col_names = ["sskj"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()
    text = sskj.find("tei:text", ns)
    body = text.find("tei:body", ns)
    n_ent = 0
    for entry in body.findall("tei:entry", ns):
        n_ent += 1
        tmpstr = ET.tostring(entry)
        datachunk = xmltodict.parse(tmpstr)
        dictchunk = json.loads(json.dumps(datachunk))
        """
        pp = pprint.PrettyPrinter()
        pp.pprint(dictchunk)
        """
        db.sskj.insert(dictchunk)
    # query example: db.sskj.find({'ns0:entry.ns0:form.ns0:orth': "kaplanček"})
    print("sskj to mongo: {} entries in {:.2f}s".format(n_ent, time() - ts))


def slownet_to_mongo(slw_path):
    # .slownet contains the database from the .xml file
    # added top-level field ["slo_lemma"] for faster querying
    ts = time()
    slownet = ET.parse(slw_path).getroot()
    db, err_msg = basic_connection()
    col_names = ["slownet_map", "slownet"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()

    slo_to_id = {}
    for synset in slownet.findall("SYNSET"):
        tmpstr = ET.tostring(synset)
        datachunk = xmltodict.parse(tmpstr)
        dictchunk = json.loads(json.dumps(datachunk))
        dictchunk = dictchunk["SYNSET"]
        # pp.pprint(dictchunk)

        # insert into slo_to_id
        if "SYNONYM" in dictchunk:
            synonyms = dictchunk["SYNONYM"]
            if not isinstance(synonyms, list):
                synonyms = [synonyms]
            for syn in synonyms:
                if syn["@xml:lang"] == "sl":
                    if "LITERAL" in syn:
                        literals = syn["LITERAL"]
                        if not isinstance(literals, list):
                            literals = [literals]
                        for lit in literals:
                            slo_keyword = lit["#text"]
                            if "." in slo_keyword:
                                continue
                            if "slo_lemma" not in dictchunk:
                                dictchunk["slo_lemma"] = []
                            dictchunk["slo_lemma"].append(slo_keyword)
        db.slownet.insert(dictchunk)

    # pp.pprint(slo_to_id)
    db.slownet.ensure_index([("ID", pymongo.ASCENDING)])
    db.slo_to_id.insert(slo_to_id)
    print("sloWNet to mongo in {:.2f}s".format(time() - ts))


def scrape_sskj():
    # Deprecated!
    # Walk through keys in slo_to_id and scrape sskj data.
    client = pymongo.MongoClient(
        "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    )
    db = client.texts
    words_list = sorted(db.slo_to_id.find_one())

    print(len(words_list))
    sscraper = SskjScraper()

    last_word = "nogometaš"
    db.scraped_sskj.remove({"word": last_word})
    lock = True
    for word in words_list:
        if word == last_word:
            lock = False

        if not lock:
            res = sscraper.scrape(word)
            if len(res) > 0:
                db.scraped_sskj.insert({"word": word, "bag": res})


def sskj2_to_mongo(sskj2_path):
    tstart = time()

    db, err_msg = basic_connection()
    col_names = ["sskj"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()

    with open(sskj2_path) as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    divs = soup.find_all("div")
    for i, div in enumerate(divs):
        if i % 100 == 0:
            print("{}/{}".format(i, len(divs)))
        datachunk = xmltodict.parse(str(div))
        datachunk = datachunk["div"]

        # pos (part of speech)
        pos_keywords = {
            "samostalnik": 0,
            "pridevnik": 0,
            "glagol": 0,
            "prislov": 0,
            "predlog": 0,
            "členek": 0,
            "veznik": 0,
            "medmet": 0,
            "povedkovnik": 0
        }
        for span in div.find_all("span"):
            attrs = [e for k, e in span.attrs.items()]
            for attr in attrs:
                for ak in attr.split(" "):
                    akl = ak.lower()
                    if akl in pos_keywords:
                        pos_keywords[akl] += 1
        pos = "unknown"
        pos_max = 0
        for k, e in pos_keywords.items():
            if e > pos_max:
                pos = k
                pos_max = e
        datachunk["pos"] = pos

        # izt_clean
        izts = div.find_all("span", {"title": "Iztočnica"})
        if len(izts) == 0:
            print("Entry {} has no Iztočnica.".format(i))
            continue
        izt = ((izts[0].text).translate(transtab)).lower()
        ispl = izt.split(" ")
        has_se = False
        if len(ispl) and ispl[-1] == "se":
            izt = " ".join(ispl[:-1])
            has_se = True
        datachunk["izt_clean"] = izt
        datachunk["has_se"] = has_se

        dictchunk = json.loads(json.dumps(datachunk))
        db.sskj.insert(dictchunk)

    db.sskj.create_index([("izt_clean", pymongo.TEXT)])
    print("sskj2 to mongo: {} entries in {:.2f}s".format(
        len(divs), time() - tstart))
    return None


if __name__ == "__main__":
    # slownet_path = "../../data/slownet/slownet-2015-05-07.xml"
    # slownet_to_mongo(slownet_path)

    # scrape_sskj()

    # sskj_path = "../../data/sskj/sskj.p5.xml"
    # sskj_to_mongo(sskj_path)

    # first file for testing; the original file takes up most of the RAM
    # sskj2_path = "../../data/sskj/sskj2_200.html"
    # sskj2_path = "../../data/sskj/sskj2_v1.html"
    # sskj2_to_mongo(sskj2_path)

    print("nothing here")
239
dip_src/valency/reduce_functions.py
Normal file
@@ -0,0 +1,239 @@
# Reduction functions for frames.
# Input: list of Frame objects; output: list of Frame objects.
# The app uses reduce_0, 1 and 5.

from valency.frame import Frame, Slot
from copy import deepcopy as DC
import logging

log = logging.getLogger(__name__)

SENSE_UNDEFINED = "nedefinirano"


def sorted_by_len_tids(frames):
    return sorted(
        frames,
        key=lambda x: len(x.tids),
        reverse=True
    )


def reduce_0(frames, vallex=None):
    # Newer request: frames should be sorted by their
    # functors list (basically reduce_1, except each
    # sentence gets its own frame).
    r1_frames = reduce_1(frames)
    sorting_strings = []
    separated_frames = []
    for frame in r1_frames:
        for tid in frame.tids:
            tmp_frame = DC(frame)
            tmp_frame.tids = [tid]
            separated_frames.append(tmp_frame)
            sorting_strings.append("".join(
                [slot.functor for slot in tmp_frame.slots]
            ))
    permutation = [x for _, x in sorted(
        zip(sorting_strings, range(len(sorting_strings))))]
    sorted_sep_frames = [separated_frames[i] for i in permutation]
    return sorted_sep_frames


def reduce_1(frames, vallex=None):
    # Combine frames with the same set of functors.
    # The order of functors is not important.
    frame_sets = []  # [set of functors, list of frames]
    for frame in frames:
        functors = [slot.functor for slot in frame.slots]

        for fs in frame_sets:
            if set(functors) == set(fs[0]):
                fs[1].append(frame)
                break
        else:
            # Python for-else: fires if the loop ended without a break.
            frame_sets.append([functors, [frame]])

    ret_frames = []
    for fs in frame_sets:
        tids = []
        slots = {}
        # All possible slots in this frame.
        for functor in fs[0]:
            slots[functor] = Slot(functor=functor)
        # Reduce slots from all frames. (Merge ACT from all frames, ...)
        for frame in fs[1]:
            tids += frame.tids
            for sl in frame.slots:
                slots[sl.functor].tids += sl.tids
        slots_list = []
        for k, e in slots.items():
            slots_list.append(e)
        rf = Frame(tids=tids, slots=slots_list)
        rf.sort_slots()
        ret_frames.append(rf)
    return sorted_by_len_tids(ret_frames)

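# Illustrative sketch of reduce_1 on hypothetical input: a frame with slots
# {ACT, PAT} and another with {PAT, ACT} share the same functor set, so they
# merge into one frame whose ACT slot carries the ACT tids of both inputs
# (likewise for PAT) and whose tids list is the concatenation of both tids
# lists; a third frame with slots {ACT, LOC} stays separate.
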
def reduce_3(raw_frames, vallex):
    # sskj simple lesk ids
    ssj_ids = [frame.tids[0] for frame in raw_frames]
    db_results = list(vallex.db.sskj_simple_lesk.find(
        {"ssj_id": {"$in": ssj_ids}}))
    id_map = {}
    for entry in db_results:
        id_map.update({entry["ssj_id"]: {
            "sense_id": entry.get("sense_id"),
            "sense_desc": entry.get("sense_desc")
        }})
    return frames_from_sense_ids(raw_frames, id_map)


def reduce_4(raw_frames, vallex):
    # kmeans ids
    ssj_ids = [frame.tids[0] for frame in raw_frames]
    db_results = list(vallex.db.kmeans.find(
        {"ssj_id": {"$in": ssj_ids}}))
    id_map = {}
    for entry in db_results:
        id_map.update({entry["ssj_id"]: {
            "sense_id": entry["sense_id"]
        }})
    return frames_from_sense_ids(raw_frames, id_map)


def reduce_5(raw_frames, vallex):
    USER_SENSE_COLL = "v2_sense_map"
    headword = raw_frames[0].hw
    ssj_ids_full = [frame.tids[0] for frame in raw_frames]
    # v2_sense_map stores only the sentence half of the ssj_id
    ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full]
    db_results = list(vallex.db[USER_SENSE_COLL].find({
        "ssj_id": {"$in": ssj_ids},
        "hw": headword,
    }))
    id_map = {}
    for entry in db_results:
        id_map[entry["ssj_id"]] = entry["sense_id"]

    ret_frames = frames_from_sense_ids(raw_frames, id_map)

    # sort: frames with senses to the top
    senses_undefined = []
    senses_defined = []
    for frame in ret_frames:
        if frame.sense_info["sense_id"] == SENSE_UNDEFINED:
            senses_undefined.append(frame)
        else:
            senses_defined.append(frame)
    ret_frames = senses_defined + senses_undefined

    return ret_frames


def frames_from_sense_ids(raw_frames, id_map):
    # id_map = dict {
    #     ssj_id: sense_id
    # }
    # id_dict = dict {
    #     sense_id: [frame, ...]
    # }
    id_dict = {}
    for frame in raw_frames:
        # long version ssj_id (S123.t12)
        frame_ssj_id = frame.tids[0]
        frame_sense_id = id_map.get(frame_ssj_id)
        if frame_sense_id is None:
            # try short version ssj_id (S123)
            frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1])
            frame_sense_id = id_map.get(frame_ssj_id)

        # set default if sense_id not found
        if frame_sense_id is None:
            frame_sense_id = SENSE_UNDEFINED
        """
        sense_id = id_map.get(frame.tids[0])
        if sense_id is not None:
            sense_id = sense_id.get("sense_id")
        else:
            sense_id = "nedefinirano"
        """
        if frame_sense_id not in id_dict:
            id_dict[frame_sense_id] = []
        id_dict[frame_sense_id].append(DC(frame))

    ret_frames = []
    for sense_id, frames in id_dict.items():
        tids = []
        reduced_slots = []
        for frame in frames:
            tids.extend(frame.tids)
            for slot in frame.slots:
                # if functor not in reduced slots,
                # add a new slot; else increase the count
                for rslot in reduced_slots:
                    if slot.functor == rslot.functor:
                        rslot.count += 1
                        rslot.tids.extend(slot.tids)
                        break
                else:
                    # in case the for loop didn't match a slot
                    reduced_slots.append(Slot(
                        functor=slot.functor,
                        tids=slot.tids,
                        count=1
                    ))
        reduced_frame = Frame(tids, slots=reduced_slots)
        id_map_entry = (
            id_map.get(tids[0]) or
            id_map.get(".".join(tids[0].split(".")[:-1]))
        )
        if id_map_entry is None:
            reduced_frame.sense_info = {
                "sense_id": SENSE_UNDEFINED,
            }
        else:
            reduced_frame.sense_info = {
                "sense_id": id_map_entry
            }
        reduced_frame.sort_slots()
        ret_frames.append(reduced_frame)
    return ret_frames


reduce_functions = {
    "reduce_0": {
        "f": reduce_0,
        "desc":
            "Vsaka pojavitev glagola dobi svoj stavčni vzorec.",
        "simple_name": "posamezni stavki"
    },
    "reduce_1": {
        "f": reduce_1,
        "desc":
            "Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog.",
        "simple_name": "združeni stavki"
    },
    "reduce_3": {
        "f": reduce_3,
        "desc":
            "Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. "
            "Pomeni so dodeljeni s pomočjo algoritma Simple Lesk.",
        "simple_name": "SSKJ_pomeni"
    },
    "reduce_4": {
        "f": reduce_4,
        "desc":
            "Združevanje stavčnih vzorcev na osnovi pomenov povedi "
            "s pomočjo algoritma K-Means. Število predvidenih pomenov "
            "podano na osnovi SSKJ.",
        "simple_name": "KMeans_pomeni"
    },
    "reduce_5": {
        "f": reduce_5,
        "desc":
            "Uporabniško dodeljeni pomeni povedi.",
        "simple_name": "po meri"
    }
}
0
dip_src/valency/seqparser/__init__.py
Normal file
284
dip_src/valency/seqparser/seqparser.py
Normal file
@@ -0,0 +1,284 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC

# Match sense ordinals (1., 2., ...)
rord = re.compile(r"^ *[0-9]+\. *$")

# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)


class Seqparser:
    def __init__(self):
        pass

    # main functions
    def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
        entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
        print("entries len: " + str(len(entries)))
        with open(raw_pickle_filepath, "wb") as f:
            tmpstr = json.dumps(dict(entries))
            pickle.dump(tmpstr, f)
        # debugging

    def raw_pickle_to_parsed_pickle(
        self, raw_pickle_filepath, parsed_pickle_filepath,
        se_list_filepath
    ):
        data = self.load_raw_pickle(raw_pickle_filepath)
        print("raw_pickle data len: " + str(len(data)))
        se_list = self.gen_se_list(data)
        print("se_list len: " + str(len(se_list)))
        with open(se_list_filepath, "wb") as f:
            pickle.dump(se_list, f)
        data1 = self.remove_se(data)
        data2 = self.reorganize(data1, se_list)
        print("data2 len: " + str(len(data2.keys())))
        with open(parsed_pickle_filepath, "wb") as f:
            pickle.dump(data2, f)

    # helper html reading functions
    def parse_file(self, path, f_parse_line):
        tstart = time()
        entries = defaultdict(list)
        with open(path, "r") as f:
            for line in f:
                data = f_parse_line(line)
                if data is not None:
                    entries[data["izt_clean"]].append(data)
        print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
        return entries

    def parse_line(self, line):

        def helper_bv_set(g_or_p):
            if g_or_p not in ["G", "P"]:
                print("Err g_or_p.")
                exit(1)
            if data.get("bv") is not None:
                if data["bv"] != g_or_p:
                    print(str(line))
                    # exit(1)
            data["bv"] = g_or_p

        data = {
            "izt": "",
            "izt_clean": "",
            "senses": defaultdict(list)
        }
        soup = BS(line, "html.parser")

        current_sense_id = "0"
        for span in soup.find_all("span"):

            # sense id
            if span.string is not None:
                rmatch = rord.match(span.string)
                if rmatch is not None:
                    current_sense_id = rmatch.group().strip()

            title = span.attrs.get("title")
            if title is not None:
                title = title.lower()

                # only verbs and adjectives
                if "glagol" in title:
                    helper_bv_set("G")
                    data["bv_full"] = title
                elif "pridevn" in title:
                    helper_bv_set("P")
                    data["bv_full"] = title

                # žšč
                if title == "iztočnica":
                    data["izt"] = span.string
                    data["izt_clean"] = span.string.translate(transtab).lower()

                # sense description
                if title == "razlaga" and span.string is not None:
                    data["senses"][current_sense_id].append(
                        ("razl", span.string))
                    if "pridevnik od" in span.string:
                        helper_bv_set("P")

                if title == "sopomenka":
                    subspan = span.find_all("a")[0]
                    if subspan.string is not None:
                        data["senses"][current_sense_id].append(
                            ("sopo", subspan.string))

        # save verbs and adjectives
        if (
            ("bv" not in data) or
            (data["bv"] != "P" and data["bv"] != "G")
        ):
            return None

        # sanity check
        if data["bv"] == "P" and " se" in data["izt_clean"]:
            print(data)
            exit(1)

        # append _ to adjective keywords
        if data["bv"] == "P":
            data["izt_clean"] = data["izt_clean"] + "_"

        # cleanup
        if "bv" not in data:
            print("Should not be here (no bv).")
            exit(1)
        del(data["bv"])
        if "bv_full" in data:
            del(data["bv_full"])

        return data

    # helper functions
    def load_raw_pickle(self, raw_pickle_filepath):
        with open(raw_pickle_filepath, "rb") as f:
            tmpstr = pickle.load(f)
        return json.loads(tmpstr)

    def helper_loop(self, data, fnc):
        for k, lst in data.items():
            for el in lst:
                fnc(el)

    def gen_se_list(self, data):

        def fnc1(el):
            ic = el["izt_clean"]
            if " se" in ic:
                se_list.append(ic)

        def fnc2(el):
            ic = el["izt_clean"]
            if ic in se_pruned:
                se_pruned.remove(ic)

        # hw entries that only exist with " se"
        se_list = []
        self.helper_loop(data, fnc1)
        se_pruned = set([hw.split(" se")[0] for hw in se_list])
        self.helper_loop(data, fnc2)
        return sorted(list(se_pruned))

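    # Illustrative sketch on hypothetical data: if the parsed entries contain
    # "bati se" but no plain "bati", gen_se_list returns ["bati"]; if both
    # "smejati se" and plain "smejati" occur, "smejati" is pruned from the
    # result.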
    def remove_se(self, data):

        def fnc1(el):
            nel = DC(el)
            ic = nel["izt_clean"]
            if " se" in ic:
                nic = ic.split(" se")[0]
                nel["izt_clean"] = nic
            data_new[nel["izt_clean"]].append(nel)

        data_new = defaultdict(list)
        self.helper_loop(data, fnc1)
        return dict(data_new)

    def reorganize(self, data, se_list):
        # some hw entries have several headwords,
        # some senses have subsenses
        # index everything, make 1 object per hw

        def helper_prune(sense_str):
            # remove space padding
            sense_str = sense_str.strip()

            if len(sense_str) == 1:
                return sense_str

            # remove banned characters from the string ending
            banned = ": ; . , - ! ?".split(" ")
            if sense_str[-1] in banned:
                return sense_str[:-1]

            return sense_str

        data_new = {}
        for k, lst in data.items():
            new_el = {
                "hw": k,
                "has_se": k in se_list,
                "senses": []
            }

            # if there is a single hw entry, homonym_id is 0
            if len(lst) == 1:
                homonym_id = -1
            else:
                homonym_id = 0

            # loop homonyms
            for el in lst:
                homonym_id += 1
                # loop top lvl sense ids
                for sense_id, sens_lst in el["senses"].items():
                    # loop subsenses
                    for i, sens in enumerate(sens_lst):
                        nsid = sense_id.split(".")[0]
                        if len(sens_lst) == 1:
                            nsid += "-0"
                        else:
                            nsid += ("-" + str(i + 1))
                        new_sense = {
                            "homonym_id": homonym_id,
                            # sense_id: sense_id-subsense_id
                            "sense_id": nsid,
                            "sense_type": sens[0],
                            "sense_desc": helper_prune(sens[1]),
                        }
                        new_el["senses"].append(new_sense)
            hw = new_el["hw"]
            if hw in data_new:
                print("Shouldn't be here.")
                print(new_el)
                exit(1)
            data_new[hw] = DC(new_el)
        # return data_new

        # check
        for hw, el in data_new.items():
            for sens in el["senses"]:
                if sens["sense_desc"] is None:
                    print(sens)

        return data_new


def plst(lst):
    for el in lst:
        print(el)


if __name__ == "__main__":
    datapath = "../../../data"
    html_filepath = datapath + "/sskj/sskj2_v1.html"
    raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
    parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
    se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"

    p = Seqparser()

    if True:
        print("html_to_raw_pickle({}, {})".format(
            html_filepath, raw_pickle_filepath))
        print("Big file, this might take a while (2 min).")
        tstart = time()
        p.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))

    if True:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
        tstart = time()
        p.raw_pickle_to_parsed_pickle(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))
    print("Done.")
218
dip_src/valency/ssj_struct.py
Normal file
@@ -0,0 +1,218 @@
import xml.etree.ElementTree as ET
from copy import deepcopy as DC
from time import time
import re
import logging
import sys
import pickle

log = logging.getLogger(__name__)

ET.register_namespace("xml", "http://www.w3.org/XML/1998/namespace")
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


# |$ for a default empty match
re_int = re.compile(r"t\d+|$")


# For sorting a "s" section in ssj; returns the key as an integer.
# example: "S123.t34" --> 34
def re_lmbd(el):
    s = re_int.findall(el)[0]
    if len(s) == 0:
        return 0
    else:
        return int(s[1:])


class SsjEntry:
    def __init__(self, ssj_id, s, deep_links):
        # See ssj xml structure.
        self.id = ssj_id
        self.s = DC(s)
        self.deep_links = DC(deep_links)


class SsjDict:
    def __init__(self):
        self.entries = {}

    """
    def read_xml(self, filepath):
        # No data loss.
        log.info("SsjDict.read_xml({})".format(filepath))
        t_start = time()
        tree = ET.parse(filepath)
        root = tree.getroot()
        stats = {
            "skipped": [],
            "duplicates": []
        }

        for s in root.iter("s"):
            s_id = s.attrib[XML_ID]
            tokens = {}
            for token in s:
                if token.tag == "linkGrp":
                    continue

                if token.tag == "w":
                    tokens[token.attrib[XML_ID]] = {
                        "msd": token.attrib["msd"],
                        "lemma": token.attrib["lemma"],
                        "word": token.text
                    }
                elif token.tag == "c":
                    tokens[token.attrib[XML_ID]] = {
                        "word": token.text
                    }
                else:
                    # <S />
                    pass

            linkGrps = s.findall("linkGrp")
            if len(linkGrps) < 2:
                # Take only entries with both deep and shallow
                # syntactic annotation
                stats["skipped"].append(s_id)
                continue

            linkG = {}
            for el in linkGrps:
                if el.attrib["type"] == "dep":
                    linkG["dep"] = el
                elif el.attrib["type"] == "SRL":
                    linkG["SRL"] = el
                else:
                    raise KeyError("Unknown linkGrp.")

            if s_id in self.entries:
                stats["duplicates"].append(s_id)
            self.entries[s_id] = SsjEntry(
                s_id,
                s.attrib["n"],
                tokens,
                create_edge_dict(linkG["dep"]),
                create_edge_dict(linkG["SRL"])
            )

        t_end = time()
        log.info("Time: {}s.".format(t_end - t_start))
        log.info(
            "{} duplicates, skipped {} elements (missing linkGrp).".format(
                len(stats["duplicates"]), len(stats["skipped"]))
        )
    """

    def read_xml_v2(self, filepath):
        NS_DICT = {
            "tei": "http://www.tei-c.org/ns/1.0",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }

        def ns_prefix(ns):
            return "{" + NS_DICT[ns] + "}"

        def helper_get_sentence(tree_s):
            # all w and pc elements
            ret = []
            for el in tree_s.iter():
                if (
                    el.tag == ns_prefix("tei") + "w" or
                    el.tag == ns_prefix("tei") + "pc"
                ):
                    ret.append(el)
            return ret

        def helper_get_functor_links(tree_s):
            # links for the SRL linkGrp
            lg = None
            for linkGrp in tree_s.findall("tei:linkGrp", NS_DICT):
                if linkGrp.attrib["type"] == "SRL":
                    lg = linkGrp
                    break
            else:
                return []
            ret = []
            for link in lg:
                ret.append(link)
            return ret

        def helper_gen_deep_links(link_list):
            deep_links = []
            for link in link_list:
                deep_links.append({
                    "from": link.attrib["target"].split(" ")[0][1:],
                    "to": link.attrib["target"].split(" ")[1][1:],
                    "functor": link.attrib["ana"].split(":")[1]
                })
            return deep_links

        log.info("SsjDict.read_xml({})".format(filepath))
        t_start = time()
        stats = {
            "total_count": 0,
            "deep_roles_count": 0,
            "duplicated_sid": 0,
        }
        tree = ET.parse(filepath)
        root = tree.getroot()

        for s in root.findall(".//tei:s", NS_DICT):
            stats["total_count"] += 1
            s_id = s.attrib[ns_prefix("xml") + "id"]

            # get_functors (deep semantic roles)
            functor_links = helper_get_functor_links(s)
            if len(functor_links) == 0:
                continue
            stats["deep_roles_count"] += 1

            # get_sentence
            tokens = {}
            for token in helper_get_sentence(s):
                tid = token.attrib[ns_prefix("xml") + "id"]
                if token.tag == ns_prefix("tei") + "w":
                    tokens[tid] = {
                        "msd": token.attrib["ana"].split(":")[1],
                        "lemma": token.attrib["lemma"],
                        "word": token.text
                    }
                elif token.tag == ns_prefix("tei") + "pc":
                    tokens[tid] = {
                        "word": token.text
                    }
                else:
                    log.warning("Unrecognized sentence element: " + token.tag)
                    exit(1)

            if s_id in self.entries:
                log.warning("duplicated sentence: " + s_id)
                stats["duplicated_sid"] += 1
                continue

            self.entries[s_id] = SsjEntry(
                s_id,
                tokens,
                helper_gen_deep_links(functor_links)
            )

        t_end = time()
        log.info("Time: {}s.".format(t_end - t_start))
        log.info(str(stats))


if __name__ == "__main__":
    # testing
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler(sys.stdout)
    log.addHandler(ch)

    # Load
    fpath = "../../data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
    ssj = SsjDict()
    ssj.read_xml_v2(fpath)
    with open("ssj_test.pickle", "wb") as file:
        pickle.dump(ssj, file)
47
dip_src/valency/sskj_scraper.py
Normal file
@@ -0,0 +1,47 @@
# Deprecated!

import requests
from bs4 import BeautifulSoup
from time import time
from valency import k_utils

SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="


class SskjScraper:
    def __init__(self):
        self.base_url = SSKJ_BASE

    def scrape(self, word):
        # returns a unique set of words
        soup = BeautifulSoup(
            requests.get(self.base_url + word).content,
            "html.parser"
        )
        # Check for failure.
        h2 = soup.find_all("h2")
        if len(h2) >= 2:
            # <h2>Zadetkov ni bilo: ...</h2>
            return []
        li_elements = soup.find_all('li', class_="nounderline")
        if len(li_elements) == 0:
            return []
        li = li_elements[0]
        # It was horrible...
        # <li> ... <li> ... <li> ...</li></li></li>
        # Parse the sequence until you find a second <li>.
        txts = []
        for el in li.find_all():
            if el.name == "li":
                break
            txts.append(el.get_text())
        print("sskj scraped {}.".format(word))
        return k_utils.tokenize_multiple(txts)


if __name__ == "__main__":
    sskjScr = SskjScraper()

    word = "tek"
    tp = sskjScr.scrape("čaj")
    print(tp)
40
dip_src/valency/testing_lesk.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from valency.val_struct import *
from valency.ssj_struct import *
from valency import k_utils
from valency.lesk import Lesk

vallex_path = "../../data/vallex.xml"
vallex = k_utils.pickle_load(vallex_path)
if vallex is None:
    ssj_path = "../../data/anno_final.conll.xml"
    # ssj_path = "../../data/ssj500kv1_1-SRL_500_stavkov_2017-04-11.xml"
    ssj = k_utils.pickle_load(ssj_path)
    if ssj is None:
        ssj = SsjDict()
        ssj.read_xml(ssj_path)
        k_utils.pickle_dump(ssj, ssj_path)

    vallex = Vallex()
    vallex.read_ssj(ssj)
    k_utils.pickle_dump(vallex, vallex_path)

vallex.process_after_read(False, False)

random_frame = None
lesk = Lesk()
successes = 0
for k, e in vallex.entries.items():
    for rf in e.raw_frames:
        random_frame = rf
        break
    print(random_frame.to_string())
    print(vallex.get_token(random_frame.tids[0]))
    print(vallex.get_sentence(random_frame.tids[0]))
    tid = random_frame.tids[0]
    token = vallex.get_token(tid)
    context = vallex.get_context(tid)
    sense = lesk.lesk(token, context)
    if sense is not None:
        successes += 1
    if successes >= 10:
        break
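
# For reference: the idea behind lesk.lesk(token, context) is gloss/context
# overlap. A generic simplified-Lesk sketch (illustrative only; the project's
# Lesk class in valency/lesk.py may differ in inputs and scoring):
def simplified_lesk(senses, context_lemmas):
    # senses: [(sense_id, gloss_tokens)]; context_lemmas: [str].
    # Pick the sense whose gloss shares the most tokens with the context.
    context = set(context_lemmas)
    best_id, best_overlap = None, 0
    for sense_id, gloss_tokens in senses:
        overlap = len(context & set(gloss_tokens))
        if overlap > best_overlap:
            best_id, best_overlap = sense_id, overlap
    return best_id  # None when no gloss overlaps the context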
706
dip_src/valency/val_struct.py
Normal file
@@ -0,0 +1,706 @@
from time import time
from copy import deepcopy as DC
from valency.frame import Frame
from valency.reduce_functions import *
from valency.lesk import *
from valency import mongo_tools
from valency import k_utils  # used by gen_sorted_words() and test_context1()
from valency import k_kmeans  # process_kmeans() calls k_kmeans.k_means()
import random
import logging
from valency.evaluation import Evaluation
from valency.dictionary_interface import SloWnet, Sskj2
from valency.leskFour import LeskFour
from valency.k_kmeans import KmeansClass
from valency.ssj_struct import SsjDict, SsjEntry
from valency.seqparser.seqparser import Seqparser
import pickle
import sys
import hashlib

log = logging.getLogger(__name__)


def split_id(myid):
    tmp = myid.split(".")
    sid = ".".join(tmp[:-1])
    tid = tmp[-1]
    return (sid, tid)


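# Example (ids illustrative): the sentence id keeps its internal dots; only
# the trailing token segment is split off (the "S123.t1" format used by
# get_token() below).
#
#   split_id("S123.t1") == ("S123", "t1")
#   split_id("ssj1.2.3.t14") == ("ssj1.2.3", "t14")

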
class ValEntry():
    def __init__(self, hw, frame):
        self.hw = hw
        self.raw_frames = [frame]
        self.has_senses = False


class Vallex():
    # Main class
    def __init__(self):
        # database
        self.db, err_msg = mongo_tools.basic_connection("127.0.0.1", 26633)
        if self.db is None:
            log.error((
                "Database not connected: "
                "{}".format(err_msg)
            ))
            exit(1)
        mongo_tools.check_collections(self.db, [
            "v2_users", "v2_senses", "v2_sense_map", "v2_user_tokens"
        ])
        mongo_tools.prepare_user_tokens(self.db)

        # These 3 might be obsolete for the web app (used for ML).
        self.db_senses_map = self.db.senses_map3
        self.slownet_interface = SloWnet(self)
        self.sskj_interface = Sskj2(self)

        # self.tokens["s0"]["t0"] = {word, lemma, msd, ...}
        self.tokens = {}

        # key = verb / adjective headword
        self.entries = {}

        # For alphabetical indexing in web app.
        self.sorted_words = {}
        # sorted_words = { first_letter: [(hw1, n_sent1), (hw2, n_sent2), ...] }
        self.functors_index = {}
        self.has_se = []  # list of verbs with "se" ("bati se")

        # Used for ML (deprecated).
        self.lesk = Lesk()  # used by the deprecated simple-Lesk helpers below
        self.leskFour = LeskFour(self)
        self.kmeans = KmeansClass(self)
        self.evaluation = Evaluation(self)
        self.test_samples = []

        # run self.process_after_read() after initiating Vallex

    def read_ssj(self, ssj):
        # ssj: object generated with ssj_struct.py.
        BANNED_HW = ["biti"]
        stats = {
            "P_count": 0,
            "skipped": 0,
        }
        log.info("Vallex.read_ssj({}).".format(ssj))
        t_start = time()
        for ssj_id, entry in ssj.entries.items():
            # Read tokens.
            skip_entry = False
            tmp_tokens = {}
            for ssj_tid, token in entry.s.items():
                sid, tid = split_id(ssj_tid)

                # safety checks
                if tid != "t" and not tid[1:].isdigit():
                    log.warning(
                        "dropping SID={} - corrupted keys".format(ssj_tid))
                    skip_entry = True
                    break
                if tid in tmp_tokens:
                    log.error(
                        "Vallex.read_ssj(): Duplicated ssj_tid: " + ssj_tid)
                    exit(1)

                tmp_tokens[tid] = DC(token)
            if skip_entry:
                continue  # skip corrupted keys
            if sid in self.tokens:
                log.error("sid duplicate: " + sid)
                exit(1)
            self.tokens[sid] = DC(tmp_tokens)

            # Read frame data (each deep link gets its own raw frame).
            link_map = {}
            # hw_id: { hw_lemma: lemma, deep: [{functor: fnct, to: to}] }
            for deep_link in entry.deep_links:
                hw_id = deep_link["from"]
                hw_token = self.get_token(hw_id)
                hw_lemma = hw_token["lemma"]
                hw_bv = hw_token["msd"][0]
                if (hw_bv != "G" and hw_bv != "P"):
                    stats["skipped"] += 1
                    log.info("Skipping {}: not a verb or adjective.".format(
                        hw_lemma))
                    continue
                if hw_bv == "P":
                    hw_lemma = hw_lemma + "_"
                    stats["P_count"] += 1
                if hw_id in link_map:
                    link_map[hw_id]["deep"].append(deep_link)
                else:
                    link_map[hw_id] = {
                        "hw_lemma": hw_lemma,
                        "deep": [deep_link]
                    }
            for hw_id, data in link_map.items():
                hw_lemma = data["hw_lemma"]
                raw_frame = Frame(
                    hw=hw_lemma,
                    tids=[hw_id],
                    deep_links=data["deep"],
                    slots=None,
                )
                if hw_lemma not in self.entries:
                    self.entries[hw_lemma] = ValEntry(hw_lemma, raw_frame)
                else:
                    self.entries[hw_lemma].raw_frames.append(raw_frame)

        # cleanup banned headwords
        for hw in BANNED_HW:
            if hw in self.entries:
                del self.entries[hw]

        t_end = time()
        log.info("Finished read_ssj() in {:.2f}s.".format(
            t_end - t_start
        ))
        log.info("Vallex has a total of {} key entries.".format(
            len(self.entries.keys())
        ))
        log.info("Number of adjectives: {}".format(stats["P_count"]))
        log.info("Number of skipped (not a verb or adjective): {}".format(
            stats["skipped"]))
        # Frames per hw:
        """
        for k, e in self.entries.items():
            print(k + "," + str(len(e.raw_frames)))
        """

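    # Illustrative probe (not part of the class): after read_ssj(), each
    # headword maps to a ValEntry, and every raw frame points back at its
    # sentence through tids[0]. Assuming `vallex` is a populated instance:
    #
    #   entry = vallex.entries.get("dajati")
    #   if entry is not None:
    #       frame = entry.raw_frames[0]
    #       print(entry.hw, len(entry.raw_frames))
    #       print(vallex.get_sentence(frame.tids[0]))
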
    def get_token(self, myid):
        # myid = "S123.t1"
        sid, tid = split_id(myid)
        return self.tokens[sid][tid]

    def get_sentence(self, myid):
        sid, tid = split_id(myid)
        tmp = []
        sentence = ""
        for k, token in self.tokens[sid].items():
            if (k != "t") and (token["word"] is not None):
                tmp.append((k, token))
        for token in sorted(tmp, key=lambda x: int(x[0][1:])):
            sentence += (token[1]["word"] + " ")
        return sentence

    def get_tokenized_sentence(self, myid):
        sid, tid = split_id(myid)
        tmp = []
        sentence = []
        for k, token in self.tokens[sid].items():
            if k != "t":
                tmp.append((k, token))
        for token in sorted(tmp, key=lambda x: int(x[0][1:])):
            sentence.append((".".join([sid, token[0]]), token[1]))
        # returns [(ssj_id, {word: _, lemma: _, msd: _}), ...]
        return sentence

    def process_after_read(
        self, sskj_senses_pickle_path, se_list_pickle_path,
        reload_sskj_senses
    ):
        tstart = time()

        # web app: index by hw
        self.sorted_words = {}
        self.gen_sorted_words()

        # web app: index by functor
        self.functors_index = {}
        self.gen_functors_index()

        # fill db.v2_senses
        self.has_se = []
        self.read_seqparser_pickles(
            sskj_senses_pickle_path, se_list_pickle_path, reload_sskj_senses)

        log.debug(
            "vallex.process_after_read(): {:.2f}s".format(time() - tstart))

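    # Typical call (paths hypothetical; the real pickles come from
    # valency/seqparser/seqparser.py -- see ERR_MSG in
    # read_seqparser_pickles() below):
    #
    #   vallex.process_after_read(
    #       sskj_senses_pickle_path="../../data/sskj_senses.pickle",
    #       se_list_pickle_path="../../data/se_list.pickle",
    #       reload_sskj_senses=False,
    #   )
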
    def gen_sorted_words(self):
        res = {}
        for hw, e in self.entries.items():
            letter = hw[0].lower()
            n_sent = len(e.raw_frames)
            if letter not in res:
                res[letter] = []
            res[letter].append((hw, n_sent))
        # sort and add to vallex object
        self.sorted_words = {}
        for letter, lst in res.items():
            self.sorted_words[letter] = k_utils.slo_bucket_sort(
                lst, key=lambda x: x[0])

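    # Resulting shape (values illustrative): each first letter maps to
    # alphabetically sorted (headword, sentence_count) pairs, e.g.
    #
    #   self.sorted_words == {
    #       "d": [("dajati", 12), ("delati", 30), ...],
    #       "p": [("pisati", 8), ...],
    #   }
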
    def gen_functors_index(self):
        for hw, e in self.entries.items():
            for frame in e.raw_frames:
                for slot in frame.slots:
                    if slot.functor not in self.functors_index:
                        self.functors_index[slot.functor] = []
                    self.functors_index[slot.functor].append(frame)

    def read_seqparser_pickles(
        self, sskj_senses_pickle_path, se_list_pickle_path,
        reload_sskj_senses
    ):
        log.info("read_seqparser_pickles()")
        log.info((
            "Reading list of has_se verbs from {}. "
            "Sskj senses into db.v2_senses from {}."
        ).format(se_list_pickle_path, sskj_senses_pickle_path))
        AUTHOR_SSKJ = "SSKJ"
        ERR_MSG = (
            "Need to generate .pickle files first. "
            "Use: "
            "$ python3 /script/valency/seqparser/seqparser.py "
            "Input is /data/sskj_v2.html."
        )

        # has_se
        with open(se_list_pickle_path, "rb") as f:
            self.has_se = pickle.load(f)
        if self.has_se is None:
            log.error(ERR_MSG)
            exit(1)
        self.has_se = sorted(self.has_se)
        log.info("Loaded self.has_se (len: {}) from {}.".format(
            len(self.has_se), se_list_pickle_path))

        # sskj senses
        if reload_sskj_senses:
            log.info("Reloading sskj_senses.")
            reply = self.db.v2_senses.remove({"author": AUTHOR_SSKJ})
            log.info(reply)

        query = list(self.db.v2_senses.find({"author": AUTHOR_SSKJ}))
        if len(query) > 0:
            log.info("Sskj senses already in database.")
            return
        tstart = time()
        data = None
        with open(sskj_senses_pickle_path, "rb") as f:
            data = pickle.load(f)
        if data is None:
            log.error(ERR_MSG)
            exit(1)
        for k, e in data.items():
            for sense in e["senses"]:
                db_entry = {
                    "hw": k,
                    "author": AUTHOR_SSKJ,
                    "desc": sense["sense_desc"],
                    # unique id for each sense
                    "sense_id": "{}-{}-{}-{}-{}".format(
                        AUTHOR_SSKJ,
                        sense["homonym_id"],
                        sense["sense_id"],
                        sense["sense_type"],
                        hashlib.sha256(
                            sense["sense_desc"].encode("utf-8")
                        ).hexdigest()[:5]
                    )
                }
                self.db.v2_senses.insert(db_entry)
                # print(db_entry)
        log.info("db.v2_senses prepared in {:.2f}s".format(time() - tstart))

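    # The composite sense_id above has the shape
    #   "SSKJ-<homonym_id>-<sense_id>-<sense_type>-<sha256(desc)[:5]>",
    # so senses that share numbering but differ in description still get
    # distinct ids via the 5-hex-digit description hash.
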
    # Functions below can be used interactively with flask_api.
    def test_dev(self):
        # self.prepare_sskj_senses()
        hw = "dajati"
        senses = self.sskj_interface.sense_glosses(hw)
        return str(senses)

    def calc_senses(self):
        # self.calc_all_senses(self.leskFour.lesk_nltk)
        # self.calc_all_senses(self.leskFour.lesk_sl)
        # self.calc_all_senses(self.leskFour.lesk_al)  # circa 8 h!
        # self.calc_all_senses(self.leskFour.lesk_ram)
        self.calc_all_senses_kmeans(self.kmeans.bisection_kmeans)
        self.calc_all_senses_kmeans(self.kmeans.normal_kmeans)
        return "edit val_struct.py: calc_senses()"

    # deprecated functions (used for machine learning experiments)

    def prepare_sskj_senses(self):
        # obsolete, use read_seqparser_pickles() instead
        log.info("prepare_sskj_senses() (db.v2_senses)")
        query = list(self.db.v2_senses.find({"author": "SSKJ2"}))
        if len(query) > 0:
            log.info("Sskj senses already in database.")
            return
        tstart = time()
        log.info("Iterating over {} hw entries:".format(
            len(self.entries.keys())))
        for hw, e in self.entries.items():
            senses = self.sskj_interface.sense_glosses(hw)
            if len(senses) == 0:
                continue
            for sense in senses:
                # create a sense from each description
                for i, de in enumerate(sense["def"]):
                    sense_def = de
                    sense_def = sense_def[0].upper() + sense_def[1:]
                    if sense_def[-1] == ":" or sense_def[-1] == ";":
                        sense_def = sense_def[:-1] + "."
                    data = {
                        "hw": hw,
                        "author": "SSKJ2",
                        "desc": sense_def,
                        "sskj_id": sense["sskj_sense_id"],
                        "sskj_desc_id": i
                    }
                    self.db.v2_senses.insert(data)
        log.info("sskj_senses prepared in {:.2f}s".format(time() - tstart))

    def gen_sskj_sl(self):
        # Takes about an hour.
        tstart = time()
        log.info("Generating new sskj_simple_lesk with Simple Lesk.")
        for k, e in self.entries.items():
            self.gen_sskj_sl_one(e.hw)
        log.debug("gen_sskj_sl in {:.2f}s".format(time() - tstart))

    def gen_sskj_sl_one(self, hw, update_db=True):
        entry = None
        ttstart = time()
        e = self.entries.get(hw)
        if e is None:
            return
        for frame in e.raw_frames:
            tid = frame.tids[0]
            sentence = self.get_sentence(tid)
            res = self.lesk.simple_lesk_sskj(sentence, hw)
            if res is None:
                log.debug("headword {} not in sskj".format(hw))
                continue
            key = {"ssj_id": tid}
            entry = {
                "headword": hw,
                "ssj_id": tid,  # unique identifier
                "sense_id": res[1],
                # "sense_desc": k_utils.dict_safe_key(res[2], "ns0:def"),
                "sense_desc": res[2]["def"]
            }
            # log.debug(str(res[2]))
            # log.debug(entry["sense_id"])
            # log.debug(entry["sense_desc"])
            if update_db:
                self.db.sskj_simple_lesk.update(key, entry, upsert=True)
        log.debug("[*] sskj_ids for {} in {:.2f}s".format(
            hw, time() - ttstart))

    def get_context(self, myid, radius=None, min_lemma_size=None):
        radius = radius or 5
        min_lemma_size = min_lemma_size or 4
        # Returns the token plus up to `radius` content neighbors on each side.
        sentence = self.get_sentence(myid)
        sentlen = len(sentence.split(" "))
        sid, tid = split_id(myid)
        idx = int(tid[1:])
        tokens_after = []
        i = idx
        while i < sentlen - 1 and len(tokens_after) < radius:
            i += 1
            token = self.get_token(sid + ".t" + str(i))
            if (
                token is not None and "lemma" in token and
                len(token["lemma"]) >= min_lemma_size and
                token["lemma"] != "biti"
            ):
                tokens_after.append(token)
        tokens_before = []
        i = idx
        while i > 1 and len(tokens_before) < radius:
            i -= 1
            token = self.get_token(sid + ".t" + str(i))
            if (
                token is not None and "lemma" in token and
                len(token["lemma"]) >= min_lemma_size and
                token["lemma"] != "biti"
            ):
                tokens_before.append(token)
        tokens = tokens_before + [self.get_token(myid)] + tokens_after
        # find position of the original token
        mid_idx = len(tokens_before)
        return (mid_idx, tokens)

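    # Return shape: tokens[mid_idx] is always the headword token itself, with
    # up to `radius` filtered content tokens on each side (id illustrative):
    #
    #   mid_idx, tokens = vallex.get_context("S123.t4", radius=3)
    #   assert tokens[mid_idx] == vallex.get_token("S123.t4")
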
    def get_sense_ids(self, collname, hw, sense_group=None):
        query = {"headword": hw}
        if sense_group is not None:
            query["sense_group"] = sense_group
        result = list(self.db[collname].find(query))
        sense_ids = {}
        for r in result:
            sense_ids[r["ssj_id"]] = r["sense_id"]
        return sense_ids

    def t_get_context(self):
        ii = 10
        for k, e in self.entries.items():
            for frame in e.raw_frames:
                if random.randint(0, 100) > 20:
                    continue
                ii -= 1
                if ii <= 0:
                    return

                mytid = frame.tids[0]
                print()
                print(mytid)
                print(self.get_token(mytid))
                sent = self.get_context(mytid, radius=3, min_lemma_size=4)
                print("mid: {}".format(sent[0]))
                # j, not ii: reusing ii here would clobber the sample counter
                for j in range(len(sent[1])):
                    print("{} -> {}".format(
                        j, sent[1][j]))

    def t_simple_lesk_sskj(self):
        ii = 10
        for k, e in self.entries.items():
            if random.randint(0, 100) > 20:
                continue
            for frame in e.raw_frames:
                if random.randint(0, 100) > 20:
                    continue
                if ii == 0:
                    return
                ii -= 1

                print("\nTest frame: {}.".format(frame.tids))
                hw_token = self.get_token(frame.tids[0])
                print(hw_token)
                context_sentence = self.get_sentence(frame.tids[0])
                print(context_sentence)
                self.lesk.simple_lesk_sskj(
                    context_sentence=context_sentence,
                    word_lemma=hw_token["lemma"]
                )

    def process_kmeans(self):
        # Convert words to lemmas, cluster using k-means.
        # The number of clusters is taken from sskj.
        tstart = time()
        log.info("Processing senses using kmeans.")
        for k, e in self.entries.items():
            # Entry start
            ttstart = time()
            lemma = e.hw
            tokenized_sentences = []
            for frame in e.raw_frames:
                tid = frame.tids[0]
                tokenized_sentences.append(self.get_tokenized_sentence(tid))
            lemmatized_sentences = []
            for sent in tokenized_sentences:
                lemmatized = ""
                for token in sent:
                    if "lemma" in token[1]:
                        lemmatized += (token[1]["lemma"] + " ")
                lemmatized_sentences.append(lemmatized)
            lls = len(lemmatized_sentences)
            # We got the sentences.
            sskj_entry = self.db.sskj.find_one(
                {"ns0:entry.ns0:form.ns0:orth": lemma})
            if sskj_entry is None:
                log.debug("headword {} has no <sense> in sskj".format(lemma))
                continue
            n_clusters = 1
            if "ns0:sense" in sskj_entry["ns0:entry"]:
                # Guess the number of senses based on sskj senses.
                n_clusters = len(sskj_entry["ns0:entry"]["ns0:sense"])
            if lls >= n_clusters and n_clusters > 1:
                labels = k_kmeans.k_means(
                    sentences=lemmatized_sentences,
                    n_clusters=n_clusters
                )
                kmeans_ids = [str(x) + "-" + str(lls) for x in labels]
            elif n_clusters == 1:
                kmeans_ids = ["1-1" for x in lemmatized_sentences]
            elif lls < n_clusters:
                # Each sentence gets its own sense.
                kmeans_ids = []
                for i in range(lls):
                    kmeans_ids.append(str(i + 1) + "lt" + str(n_clusters))
            else:
                log.error("Shouldn't be here (val_struct: process_kmeans()).")
                exit(1)

            # Feed sense ids of the whole frame to the database.
            for i in range(len(e.raw_frames)):
                tid = e.raw_frames[i].tids[0]
                key = {"ssj_id": tid}
                entry = {
                    "headword": lemma,
                    "ssj_id": tid,  # unique identifier
                    "sense_id": kmeans_ids[i],
                }
                self.db.kmeans.update(key, entry, upsert=True)

            log.debug("[*] kmeans_ids for {} in {:.2f}s".format(
                lemma, time() - ttstart))
            # Entry end
        log.debug("process_kmeans in {:.2f}s".format(time() - tstart))

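    # k_kmeans.k_means() is not shown in this diff. A generic equivalent of
    # the contract used above (lemmatized sentences in, one integer cluster
    # label per sentence out) could be sketched with scikit-learn, assuming
    # that dependency is acceptable:
    #
    #   from sklearn.cluster import KMeans
    #   from sklearn.feature_extraction.text import TfidfVectorizer
    #
    #   def k_means(sentences, n_clusters):
    #       vectors = TfidfVectorizer().fit_transform(sentences)
    #       model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    #       return model.fit_predict(vectors)
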
    def get_context1(
        self, mytid, collname, radius=None, min_token_len=3, get_glosses=None
    ):
        # returns {
        #   "hw": headword lemma and its glosses
        #   "context": a list of lemmas and their glosses around the hw that
        #       have entries in the collname dictionary (if get_glosses=True)
        # }
        # tstart = time()
        if get_glosses is None:
            get_glosses = False
        if radius is None:
            radius = 10000
        if collname == "slownet":
            dictionary_interface = self.slownet_interface
        elif collname == "sskj":
            dictionary_interface = self.sskj_interface
        else:
            log.error("argument error: get_context1(collname=<slownet/sskj>)")
            return []

        sentence = self.get_tokenized_sentence(mytid)
        # sentence = [(ssj_id, {word: _, lemma: _, msd: _}), ...]
        hw_idx = -1
        hw_lemma = None
        for i, e in enumerate(sentence):
            if e[0] == mytid:
                hw_idx = i
                hw_lemma = e[1]["lemma"]
                break

        hw_glosses = dictionary_interface.sense_glosses(hw_lemma)
        if len(hw_glosses) == 0:
            log.info("hw: {} has 0 glosses".format(hw_lemma))
            return {
                "hw": None,
                "err": "headword {} has no glosses in {}".format(
                    hw_lemma, collname)
            }

        tokens_before = []
        ii = hw_idx - 1
        while ii >= 0 and len(tokens_before) < radius:
            lemma = sentence[ii][1].get("lemma")
            if (
                lemma is not None and
                len(lemma) >= min_token_len
            ):
                if get_glosses:
                    glosses = dictionary_interface.sense_glosses(lemma)
                else:
                    glosses = [{"def": "--none--", "gloss": "--none--"}]
                if len(glosses) > 0:
                    tokens_before.insert(0, {
                        "lemma": lemma,
                        "glosses": glosses
                    })
            ii -= 1

        tokens_after = []
        ii = hw_idx + 1
        while ii < len(sentence) and len(tokens_after) < radius:
            lemma = sentence[ii][1].get("lemma")
            if (
                lemma is not None and
                len(lemma) >= min_token_len
            ):
                if get_glosses:
                    glosses = dictionary_interface.sense_glosses(lemma)
                else:
                    glosses = [{"def": "--none--", "gloss": "--none--"}]
                if len(glosses) > 0:
                    tokens_after.append({
                        "lemma": lemma,
                        "glosses": glosses
                    })
            ii += 1

        # log.debug("context1({}): {:.2f}".format(mytid, time() - tstart))
        return {
            "hw": {"lemma": hw_lemma, "glosses": hw_glosses},
            "context": tokens_before + tokens_after
        }

    def test_context1(self, mytid, hw=""):
        res = ""
        context = self.get_context1(
            mytid, collname="slownet", radius=2, get_glosses=True)
        if context["hw"] is None:
            return context["err"] + "<br><br>"
        res = "hw: {}<br>sentence: {}<br>".format(
            hw, self.get_sentence(mytid))
        tfigf_input = []
        glosses = [context["hw"]] + context["context"]
        for e in glosses:
            res += "--->lemma: {} ({} senses)<br>".format(
                e["lemma"], len(e["glosses"]))
            for g in e["glosses"]:
                res += "{}<br>".format(str(g))
                tfigf_input.append(" ".join(k_utils.tokenize_multiple(
                    g["gloss"],
                    min_token_len=3,
                    stem=k_utils.stem_eng
                )))
        res += "<br><br>"
        return res

    def calc_all_senses(self, lesk_algorithm):
        allcount = 0
        count = 0
        for k, e in self.entries.items():
            allcount += len(e.raw_frames)
        for k, e in self.entries.items():
            if k == "biti":  # skip this huge bag of words
                continue
            for frame in e.raw_frames:
                count += 1
                if count % 10 == 0:
                    log.info("calc_all_senses: ({}/{})".format(
                        count, allcount))
                lesk_algorithm(frame.tids[0])
        return None

    def calc_all_senses_kmeans(self, kmeans_algorithm):
        tstart = time()
        allcount = len(self.entries)
        count = 0
        avg_times = []
        for key in self.entries:
            count += 1
            if key == "biti":
                continue
            # cluster frames of each entry
            log.info("calc_all_senses_kmeans: ({}/{}) [{}]".format(
                count, allcount, key))
            kmeans_algorithm(key)
            """
            try:
                kmeans_algorithm(key)
            except ValueError:
                continue
            """
            avg_times.append(1.0 * (time() - tstart) / count)
            log.info("avg_time: {:.2f}s".format(avg_times[-1]))
        log.info("calc_all_senses_kmeans in {:.2f}s.".format(time() - tstart))
        return None


if __name__ == "__main__":
|
||||
log.setLevel(logging.DEBUG)
|
||||
ch = logging.StreamHandler(sys.stdout)
|
||||
log.addHandler(ch)
|
||||
# run ssj_struct to create a ssj_test.pickle file
|
||||
with open("ssj_test.pickle", "rb") as file:
|
||||
ssj = pickle.load(file)
|
||||
|
||||
vallex = Vallex()
|
||||
vallex.read_ssj(ssj)
|
||||
|
||||
vallex.sorted_words = {}
|
||||
vallex.gen_sorted_words()
|
||||
|
||||
vallex.functors_index = {}
|
||||
vallex.gen_functors_index()
|
||||