from time import time
from copy import deepcopy as DC
from valency.frame import Frame
from valency.reduce_functions import *
from valency.lesk import *
from valency import mongo_tools
import random
import logging
from valency.evaluation import Evaluation
from valency.dictionary_interface import SloWnet, Sskj2
from valency.leskFour import LeskFour
from valency.k_kmeans import KmeansClass
from valency.ssj_struct import SsjDict, SsjEntry
from valency.seqparser.seqparser import Seqparser
# k_utils and k_kmeans are referenced below (slo_bucket_sort, k_means, ...);
# assumed to live in the valency package alongside the imports above.
from valency import k_utils
from valency import k_kmeans
import pickle
import sys
import hashlib

log = logging.getLogger(__name__)


def split_id(myid):
    tmp = myid.split(".")
    sid = ".".join(tmp[:-1])
    tid = tmp[-1]
    return (sid, tid)
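
# Illustrative doctest-style example of split_id(); the id format follows the
# "# id = S123.t1" comment in get_token() below:
# >>> split_id("S123.t1")
# ('S123', 't1')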


class ValEntry():
    def __init__(self, hw, frame):
        self.hw = hw
        self.raw_frames = [frame]
        self.has_senses = False


class Vallex():
    # Main class
    def __init__(self):
        # database
        self.db, err_msg = mongo_tools.basic_connection("127.0.0.1", 26633)
        if self.db is None:
            log.error(
                "Database not connected: {}".format(err_msg)
            )
            exit(1)
        mongo_tools.check_collections(self.db, [
            "v2_users", "v2_senses", "v2_sense_map", "v2_user_tokens"
        ])
        mongo_tools.prepare_user_tokens(self.db)

        # these 3 might be obsolete for the web app (used for ML)
        self.db_senses_map = self.db.senses_map3
        self.slownet_interface = SloWnet(self)
        self.sskj_interface = Sskj2(self)

        # self.tokens["s0"]["t0"] = {word, lemma, msd, ...}
        self.tokens = {}

        # key = verb / adjective headword
        self.entries = {}

        # For alphabetical indexing in the web app.
        self.sorted_words = {}
        # words = { first_letter: [hw1, hw2, ... sorted] }
        self.functors_index = {}
        self.has_se = []  # list of verbs with "se" ("bati se")

        # Used for ML (deprecated).
        self.leskFour = LeskFour(self)
        self.kmeans = KmeansClass(self)
        self.evaluation = Evaluation(self)
        self.test_samples = []

        # Run self.process_after_read() after instantiating Vallex.
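
        # Illustrative shapes only (the values are made up, not from the corpus):
        #   self.tokens  ~ {"S123": {"t1": {"word": "daje", "lemma": "dajati",
        #                                   "msd": "G..."}, ...}, ...}
        #   self.entries ~ {"dajati": ValEntry("dajati", <Frame>), ...}
        # Both are filled in by read_ssj() below.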

    def read_ssj(self, ssj):
        # ssj: object generated with ssj_struct.py.
        BANNED_HW = ["biti"]
        stats = {
            "P_count": 0,
            "skipped": 0,
        }
        log.info("Vallex.read_ssj({}).".format(
            ssj
        ))
        t_start = time()
        for ssj_id, entry in ssj.entries.items():
            # Read tokens.
            skip_entry = False
            tmp_tokens = {}
            for ssj_tid, token in entry.s.items():
                sid, tid = split_id(ssj_tid)

                # safety checks
                if tid != "t" and not tid[1:].isdigit():
                    log.warning("dropping SID={} - corrupted keys".format(sid))
                    skip_entry = True
                    break
                if tid in tmp_tokens:
                    log.error(
                        "Vallex.read_ssj(): Duplicated ssj_tid:" + ssj_tid)
                    exit(1)

                tmp_tokens[tid] = DC(token)
            if skip_entry:
                continue  # skip corrupted keys
            if sid in self.tokens:
                log.error("sid duplicate: " + sid)
                exit(1)
            self.tokens[sid] = DC(tmp_tokens)

            # Read frame data (each deep link gets its own raw frame).
            link_map = {}
            # hw_id: {hw_lemma: lemma, deep: [{functor: fnct, to: to}]}
            for deep_link in entry.deep_links:
                hw_id = deep_link["from"]
                hw_token = self.get_token(hw_id)
                hw_lemma = hw_token["lemma"]
                hw_bv = hw_token["msd"][0]
                if (hw_bv != "G" and hw_bv != "P"):
                    stats["skipped"] += 1
                    log.info("Skipping {}: not a verb or adjective.".format(
                        hw_lemma))
                    continue
                if hw_bv == "P":
                    hw_lemma = hw_lemma + "_"
                    stats["P_count"] += 1
                if hw_id in link_map:
                    link_map[hw_id]["deep"].append(deep_link)
                else:
                    link_map[hw_id] = {
                        "hw_lemma": hw_lemma,
                        "deep": [deep_link]
                    }
            for hw_id, data in link_map.items():
                hw_lemma = data["hw_lemma"]
                raw_frame = Frame(
                    hw=hw_lemma,
                    tids=[hw_id],
                    deep_links=data["deep"],
                    slots=None,
                )
                if hw_lemma not in self.entries:
                    self.entries[hw_lemma] = ValEntry(hw_lemma, raw_frame)
                else:
                    self.entries[hw_lemma].raw_frames.append(raw_frame)

        # cleanup banned headwords
        for hw in BANNED_HW:
            if hw in self.entries:
                del(self.entries[hw])

        t_end = time()
        log.info("Finished read_ssj() in {:.2f}s.".format(
            t_end - t_start
        ))
        log.info("Vallex has a total of {} key entries.".format(
            len(self.entries.keys())
        ))
        log.info("Number of adjectives: {}".format(stats["P_count"]))
        log.info("Number of skipped (not a verb or adjective): {}".format(
            stats["skipped"]))
        # Frames per hw:
        """
        for k, e in self.entries.items():
            print(k + "," + str(len(e.raw_frames)))
        """

    def get_token(self, myid):
        # id = S123.t1
        sid, tid = split_id(myid)
        return self.tokens[sid][tid]

    def get_sentence(self, myid):
        sid, tid = split_id(myid)
        tmp = []
        sentence = ""
        for k, token in self.tokens[sid].items():
            if (k != "t") and (token["word"] is not None):
                tmp.append((k, token))
        for token in sorted(tmp, key=lambda x: int(x[0][1:])):
            sentence += (token[1]["word"] + " ")
        return sentence

    def get_tokenized_sentence(self, myid):
        sid, tid = split_id(myid)
        tmp = []
        sentence = []
        for k, token in self.tokens[sid].items():
            if k != "t":
                tmp.append((k, token))
        for token in sorted(tmp, key=lambda x: int(x[0][1:])):
            sentence.append((".".join([sid, token[0]]), token[1]))
        # return [(ssj_id, {word: _, lemma: _, msd: _}), ...]
        return sentence
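
    # Illustrative get_tokenized_sentence() output (ids follow the "S123.t1"
    # convention noted above; word/lemma/msd values are made up):
    #   [("S123.t1", {"word": "Dajal", "lemma": "dajati", "msd": "G..."}),
    #    ("S123.t2", {"word": "je", "lemma": "biti", "msd": "G..."}), ...]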

    def process_after_read(
        self, sskj_senses_pickle_path, se_list_pickle_path,
        reload_sskj_senses
    ):
        tstart = time()

        # web app: index by hw
        self.sorted_words = {}
        self.gen_sorted_words()

        # web app: index by functor
        self.functors_index = {}
        self.gen_functors_index()

        # fill db.v2_senses
        self.has_se = []
        self.read_seqparser_pickles(
            sskj_senses_pickle_path, se_list_pickle_path, reload_sskj_senses)

        log.debug(
            "vallex.process_after_read(): {:.2f}s".format(time() - tstart))

    def gen_sorted_words(self):
        res = {}
        for hw, e in self.entries.items():
            letter = hw[0].lower()
            n_sent = len(e.raw_frames)
            if letter not in res:
                res[letter] = []
            res[letter].append((hw, n_sent))
        # sort and add to vallex object
        self.sorted_words = {}
        for letter, lst in res.items():
            self.sorted_words[letter] = k_utils.slo_bucket_sort(
                lst, key=lambda x: x[0])
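
    # Illustrative shape of self.sorted_words after gen_sorted_words()
    # (headwords and counts are made up; each value is (headword, n_sentences)):
    #   {"d": [("dajati", 31), ("delati", 12)], "p": [("pisati", 8)], ...}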

    def gen_functors_index(self):
        for hw, e in self.entries.items():
            for frame in e.raw_frames:
                for slot in frame.slots:
                    if slot.functor not in self.functors_index:
                        self.functors_index[slot.functor] = []
                    self.functors_index[slot.functor].append(frame)

    def read_seqparser_pickles(
        self, sskj_senses_pickle_path, se_list_pickle_path,
        reload_sskj_senses
    ):
        log.info("read_seqparser_pickles()")
        log.info((
            "Reading list of has_se verbs from {}. "
            "Sskj senses into db.v2_senses from {}."
        ).format(se_list_pickle_path, sskj_senses_pickle_path))
        AUTHOR_SSKJ = "SSKJ"
        ERR_MSG = (
            "Need to generate .pickle files first. "
            "Use: "
            "$ python3 /script/valency/seqparser/seqparser.py "
            "Input is /data/sskj_v2.html."
        )

        # has_se
        with open(se_list_pickle_path, "rb") as f:
            self.has_se = pickle.load(f)
        if self.has_se is None:
            log.error(ERR_MSG)
            exit(1)
        self.has_se = sorted(self.has_se)
        log.info("Loaded self.has_se (len: {}) from {}.".format(
            len(self.has_se), se_list_pickle_path))

        # sskj senses
        if reload_sskj_senses:
            log.info("Reloading sskj_senses.")
            reply = self.db.v2_senses.remove({"author": AUTHOR_SSKJ})
            log.info(reply)

        query = list(self.db.v2_senses.find({"author": AUTHOR_SSKJ}))
        if len(query) > 0:
            log.info("Sskj senses already in database.")
            return
        tstart = time()
        data = None
        with open(sskj_senses_pickle_path, "rb") as f:
            data = pickle.load(f)
        if data is None:
            log.error(ERR_MSG)
            exit(1)
        for k, e in data.items():
            for sense in e["senses"]:
                db_entry = {
                    "hw": k,
                    "author": AUTHOR_SSKJ,
                    "desc": sense["sense_desc"],
                    # unique id for each sense
                    "sense_id": "{}-{}-{}-{}-{}".format(
                        AUTHOR_SSKJ,
                        sense["homonym_id"],
                        sense["sense_id"],
                        sense["sense_type"],
                        hashlib.sha256(
                            sense["sense_desc"].encode("utf-8")
                        ).hexdigest()[:5]
                    )
                }
                self.db.v2_senses.insert(db_entry)
                # print(db_entry)
        log.info("db.v2_senses prepared in {:.2f}s".format(time() - tstart))
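
    # Illustrative db.v2_senses document produced above (all field values are
    # made up; the sense_id glues together the author, homonym id, sense id,
    # sense type and a short sha256 prefix of the description):
    #   {"hw": "dajati", "author": "SSKJ", "desc": "...",
    #    "sense_id": "SSKJ-1-2-main-a1b2c"}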

    # Functions below can be used interactively with flask_api.
    def test_dev(self):
        # self.prepare_sskj_senses()
        hw = "dajati"
        senses = self.sskj_interface.sense_glosses(hw)
        return str(senses)

    def calc_senses(self):
        # self.calc_all_senses(self.leskFour.lesk_nltk)
        # self.calc_all_senses(self.leskFour.lesk_sl)
        # self.calc_all_senses(self.leskFour.lesk_al)  # cca 8h!
        # self.calc_all_senses(self.leskFour.lesk_ram)
        self.calc_all_senses_kmeans(self.kmeans.bisection_kmeans)
        self.calc_all_senses_kmeans(self.kmeans.normal_kmeans)
        return "edit val_struct.py: calc_senses()"

    # deprecated functions (used for machine learning experiments)

    def prepare_sskj_senses(self):
        # obsolete, use read_seqparser_pickles() instead
        log.info("prepare_sskj_senses() (db.v2_senses)")
        query = list(self.db.v2_senses.find({"author": "SSKJ2"}))
        if len(query) > 0:
            log.info("Sskj senses already in database.")
            return
        tstart = time()
        log.info("Iterating over {} hw entries:".format(
            len(self.entries.keys())))
        for hw, e in self.entries.items():
            senses = self.sskj_interface.sense_glosses(hw)
            if len(senses) == 0:
                continue
            for sense in senses:
                # create a sense from each description
                for i, de in enumerate(sense["def"]):
                    sense_def = sense["def"][i]
                    sense_def = sense_def[0].upper() + sense_def[1:]
                    if sense_def[-1] == ":" or sense_def[-1] == ";":
                        sense_def = sense_def[:-1] + "."
                    data = {
                        "hw": hw,
                        "author": "SSKJ2",
                        "desc": sense_def,
                        "sskj_id": sense["sskj_sense_id"],
                        "sskj_desc_id": i
                    }
                    self.db.v2_senses.insert(data)
        log.info("sskj_senses prepared in {:.2f}s".format(time() - tstart))

    def gen_sskj_sl(self):
        # Takes about an hour.
        tstart = time()
        log.info("Generating new sskj_simple_lesk with Simple Lesk.")
        for k, e in self.entries.items():
            self.gen_sskj_sl_one(e.hw)
        log.debug("gen_sskj_sl in {:.2f}s".format(time() - tstart))

    def gen_sskj_sl_one(self, hw, update_db=True):
        entry = None
        ttstart = time()
        e = self.entries.get(hw)
        if e is None:
            return
        for frame in e.raw_frames:
            tid = frame.tids[0]
            sentence = self.get_sentence(tid)
            res = self.lesk.simple_lesk_sskj(sentence, hw)
            if res is None:
                log.debug("headword {} not in sskj".format(hw))
                continue
            key = {"ssj_id": tid}
            entry = {
                "headword": hw,
                "ssj_id": tid,  # unique identifier
                "sense_id": res[1],
                # "sense_desc": k_utils.dict_safe_key(res[2], "ns0:def"),
                "sense_desc": res[2]["def"]
            }
            # log.debug(str(res[2]))
            # log.debug(entry["sense_id"])
            # log.debug(entry["sense_desc"])
            if update_db:
                self.db.sskj_simple_lesk.update(key, entry, upsert=True)
        log.debug("[*] sskj_ids for {} in {:.2f}s".format(
            hw, time() - ttstart))

    def get_context(self, myid, radius=None, min_lemma_size=None):
        radius = radius or 5
        min_lemma_size = min_lemma_size or 4
        # Returns the token and up to `radius` neighbors on each side
        # (10 in total with the default radius of 5).
        sentence = self.get_sentence(myid)
        sentlen = len(sentence.split(" "))
        sid, tid = split_id(myid)
        idx = int(tid[1:])
        tokens_after = []
        i = idx
        while i < sentlen - 1 and len(tokens_after) < radius:
            i += 1
            token = self.get_token(sid + ".t" + str(i))
            if (
                token is not None and "lemma" in token and
                len(token["lemma"]) >= min_lemma_size and
                token["lemma"] != "biti"
            ):
                tokens_after.append(token)
        tokens_before = []
        i = idx
        while i > 1 and len(tokens_before) < radius:
            i -= 1
            token = self.get_token(sid + ".t" + str(i))
            if (
                token is not None and "lemma" in token and
                len(token["lemma"]) >= min_lemma_size and
                token["lemma"] != "biti"
            ):
                tokens_before.append(token)
        tokens = tokens_before + [self.get_token(myid)] + tokens_after
        # find the position of the original token
        mid_idx = len(tokens_before)
        return (mid_idx, tokens)
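
    # Illustrative get_context() return value (tokens are made-up dicts with
    # the same {word, lemma, msd} shape as self.tokens entries); mid_idx points
    # at the original token inside the list:
    #   (2, [<tok>, <tok>, <original tok>, <tok>, <tok>])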

    def get_sense_ids(self, collname, hw, sense_group=None):
        query = {"headword": hw}
        if sense_group is not None:
            query["sense_group"] = sense_group
        result = list(self.db[collname].find(query))
        sense_ids = {}
        for r in result:
            sense_ids[r["ssj_id"]] = r["sense_id"]
        return sense_ids

    def t_get_context(self):
        ii = 10
        for k, e in self.entries.items():
            for frame in e.raw_frames:
                if random.randint(0, 100) > 20:
                    continue
                ii -= 1
                if ii <= 0:
                    return

                mytid = frame.tids[0]
                print()
                print(mytid)
                print(self.get_token(mytid))
                sent = self.get_context(mytid, radius=3, min_lemma_size=4)
                print("mid: {}".format(sent[0]))
                for ii in range(len(sent[1])):
                    print("{} -> {}".format(
                        ii, sent[1][ii]))

    def t_simple_lesk_sskj(self):
        ii = 10
        for k, e in self.entries.items():
            if random.randint(0, 100) > 20:
                continue
            for frame in e.raw_frames:
                if random.randint(0, 100) > 20:
                    continue
                if ii == 0:
                    return
                ii -= 1

                print("\nTest frame: {}.".format(frame.tids))
                hw_token = self.get_token(frame.tids[0])
                print(hw_token)
                context_sentence = self.get_sentence(frame.tids[0])
                print(context_sentence)
                self.lesk.simple_lesk_sskj(
                    context_sentence=context_sentence,
                    word_lemma=hw_token["lemma"]
                )

    def process_kmeans(self):
        # Convert words to lemmas, cluster using k-means.
        # The number of clusters is taken from sskj.
        tstart = time()
        log.info("Processing senses using kmeans.")
        for k, e in self.entries.items():
            # Frame start
            ttstart = time()
            lemma = e.hw
            tokenized_sentences = []
            for frame in e.raw_frames:
                tid = frame.tids[0]
                tokenized_sentences.append(self.get_tokenized_sentence(tid))
            lemmatized_sentences = []
            for sent in tokenized_sentences:
                lemmatized = ""
                for token in sent:
                    if "lemma" in token[1]:
                        lemmatized += (token[1]["lemma"] + " ")
                lemmatized_sentences.append(lemmatized)
            lls = len(lemmatized_sentences)
            # We have the sentences; look up the sskj entry.
            sskj_entry = self.db.sskj.find_one(
                {"ns0:entry.ns0:form.ns0:orth": lemma})
            if sskj_entry is None:
                log.debug("headword {} has no <sense> in sskj".format(lemma))
                continue
            n_clusters = 1
            if "ns0:sense" in sskj_entry["ns0:entry"]:
                # Guess the number of senses based on sskj senses.
                n_clusters = len(sskj_entry["ns0:entry"]["ns0:sense"])
            if lls >= n_clusters and n_clusters > 1:
                labels = k_kmeans.k_means(
                    sentences=lemmatized_sentences,
                    n_clusters=n_clusters
                )
                kmeans_ids = [str(x) + "-" + str(lls) for x in labels]
            elif n_clusters == 1:
                kmeans_ids = ["1-1" for x in lemmatized_sentences]
            elif lls < n_clusters:
                # Each sentence gets its own sense.
                kmeans_ids = []
                for i in range(lls):
                    kmeans_ids.append(str(i + 1) + "lt" + str(n_clusters))
            else:
                log.error("Shouldn't be here (val_struct: process_kmeans())")
                exit(1)

            # Feed sense ids of the whole frame to the database.
            for i in range(len(e.raw_frames)):
                tid = e.raw_frames[i].tids[0]
                key = {"ssj_id": tid}
                entry = {
                    "headword": lemma,
                    "ssj_id": tid,  # unique identifier
                    "sense_id": kmeans_ids[i],
                }
                self.db.kmeans.update(key, entry, upsert=True)

            log.debug("[*] kmeans_ids for {} in {:.2f}s".format(
                lemma, time() - ttstart))
            # Frame end
        log.debug("process_kmeans in {:.2f}s".format(time() - tstart))
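
    # Illustrative sense_id values written by process_kmeans() (numbers made up):
    #   "2-17"  -> cluster label 2, out of 17 sentences for this headword
    #   "1-1"   -> only one sskj sense, so all frames share one cluster
    #   "3lt5"  -> fewer sentences than clusters; sentence 3 got its own sense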

    def get_context1(
        self, mytid, collname, radius=None, min_token_len=3, get_glosses=None
    ):
        # returns {
        #     "hw": headword lemma and its glosses,
        #     "context": a list of lemmas (and their glosses, if
        #         get_glosses=True) around the hw that have entries in the
        #         collname dictionary
        # }
        # tstart = time()
        if get_glosses is None:
            get_glosses = False
        if radius is None:
            radius = 10000
        if collname == "slownet":
            dictionary_interface = self.slownet_interface
        elif collname == "sskj":
            dictionary_interface = self.sskj_interface
        else:
            log.error("argument error: get_context1(collname=<slownet/sskj>)")
            return []

        sentence = self.get_tokenized_sentence(mytid)
        # sentence = [(ssj_id, {word: _, lemma: _, msd: _}), ...]
        hw_idx = -1
        hw_lemma = None
        for i, e in enumerate(sentence):
            if e[0] == mytid:
                hw_idx = i
                hw_lemma = e[1]["lemma"]
                break
        if hw_idx == -1:
            # The token was not found in its own sentence; bail out early.
            return {
                "hw": None,
                "err": "token {} not found in its sentence".format(mytid)
            }

        hw_glosses = dictionary_interface.sense_glosses(hw_lemma)
        if len(hw_glosses) == 0:
            log.info("hw: {} has 0 glosses".format(hw_lemma))
            return {
                "hw": None,
                "err": "headword {} has no glosses in {}".format(
                    hw_lemma, collname)
            }

        tokens_before = []
        ii = hw_idx - 1
        while (ii >= 0 and len(tokens_before) < radius):
            lemma = sentence[ii][1].get("lemma")
            if (
                lemma is not None and
                len(lemma) >= min_token_len
            ):
                if get_glosses:
                    glosses = dictionary_interface.sense_glosses(lemma)
                else:
                    glosses = [{"def": "--none--", "gloss": "--none--"}]
                if len(glosses) > 0:
                    tokens_before.insert(0, {
                        "lemma": lemma,
                        "glosses": glosses
                    })
            ii -= 1

        tokens_after = []
        ii = hw_idx + 1
        while (ii < len(sentence) and len(tokens_after) < radius):
            lemma = sentence[ii][1].get("lemma")
            if (
                lemma is not None and
                len(lemma) >= min_token_len
            ):
                if get_glosses:
                    glosses = dictionary_interface.sense_glosses(lemma)
                else:
                    glosses = [{"def": "--none--", "gloss": "--none--"}]
                if len(glosses) > 0:
                    tokens_after.append({
                        "lemma": lemma,
                        "glosses": glosses
                    })
            ii += 1

        # log.debug("context1({}): {:.2f}".format(mytid, time() - tstart))
        return {
            "hw": {"lemma": hw_lemma, "glosses": hw_glosses},
            "context": tokens_before + tokens_after
        }
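
    # Illustrative get_context1() return value (lemmas and glosses made up):
    #   {"hw": {"lemma": "dajati", "glosses": [{"def": "...", "gloss": "..."}]},
    #    "context": [{"lemma": "knjiga", "glosses": [...]}, ...]}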

    def test_context1(self, mytid, hw=""):
        res = ""
        context = self.get_context1(
            mytid, collname="slownet", radius=2, get_glosses=True)
        if context["hw"] is None:
            return context["err"] + "<br><br>"
        res = "hw: {}<br>sentence: {}<br>".format(
            hw, self.get_sentence(mytid))
        tfigf_input = []
        glosses = [context["hw"]] + context["context"]
        for e in glosses:
            res += "--->lemma: {} ({} senses)<br>".format(
                e["lemma"], len(e["glosses"]))
            for g in e["glosses"]:
                res += "{}<br>".format(str(g))
                tfigf_input.append(" ".join(k_utils.tokenize_multiple(
                    g["gloss"],
                    min_token_len=3,
                    stem=k_utils.stem_eng
                )))
        res += "<br><br>"
        return res

    def calc_all_senses(self, lesk_algorithm):
        allcount = 0
        count = 0
        for k, e in self.entries.items():
            allcount += len(e.raw_frames)
        for k, e in self.entries.items():
            if k == "biti":  # skip this huge bag of words
                continue
            for frame in e.raw_frames:
                count += 1
                if count % 10 == 0:
                    log.info("calc_all_senses: ({}/{})".format(
                        count, allcount))
                lesk_algorithm(frame.tids[0])
        return None

    def calc_all_senses_kmeans(self, kmeans_algorithm):
        tstart = time()
        allcount = len(self.entries)
        count = 0
        avg_times = []
        for key in self.entries:
            count += 1
            if key == "biti":
                continue
            # cluster frames of each entry
            log.info("calc_all_senses_kmeans: ({}/{}) [{}]".format(
                count, allcount, key))
            kmeans_algorithm(key)
            """
            try:
                kmeans_algorithm(key)
            except ValueError:
                continue
            """
            avg_times.append(1.0 * (time() - tstart) / count)
            log.info("avg_time: {:.2f}s".format(avg_times[-1]))
        log.info("calc_all_senses_kmeans in {:.2f}s.".format(time() - tstart))
        return None


if __name__ == "__main__":
    log.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    log.addHandler(ch)
    # run ssj_struct to create a ssj_test.pickle file
    with open("ssj_test.pickle", "rb") as file:
        ssj = pickle.load(file)

    vallex = Vallex()
    vallex.read_ssj(ssj)

    vallex.sorted_words = {}
    vallex.gen_sorted_words()

    vallex.functors_index = {}
    vallex.gen_functors_index()
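
    # Sketch only: the web app would normally build the indexes and sense data
    # in one go via process_after_read(); both pickle paths below are
    # hypothetical placeholders (see read_seqparser_pickles() for how they are
    # produced with seqparser.py).
    # vallex.process_after_read(
    #     sskj_senses_pickle_path="data/sskj_senses.pickle",  # hypothetical path
    #     se_list_pickle_path="data/se_list.pickle",          # hypothetical path
    #     reload_sskj_senses=False,
    # )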