diff --git a/Makefile b/Makefile index 7d3129d..deeadb5 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,7 @@ python-env: # inside the container, install our packages python-env-install: pip3 install -e src/pkg/cjvt-corpusparser/. + pip3 install -e src/pkg/valency/. # from inside python-env container: data/samples: diff --git a/README.md b/README.md index 91236b2..1d0b87d 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,9 @@ If all goes well, we should be able to inspect the database, filled with corpora ### Flask backend (1 container) Relies heavily on the database. Set that up first. ```bash -# $ make backend=dev # development +$ make python-env + +# $ make backend-dev # development $ make backend-prod ``` diff --git a/dip_src/valency/frame.py b/dip_src/valency/frame.py index f0c1630..ea7c0c5 100644 --- a/dip_src/valency/frame.py +++ b/dip_src/valency/frame.py @@ -73,7 +73,7 @@ class Slot(): # It consists of different tokens. def __init__(self, functor, tids=None, count=None): self.functor = functor - self.tids = tids or [] + self.tids = tids or [] # combining multiple sentences vertically self.count = count or 1 def to_string(self): diff --git a/dip_src/valency/reduce_functions.py b/dip_src/valency/reduce_functions.py index 561d9c9..e714d8c 100644 --- a/dip_src/valency/reduce_functions.py +++ b/dip_src/valency/reduce_functions.py @@ -10,6 +10,9 @@ log = logging.getLogger(__name__) SENSE_UNDEFINED = "nedefinirano" +## TODO: use frame.py +## TODO: build a list of [Frame] with lists of [Slot] + def sorted_by_len_tids(frames): return sorted( diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py index fb7dbea..bf2ee94 100644 --- a/src/backend_flask/app.py +++ b/src/backend_flask/app.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from flask import Flask, render_template, request, url_for, redirect +from valency import Frame, Slot +from valency.reduce_functions import reduce_functions """ from valency import k_utils @@ -26,12 +28,19 @@ from pathlib import 
Path from pymongo import MongoClient import argparse +# some db collections +USERS_COLL = "users" +TOKENS_COLL = "usertokens" +SENSES_COLL = "senses" +SENSEMAP_COLL = "sensemap" + +# pre-generated data (gui leftside word index) CORPORA = ["ssj", "kres"] +app_index = {c: {} for c in CORPORA} log = logging.getLogger(__name__) app = Flask(__name__) -app_index = {c: {} for c in CORPORA} # when running vuejs via webpack # CORS(app) @@ -41,23 +50,7 @@ app_index = {c: {} for c in CORPORA} CORS(app) -# for testing functions -@app.route("/test_dev") -def test_dev(): - ret = vallex.test_dev() - return(str(ret) or "edit val_struct.py: test_dev()") - - -@app.route("/") -def index(): - return(render_template("index.html")) - - -@app.route("/home", defaults={"pathname": ""}) -@app.route("/home/") -def home(pathname): - return redirect(url_for("index"), code=302) - +# INDEX SELECTION -------------------. @app.route("/api/words/") def api_words(corpus): @@ -69,10 +62,13 @@ def api_words(corpus): def api_functors(corpus): return json.dumps(app_index[corpus]["functors"]) +# INDEX SELECTION -------------------^ + + +# AUTH ------------------------------. 
@app.route("/api/register", methods=["POST"]) def api_register(): - USERS_COLL = "v2_users" b = request.get_data() data = json.loads(b.decode()) username = data["username"] @@ -84,7 +80,7 @@ def api_register(): email == "" ): return "ERR" - existing = list(vallex.db[USERS_COLL].find({ + existing = list(valdb[USERS_COLL].find({ "$or": [{"username": username}, {"email": email}] })) if len(existing) > 0: @@ -96,21 +92,19 @@ def api_register(): "email": hashlib.sha256( email.encode("utf-8")).hexdigest() } - vallex.db[USERS_COLL].insert(entry) + valdb[USERS_COLL].insert(entry) return "OK" @app.route("/api/login", methods=["POST"]) def api_login(): - USERS_COLL = "v2_users" - TOKENS_COLL = "v2_user_tokens" b = request.get_data() data = json.loads(b.decode()) username = data["username"] password = data["password"] hpass = hashlib.sha256(password.encode("utf-8")).hexdigest() - db_user = list(vallex.db[USERS_COLL].find({ + db_user = list(valdb[USERS_COLL].find({ "username": username, "hpass": hpass })) @@ -124,7 +118,7 @@ def api_login(): "date": datetime.datetime.utcnow(), "token": token } - vallex.db[TOKENS_COLL].update( + valdb[TOKENS_COLL].update( {"username": token_entry["username"]}, token_entry, upsert=True @@ -167,7 +161,7 @@ def api_new_pass(): username = data["username"] email = data["email"] hemail = hashlib.sha256(email.encode("utf-8")).hexdigest() - db_res = list(vallex.db.v2_users.find({ + db_res = list(valdb[USERS_COLL].find({ "username": username, "email": hemail })) @@ -179,7 +173,7 @@ def api_new_pass(): string.ascii_letters + string.digits) for i in range(10)]) # update locally hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest() - vallex.db.v2_users.update( + valdb[USERS_COLL].update( { "username": username, "email": hemail @@ -193,6 +187,39 @@ def api_new_pass(): return json.dumps({"confirmation": True}) +def token_to_username(token): + key = { + "token": token + } + res = list(valdb[TOKENS_COLL].find(key)) + if len(res) != 1: + return None + 
username = res[0]["username"] + # update deletion interval + valdb[TOKENS_COLL].update( + key, {"$set": {"date": datetime.datetime.utcnow()}}) + return username + + +@app.route("/api/token", methods=["POST"]) +def api_token(): + # check if token is valid + b = request.get_data() + data = json.loads(b.decode()) + token = data.get("token") + # user = data.get("user") + user = token_to_username(token) + confirm = (user is not None) + return json.dumps({ + "confirmation": confirm, + "username": user + }) + +# AUTH ------------------------------^ + + +# FRAMES ----------------------------. + def prepare_frames(ret_frames): # append sentences for frame in ret_frames: @@ -218,19 +245,21 @@ def prepare_frames(ret_frames): return json.dumps(json_ret) -@app.route("/api/frames") +# input: hw, reduct_function +@app.route("/api/hw-frames") def api_get_frames(): hw = request.args.get("hw") if hw is None: - return json.dumps({"error": "Headword not found."}) + return json.dumps({"error": "Required argument: hw (headword)."}) rf_name = request.args.get("rf", "reduce_0") # 2nd is default RF = reduce_functions[rf_name]["f"] - entry = vallex.entries[hw] + entry = vallex.entries[hw] # TODO hw -> [Frame,] ret_frames = RF(entry.raw_frames, vallex) return prepare_frames(ret_frames) +# input: functor, reduce_function @app.route("/api/functor-frames") def api_get_functor_frames(): functor = request.args.get("functor") @@ -238,49 +267,23 @@ def api_get_functor_frames(): return json.dumps({"error": "Missing argument: functor."}) rf_name = request.args.get("rf", "reduce_0") # 2nd is default RF = reduce_functions[rf_name]["f"] - raw_frames = vallex.functors_index[functor] + raw_frames = vallex.functors_index[functor] # TODO ret_frames = RF(raw_frames, vallex) return prepare_frames(ret_frames) +# FRAMES ----------------------------^ -def token_to_username(token): - COLLNAME = "v2_user_tokens" - key = { - "token": token - } - res = list(vallex.db[COLLNAME].find(key)) - if len(res) != 1: - return 
None - username = res[0]["username"] - # update deletion interval - vallex.db[COLLNAME].update( - key, {"$set": {"date": datetime.datetime.utcnow()}}) - return username - - -@app.route("/api/token", methods=["POST"]) -def api_token(): - # check if token is valid - b = request.get_data() - data = json.loads(b.decode()) - token = data.get("token") - # user = data.get("user") - user = token_to_username(token) - confirm = (user is not None) - return json.dumps({ - "confirmation": confirm, - "username": user - }) +# SENSES ----------------------------. @app.route("/api/senses/get") def api_senses_get(): # returns senses and mapping for hw hw = request.args.get("hw") - senses = list(vallex.db["v2_senses"].find({ + senses = list(valdb[SENSES_COLL].find({ "hw": hw })) - sense_map_query = list(vallex.db["v2_sense_map"].find({ + sense_map_query = list(valdb[SENSEMAP_COLL].find({ "hw": hw })) # aggregation by max date possible on DB side @@ -358,7 +361,7 @@ def api_senses_update(): id_map[frontend_sense_id] = new_sense_id # insert into db - vallex.db["v2_senses"].insert(ns) + valdb[SENSES_COLL].insert(ns) # replace tmp_id with mongo's _id for ssj_id, el in sense_map.items(): @@ -373,9 +376,14 @@ def api_senses_update(): "date": datetime.datetime.utcnow() } # vallex.db["v2_sense_map"].update(key, data, upsert=True) - vallex.db["v2_sense_map"].insert(data) + valdb[SENSEMAP_COLL].insert(data) return "OK" +# SENSES ----------------------------^ + + +# APP PREFLIGHT ---------------------. 
+ def prepare_db(): def helper_tid_to_token(tid, tokens): for t in tokens: @@ -384,7 +392,7 @@ def prepare_db(): return None # update entries (add headwords and fuctors for indexing) - for corpus in ["ssj", "kres"]: + for corpus in CORPORA: for e in valdb[corpus].find({}): if e["srl_links"] is None: continue @@ -435,6 +443,8 @@ def prepare_db(): functors = sorted(functors, key=lambda x: x[0]) app_index[corpus]["functors"] = functors +# APP PREFLIGHT ---------------------^ + if __name__ == "__main__": print("Starting app.py main()") diff --git a/src/pkg/valency/setup.py b/src/pkg/valency/setup.py new file mode 100644 index 0000000..ff82025 --- /dev/null +++ b/src/pkg/valency/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup + +setup( + name='valency', + version='0.1.1', + description='Objects and functions for handling valency frames.', + author='Kristjan Voje', + author_email='kristjan.voje@gmail.com', + license='MIT', + packages=['valency'], + install_requires=[], +) diff --git a/src/pkg/valency/valency/Frame.py b/src/pkg/valency/valency/Frame.py new file mode 100644 index 0000000..ea7c0c5 --- /dev/null +++ b/src/pkg/valency/valency/Frame.py @@ -0,0 +1,96 @@ +import logging + +log = logging.getLogger(__name__) + + +class Frame(): + def __init__(self, tids, deep_links=None, slots=None, hw=None): + self.hw = hw + self.tids = tids # list of tokens with the same hw_lemma + # Each tid = "S123.t123"; + # you can get sentence with vallex.get_sentence(S123) + self.slots = [] + if slots is None: + self.slots = self.init_slots(deep_links) + else: + self.slots = slots + self.sense_info = {} + self.sentences = None # Used for passing to view in app.py, get_frames + self.aggr_sent = None # Dictionary { hw: self.sentences idx } + + def to_json(self): + ret = { + "hw": self.hw, + "tids": self.tids, + "slots": [slot.to_json() for slot in self.slots], + "sentences": self.sentences, + "aggr_sent": self.aggr_sent, + "sense_info": self.sense_info + } + return ret + + def 
init_slots(self, deep): + slots = [] + for link in deep: + slots.append(Slot( + functor=link["functor"], + tids=[link["to"]] + )) + return slots + + def sort_slots(self): + # ACT, PAT, alphabetically + srt1 = [ + x for x in self.slots + if (x.functor == "ACT" or + x.functor == "PAT") + ] + srt1 = sorted(srt1, key=lambda x: x.functor) + srt2 = [ + x for x in self.slots + if (x.functor != "ACT" and + x.functor != "PAT") + ] + srt2 = sorted(srt2, key=lambda x: x.functor) + self.slots = (srt1 + srt2) + + def to_string(self): + ret = "Frame:\n" + ret += "sense_info: {}\n".format(str(self.sense_info)) + ret += "tids: [" + for t in self.tids: + ret += (str(t) + ", ") + ret += "]\n" + if self.slots is not None: + ret += "slots:\n" + for sl in self.slots: + ret += (sl.to_string() + "\n") + return ret + + +class Slot(): + # Each slot is identified by its functor (ACT, PAT, ...) + # It consists of different tokens. + def __init__(self, functor, tids=None, count=None): + self.functor = functor + self.tids = tids or [] # combining multiple sentences vertically + self.count = count or 1 + + def to_string(self): + ret = "---- Slot:\n" + ret += "functor: {}\n".format(self.functor) + ret += "tids: [" + for t in self.tids: + ret += (str(t) + ", ") + ret += "]\n" + ret += "]\n" + ret += "----\n" + return ret + + def to_json(self): + ret = { + "functor": self.functor, + "tids": self.tids, + "count": self.count + } + return ret diff --git a/src/pkg/valency/valency/__init__.py b/src/pkg/valency/valency/__init__.py new file mode 100644 index 0000000..6a41c6a --- /dev/null +++ b/src/pkg/valency/valency/__init__.py @@ -0,0 +1 @@ +from valency.Frame import Frame, Slot diff --git a/src/pkg/valency/valency/frame.py b/src/pkg/valency/valency/frame.py new file mode 100644 index 0000000..ea7c0c5 --- /dev/null +++ b/src/pkg/valency/valency/frame.py @@ -0,0 +1,96 @@ +import logging + +log = logging.getLogger(__name__) + + +class Frame(): + def __init__(self, tids, deep_links=None, slots=None, 
hw=None): + self.hw = hw + self.tids = tids # list of tokens with the same hw_lemma + # Each tid = "S123.t123"; + # you can get sentence with vallex.get_sentence(S123) + self.slots = [] + if slots is None: + self.slots = self.init_slots(deep_links) + else: + self.slots = slots + self.sense_info = {} + self.sentences = None # Used for passing to view in app.py, get_frames + self.aggr_sent = None # Dictionary { hw: self.sentences idx } + + def to_json(self): + ret = { + "hw": self.hw, + "tids": self.tids, + "slots": [slot.to_json() for slot in self.slots], + "sentences": self.sentences, + "aggr_sent": self.aggr_sent, + "sense_info": self.sense_info + } + return ret + + def init_slots(self, deep): + slots = [] + for link in deep: + slots.append(Slot( + functor=link["functor"], + tids=[link["to"]] + )) + return slots + + def sort_slots(self): + # ACT, PAT, alphabetically + srt1 = [ + x for x in self.slots + if (x.functor == "ACT" or + x.functor == "PAT") + ] + srt1 = sorted(srt1, key=lambda x: x.functor) + srt2 = [ + x for x in self.slots + if (x.functor != "ACT" and + x.functor != "PAT") + ] + srt2 = sorted(srt2, key=lambda x: x.functor) + self.slots = (srt1 + srt2) + + def to_string(self): + ret = "Frame:\n" + ret += "sense_info: {}\n".format(str(self.sense_info)) + ret += "tids: [" + for t in self.tids: + ret += (str(t) + ", ") + ret += "]\n" + if self.slots is not None: + ret += "slots:\n" + for sl in self.slots: + ret += (sl.to_string() + "\n") + return ret + + +class Slot(): + # Each slot is identified by its functor (ACT, PAT, ...) + # It consists of different tokens. 
+ def __init__(self, functor, tids=None, count=None): + self.functor = functor + self.tids = tids or [] # combining multiple sentences vertically + self.count = count or 1 + + def to_string(self): + ret = "---- Slot:\n" + ret += "functor: {}\n".format(self.functor) + ret += "tids: [" + for t in self.tids: + ret += (str(t) + ", ") + ret += "]\n" + ret += "]\n" + ret += "----\n" + return ret + + def to_json(self): + ret = { + "functor": self.functor, + "tids": self.tids, + "count": self.count + } + return ret diff --git a/src/pkg/valency/valency/reduce_functions.py b/src/pkg/valency/valency/reduce_functions.py new file mode 100644 index 0000000..f6f785a --- /dev/null +++ b/src/pkg/valency/valency/reduce_functions.py @@ -0,0 +1,242 @@ +# Reduction function for frames. +# Input: list of Frame objects, output: list of Frame objects. +# App uses reduce_0, 1 and 5 + +from valency import Frame, Slot +from copy import deepcopy as DC +import logging + +log = logging.getLogger(__name__) + +SENSE_UNDEFINED = "nedefinirano" + +## TODO: use frame.py +## TODO: build a list of [Frame] with lists of [Slot] + + +def sorted_by_len_tids(frames): + return sorted( + frames, + key=lambda x: len(x.tids), + reverse=True + ) + + +def reduce_0(frames, vallex=None): + # new request... frames should be sorted by + # functors list (basically reduce_1, just each + # sentence gets its own frame) + r1_frames = reduce_1(frames) + sorting_strings = [] + separated_frames = [] + for frame in r1_frames: + for tid in frame.tids: + tmp_frame = DC(frame) + tmp_frame.tids = [tid] + separated_frames.append(tmp_frame) + sorting_strings.append("".join( + [slot.functor for slot in tmp_frame.slots] + )) + permutation = [x for _, x in sorted( + zip(sorting_strings, range(len(sorting_strings))))] + sorted_sep_frames = [separated_frames[i] for i in permutation] + return sorted_sep_frames + + +def reduce_1(frames, vallex=None): + # Combine frames with the same set of functors. 
+ # The order of functors is not important. + frame_sets = [] # [set of functors, list of frames] + for frame in frames: + functors = [slot.functor for slot in frame.slots] + + for fs in frame_sets: + if set(functors) == set(fs[0]): + fs[1].append(frame) + break + else: + # Python for else -> fires if loop has ended. + frame_sets.append([functors, [frame]]) + + ret_frames = [] + for fs in frame_sets: + tids = [] + slots = {} + # All possible slots in this frame. + for functor in fs[0]: + slots[functor] = Slot(functor=functor) + # Reduce slots from all frames. (Merge ACT from all frames, ...) + for frame in fs[1]: + tids += frame.tids + for sl in frame.slots: + slots[sl.functor].tids += sl.tids + slots_list = [] + for k, e in slots.items(): + slots_list.append(e) + rf = Frame(tids=tids, slots=slots_list) + rf.sort_slots() + ret_frames.append(rf) + return sorted_by_len_tids(ret_frames) + + +def reduce_3(raw_frames, vallex): + # sskj simple lesk ids + ssj_ids = [frame.tids[0] for frame in raw_frames] + db_results = list(vallex.db.sskj_simple_lesk.find( + {"ssj_id": {"$in": ssj_ids}})) + id_map = {} + for entry in db_results: + id_map.update({entry["ssj_id"]: { + "sense_id": entry.get("sense_id"), + "sense_desc": entry.get("sense_desc") + }}) + return frames_from_sense_ids(raw_frames, id_map) + + +def reduce_4(raw_frames, vallex): + # kmeans ids + ssj_ids = [frame.tids[0] for frame in raw_frames] + db_results = list(vallex.db.kmeans.find( + {"ssj_id": {"$in": ssj_ids}})) + id_map = {} + for entry in db_results: + id_map.update({entry["ssj_id"]: { + "sense_id": entry["sense_id"] + }}) + return frames_from_sense_ids(raw_frames, id_map) + + +def reduce_5(raw_frames, vallex): + USER_SENSE_COLL = "v2_sense_map" + headword = raw_frames[0].hw + ssj_ids_full = [frame.tids[0] for frame in raw_frames] + # v2_sense_map stores only sentence half of ssj_id + ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full] + db_results = list(vallex.db[USER_SENSE_COLL].find({ 
+ "ssj_id": {"$in": ssj_ids}, + "hw": headword, + })) + id_map = {} + for entry in db_results: + id_map[entry["ssj_id"]] = entry["sense_id"] + + ret_frames = frames_from_sense_ids(raw_frames, id_map) + + # sort: frames with senses to top + senses_undefined = [] + senses_defined = [] + for frame in ret_frames: + if frame.sense_info["sense_id"] == SENSE_UNDEFINED: + senses_undefined.append(frame) + else: + senses_defined.append(frame) + ret_frames = senses_defined + senses_undefined + + return ret_frames + + +def frames_from_sense_ids(raw_frames, id_map): + # id map = dict { + # ssj_id: sense_id + # } + # id_dict = dict { + # sense_id: [frame, ...] + # } + id_dict = {} + for frame in raw_frames: + # long version ssj_id (S123.t12) + frame_ssj_id = frame.tids[0] + frame_sense_id = id_map.get(frame_ssj_id) + if frame_sense_id is None: + # try short version ssj_id (S123) + frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1]) + frame_sense_id = id_map.get(frame_ssj_id) + + # set default if sense_id not found + if frame_sense_id is None: + frame_sense_id = SENSE_UNDEFINED + """ + sense_id = id_map.get(frame.tids[0]) + if sense_id is not None: + sense_id = sense_id.get("sense_id") + else: + sense_id = "nedefinirano" + """ + if frame_sense_id not in id_dict: + id_dict[frame_sense_id] = [] + id_dict[frame_sense_id].append(DC(frame)) + + ret_frames = [] + for sense_id, frames in id_dict.items(): + tids = [] + reduced_slots = [] + for frame in frames: + tids.extend(frame.tids) + for slot in frame.slots: + # if functor not in reduced slots, + # add new slot; else increase count + for rslot in reduced_slots: + if slot.functor == rslot.functor: + rslot.count += 1 + rslot.tids.extend(slot.tids) + break + else: + # in case for loop didn't match a slot + reduced_slots.append(Slot( + functor=slot.functor, + tids=slot.tids, + count=1 + )) + reduced_frame = Frame(tids, slots=reduced_slots) + id_map_entry = ( + id_map.get(tids[0]) or + id_map.get(".".join(tids[0].split(".")[:-1])) + ) + 
if id_map_entry is None: + reduced_frame.sense_info = { + "sense_id": SENSE_UNDEFINED, + } + else: + reduced_frame.sense_info = { + "sense_id": id_map_entry + } + reduced_frame.sort_slots() + ret_frames.append(reduced_frame) + return ret_frames + + +reduce_functions = { + "reduce_0": { + "f": reduce_0, + "desc": + "Vsaka pojavitev glagola dobi svoj stavčni vzorec.", + "simple_name": "posamezni stavki" + }, + "reduce_1": { + "f": reduce_1, + "desc": + "Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog.", + "simple_name": "združeni stavki" + }, + "reduce_3": { + "f": reduce_3, + "desc": + "Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. " + "Pomeni so dodeljeni s pomočjo algoritma Simple Lesk.", + "simple_name": "SSKJ_pomeni" + }, + "reduce_4": { + "f": reduce_4, + "desc": + "Združevanje stavčnih vzorcev na osnovi pomenov povedi " + "s pomočjo algoritma K-Means. Število predvidenih pomenov " + "podano na osnovi SSKJ.", + "simple_name": "KMeans_pomeni" + }, + "reduce_5": { + "f": reduce_5, + "desc": + "Uporabniško dodeljeni pomeni povedi.", + "simple_name": "po meri" + } +}