modifying frames api

2019-03-28 19:17:45 +01:00
parent d84ad9e163
commit 1f83f96267
10 changed files with 529 additions and 66 deletions
@@ -48,6 +48,7 @@ python-env:
 # inside the container, install our packages
 python-env-install:
 	pip3 install -e src/pkg/cjvt-corpusparser/.
+	pip3 install -e src/pkg/valency/.

 # from inside python-env container:
 data/samples:
@@ -44,7 +44,9 @@ If all goes well, we should be able to inspect the database, filled with corpora
 ### Flask backend (1 container)
 Relies heavily on the database. Set that up first. 
 ```bash
-# $ make backend=dev  # development
+$ make python-env
+
+# $ make backend-dev  # development
 $ make backend-prod
 ```

@@ -73,7 +73,7 @@ class Slot():
    # It consists of different tokens.
    def __init__(self, functor, tids=None, count=None):
        self.functor = functor
-        self.tids = tids or []
+        self.tids = tids or []  # combining multiple sentences vertically
        self.count = count or 1

    def to_string(self):
@@ -10,6 +10,9 @@ log = logging.getLogger(__name__)

 SENSE_UNDEFINED = "nedefinirano"

+## TODO: use frame.py
+## TODO: build a list of [Frame] with lists of [Slot]
+

 def sorted_by_len_tids(frames):
    return sorted(
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-

 from flask import Flask, render_template, request, url_for, redirect
+from valency import Frame, Slot
+from valency.reduce_functions import reduce_functions

 """
 from valency import k_utils
@@ -26,12 +28,19 @@ from pathlib import Path
 from pymongo import MongoClient
 import argparse

+# some db collections
+USERS_COLL = "users"
+TOKENS_COLL = "usertokens"
+SENSES_COLL = "senses"
+SENSEMAP_COLL = "sensemap"
+
+# pre-generated data (gui leftside word index)
 CORPORA = ["ssj", "kres"]
+app_index = {c: {} for c in CORPORA}

 log = logging.getLogger(__name__)
 app = Flask(__name__)

-app_index = {c: {} for c in CORPORA}

 # when running vuejs via webpack
 # CORS(app)
@@ -41,23 +50,7 @@ app_index = {c: {} for c in CORPORA}
 CORS(app)


-# for testing functions
-@app.route("/test_dev")
-def test_dev():
-    ret = vallex.test_dev()
-    return(str(ret) or "edit val_struct.py: test_dev()")
-
-
-@app.route("/")
-def index():
-    return(render_template("index.html"))
-
-
-@app.route("/home", defaults={"pathname": ""})
-@app.route("/home/<path:pathname>")
-def home(pathname):
-    return redirect(url_for("index"), code=302)
-
+# INDEX SELECTION -------------------.

@app.route("/api/words/<corpus>")
 def api_words(corpus):
@@ -69,10 +62,13 @@ def api_words(corpus):
 def api_functors(corpus):
    return json.dumps(app_index[corpus]["functors"])

+# INDEX SELECTION -------------------^
+
+
+# AUTH ------------------------------.

@app.route("/api/register", methods=["POST"])
 def api_register():
-    USERS_COLL = "v2_users"
    b = request.get_data()
    data = json.loads(b.decode())
    username = data["username"]
@@ -84,7 +80,7 @@ def api_register():
        email == ""
    ):
        return "ERR"
-    existing = list(vallex.db[USERS_COLL].find({
+    existing = list(valdb[USERS_COLL].find({
        "$or": [{"username": username}, {"email": email}]
    }))
    if len(existing) > 0:
@@ -96,21 +92,19 @@ def api_register():
        "email": hashlib.sha256(
            email.encode("utf-8")).hexdigest()
    }
-    vallex.db[USERS_COLL].insert(entry)
+    valdb[USERS_COLL].insert(entry)
    return "OK"


@app.route("/api/login", methods=["POST"])
 def api_login():
-    USERS_COLL = "v2_users"
-    TOKENS_COLL = "v2_user_tokens"
    b = request.get_data()
    data = json.loads(b.decode())
    username = data["username"]
    password = data["password"]
    hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()

-    db_user = list(vallex.db[USERS_COLL].find({
+    db_user = list(valdb[USERS_COLL].find({
        "username": username,
        "hpass": hpass
    }))
@@ -124,7 +118,7 @@ def api_login():
        "date": datetime.datetime.utcnow(),
        "token": token
    }
-    vallex.db[TOKENS_COLL].update(
+    valdb[TOKENS_COLL].update(
        {"username": token_entry["username"]},
        token_entry,
        upsert=True
@@ -167,7 +161,7 @@ def api_new_pass():
    username = data["username"]
    email = data["email"]
    hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
-    db_res = list(vallex.db.v2_users.find({
+    db_res = list(valdb[USERS_COLL].find({
        "username": username,
        "email": hemail
    }))
@@ -179,7 +173,7 @@ def api_new_pass():
        string.ascii_letters + string.digits) for i in range(10)])
    # update locally
    hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
-    vallex.db.v2_users.update(
+    valdb[USERS_COLL].update(
        {
            "username": username,
            "email": hemail
@@ -193,6 +187,39 @@ def api_new_pass():
    return json.dumps({"confirmation": True})


+def token_to_username(token):
+    """Resolve a login token to the username stored with it.
+
+    Returns None unless exactly one token entry matches. On a match the
+    entry's "date" field is reset to the current UTC time.
+    """
+    query = {"token": token}
+    matches = list(valdb[TOKENS_COLL].find(query))
+    if len(matches) == 1:
+        owner = matches[0]["username"]
+        # update deletion interval
+        refreshed = {"$set": {"date": datetime.datetime.utcnow()}}
+        valdb[TOKENS_COLL].update(query, refreshed)
+        return owner
+    return None
+
+
+@app.route("/api/token", methods=["POST"])
+def api_token():
+    """Check the token from the POSTed JSON body.
+
+    Responds with a JSON object holding "confirmation" (whether the
+    token resolved to a user) and "username" (that user, or null).
+    """
+    payload = json.loads(request.get_data().decode())
+    username = token_to_username(payload.get("token"))
+    response = {
+        "confirmation": username is not None,
+        "username": username
+    }
+    return json.dumps(response)
+
+# AUTH ------------------------------^
+
+
+# FRAMES ----------------------------.
+
 def prepare_frames(ret_frames):
    # append sentences
    for frame in ret_frames:
@@ -218,19 +245,21 @@ def prepare_frames(ret_frames):
    return json.dumps(json_ret)


-@app.route("/api/frames")
+# input: hw, reduct_function
+@app.route("/api/hw-frames")
 def api_get_frames():
    hw = request.args.get("hw")
    if hw is None:
-        return json.dumps({"error": "Headword not found."})
+        return json.dumps({"error": "Required argument: hw (headword)."})

    rf_name = request.args.get("rf", "reduce_0")  # 2nd is default
    RF = reduce_functions[rf_name]["f"]
-    entry = vallex.entries[hw]
+    entry = vallex.entries[hw]  # TODO  hw -> [Frame,]
    ret_frames = RF(entry.raw_frames, vallex)
    return prepare_frames(ret_frames)


+# input: functor, reduce_function
@app.route("/api/functor-frames")
 def api_get_functor_frames():
    functor = request.args.get("functor")
@@ -238,49 +267,23 @@ def api_get_functor_frames():
        return json.dumps({"error": "Missing argument: functor."})
    rf_name = request.args.get("rf", "reduce_0")  # 2nd is default
    RF = reduce_functions[rf_name]["f"]
-    raw_frames = vallex.functors_index[functor]
+    raw_frames = vallex.functors_index[functor]  # TODO
    ret_frames = RF(raw_frames, vallex)
    return prepare_frames(ret_frames)

-
-def token_to_username(token):
-    COLLNAME = "v2_user_tokens"
-    key = {
-        "token": token
-    }
-    res = list(vallex.db[COLLNAME].find(key))
-    if len(res) != 1:
-        return None
-    username = res[0]["username"]
-    # update deletion interval
-    vallex.db[COLLNAME].update(
-        key, {"$set": {"date": datetime.datetime.utcnow()}})
-    return username
+# FRAMES ----------------------------^


-@app.route("/api/token", methods=["POST"])
-def api_token():
-    # check if token is valid
-    b = request.get_data()
-    data = json.loads(b.decode())
-    token = data.get("token")
-    # user = data.get("user")
-    user = token_to_username(token)
-    confirm = (user is not None)
-    return json.dumps({
-        "confirmation": confirm,
-        "username": user
-    })
-
+# SENSES ----------------------------.

@app.route("/api/senses/get")
 def api_senses_get():
    # returns senses and mapping for hw
    hw = request.args.get("hw")
-    senses = list(vallex.db["v2_senses"].find({
+    senses = list(valdb[SENSES_COLL].find({
        "hw": hw
    }))
-    sense_map_query = list(vallex.db["v2_sense_map"].find({
+    sense_map_query = list(valdb[SENSEMAP_COLL].find({
        "hw": hw
    }))
    # aggregation by max date possible on DB side
@@ -358,7 +361,7 @@ def api_senses_update():
        id_map[frontend_sense_id] = new_sense_id

        # insert into db
-        vallex.db["v2_senses"].insert(ns)
+        valdb[SENSES_COLL].insert(ns)

    # replace tmp_id with mongo's _id
    for ssj_id, el in sense_map.items():
@@ -373,9 +376,14 @@ def api_senses_update():
            "date": datetime.datetime.utcnow()
        }
        # vallex.db["v2_sense_map"].update(key, data, upsert=True)
-        vallex.db["v2_sense_map"].insert(data)
+        valdb[SENSEMAP_COLL].insert(data)
    return "OK"

+# SENSES ----------------------------^
+
+
+# APP PREFLIGHT ---------------------.
+
 def prepare_db():
    def helper_tid_to_token(tid, tokens):
        for t in tokens:
@@ -384,7 +392,7 @@ def prepare_db():
        return None

    # update entries (add headwords and fuctors for indexing)
-    for corpus in ["ssj", "kres"]:
+    for corpus in CORPORA:
        for e in valdb[corpus].find({}):
            if e["srl_links"] is None:
                continue
@@ -435,6 +443,8 @@ def prepare_db():
        functors = sorted(functors, key=lambda x: x[0])
        app_index[corpus]["functors"] = functors

+# APP PREFLIGHT ---------------------^
+

 if __name__ == "__main__":
    print("Starting app.py main()")
@@ -0,0 +1,12 @@
+from setuptools import setup
+
+# Packaging metadata for the `valency` distribution: ships the single
+# `valency` package and declares no install-time dependencies.
+setup(
+    name='valency',
+    version='0.1.1',
+    description='Objects and functions for handling valency frames.',
+    author='Kristjan Voje',
+    author_email='kristjan.voje@gmail.com',
+    license='MIT',
+    packages=['valency'],
+    install_requires=[],
+)
@@ -0,0 +1,96 @@
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class Frame():
+    """A valency frame: headword tokens plus the slots attached to them.
+
+    Args:
+        tids: list of token ids with the same hw_lemma; each tid looks
+            like "S123.t123".
+        deep_links: iterable of dicts with "functor" and "to" keys. Used
+            to build one Slot per link when `slots` is not given; None
+            yields a frame without slots.
+        slots: ready-made list of Slot objects; when given, `deep_links`
+            is ignored.
+        hw: headword of the frame.
+    """
+
+    def __init__(self, tids, deep_links=None, slots=None, hw=None):
+        self.hw = hw
+        self.tids = tids   # list of tokens with the same hw_lemma
+        # Each tid = "S123.t123";
+        # you can get sentence with vallex.get_sentence(S123)
+        if slots is None:
+            self.slots = self.init_slots(deep_links)
+        else:
+            self.slots = slots
+        self.sense_info = {}
+        self.sentences = None  # Used for passing to view in app.py, get_frames
+        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }
+
+    def to_json(self):
+        """Return the frame as a dict of plain values (slots via to_json)."""
+        ret = {
+            "hw": self.hw,
+            "tids": self.tids,
+            "slots": [slot.to_json() for slot in self.slots],
+            "sentences": self.sentences,
+            "aggr_sent": self.aggr_sent,
+            "sense_info": self.sense_info
+        }
+        return ret
+
+    def init_slots(self, deep):
+        """Build one single-token Slot per deep link.
+
+        `deep` may be None (both constructor defaults), which gives [].
+        """
+        return [
+            Slot(functor=link["functor"], tids=[link["to"]])
+            for link in (deep or [])
+        ]
+
+    def sort_slots(self):
+        """Reorder slots in place: ACT, PAT, then the rest alphabetically."""
+        core = ("ACT", "PAT")
+        srt1 = sorted(
+            (x for x in self.slots if x.functor in core),
+            key=lambda x: x.functor
+        )
+        srt2 = sorted(
+            (x for x in self.slots if x.functor not in core),
+            key=lambda x: x.functor
+        )
+        self.slots = (srt1 + srt2)
+
+    def to_string(self):
+        """Return a multi-line, human-readable dump of the frame."""
+        parts = [
+            "Frame:\n",
+            "sense_info: {}\n".format(str(self.sense_info)),
+            "tids: [",
+        ]
+        parts.extend(str(t) + ", " for t in self.tids)
+        parts.append("]\n")
+        if self.slots is not None:
+            parts.append("slots:\n")
+            parts.extend(sl.to_string() + "\n" for sl in self.slots)
+        return "".join(parts)
+
+
+class Slot():
+    """One functor position of a frame together with its tokens.
+
+    Args:
+        functor: functor label identifying the slot (ACT, PAT, ...).
+        tids: list of token ids filling the slot; a falsy value gives a
+            fresh empty list.
+        count: stored as-is when truthy, otherwise 1.
+    """
+    # Each slot is identified by its functor (ACT, PAT, ...)
+    # It consists of different tokens.
+    def __init__(self, functor, tids=None, count=None):
+        self.functor = functor
+        self.tids = tids or []  # combining multiple sentences vertically
+        self.count = count or 1
+
+    def to_string(self):
+        """Return a multi-line, human-readable dump of the slot."""
+        tids_str = "".join(str(t) + ", " for t in self.tids)
+        # The tid list is closed exactly once (a duplicated "]" line
+        # used to follow it).
+        return (
+            "---- Slot:\n"
+            "functor: {}\n"
+            "tids: [{}]\n"
+            "----\n"
+        ).format(self.functor, tids_str)
+
+    def to_json(self):
+        """Return the slot as a dict of plain values."""
+        ret = {
+            "functor": self.functor,
+            "tids": self.tids,
+            "count": self.count
+        }
+        return ret
@@ -0,0 +1 @@
+from valency.Frame import Frame, Slot
@@ -0,0 +1,96 @@
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class Frame():
+    """A valency frame: headword tokens plus the slots attached to them.
+
+    Args:
+        tids: list of token ids with the same hw_lemma; each tid looks
+            like "S123.t123".
+        deep_links: iterable of dicts with "functor" and "to" keys. Used
+            to build one Slot per link when `slots` is not given; None
+            yields a frame without slots.
+        slots: ready-made list of Slot objects; when given, `deep_links`
+            is ignored.
+        hw: headword of the frame.
+    """
+
+    def __init__(self, tids, deep_links=None, slots=None, hw=None):
+        self.hw = hw
+        self.tids = tids   # list of tokens with the same hw_lemma
+        # Each tid = "S123.t123";
+        # you can get sentence with vallex.get_sentence(S123)
+        if slots is None:
+            self.slots = self.init_slots(deep_links)
+        else:
+            self.slots = slots
+        self.sense_info = {}
+        self.sentences = None  # Used for passing to view in app.py, get_frames
+        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }
+
+    def to_json(self):
+        """Return the frame as a dict of plain values (slots via to_json)."""
+        ret = {
+            "hw": self.hw,
+            "tids": self.tids,
+            "slots": [slot.to_json() for slot in self.slots],
+            "sentences": self.sentences,
+            "aggr_sent": self.aggr_sent,
+            "sense_info": self.sense_info
+        }
+        return ret
+
+    def init_slots(self, deep):
+        """Build one single-token Slot per deep link.
+
+        `deep` may be None (both constructor defaults), which gives [].
+        """
+        return [
+            Slot(functor=link["functor"], tids=[link["to"]])
+            for link in (deep or [])
+        ]
+
+    def sort_slots(self):
+        """Reorder slots in place: ACT, PAT, then the rest alphabetically."""
+        core = ("ACT", "PAT")
+        srt1 = sorted(
+            (x for x in self.slots if x.functor in core),
+            key=lambda x: x.functor
+        )
+        srt2 = sorted(
+            (x for x in self.slots if x.functor not in core),
+            key=lambda x: x.functor
+        )
+        self.slots = (srt1 + srt2)
+
+    def to_string(self):
+        """Return a multi-line, human-readable dump of the frame."""
+        parts = [
+            "Frame:\n",
+            "sense_info: {}\n".format(str(self.sense_info)),
+            "tids: [",
+        ]
+        parts.extend(str(t) + ", " for t in self.tids)
+        parts.append("]\n")
+        if self.slots is not None:
+            parts.append("slots:\n")
+            parts.extend(sl.to_string() + "\n" for sl in self.slots)
+        return "".join(parts)
+
+
+class Slot():
+    """One functor position of a frame together with its tokens.
+
+    Args:
+        functor: functor label identifying the slot (ACT, PAT, ...).
+        tids: list of token ids filling the slot; a falsy value gives a
+            fresh empty list.
+        count: stored as-is when truthy, otherwise 1.
+    """
+    # Each slot is identified by its functor (ACT, PAT, ...)
+    # It consists of different tokens.
+    def __init__(self, functor, tids=None, count=None):
+        self.functor = functor
+        self.tids = tids or []  # combining multiple sentences vertically
+        self.count = count or 1
+
+    def to_string(self):
+        """Return a multi-line, human-readable dump of the slot."""
+        tids_str = "".join(str(t) + ", " for t in self.tids)
+        # The tid list is closed exactly once (a duplicated "]" line
+        # used to follow it).
+        return (
+            "---- Slot:\n"
+            "functor: {}\n"
+            "tids: [{}]\n"
+            "----\n"
+        ).format(self.functor, tids_str)
+
+    def to_json(self):
+        """Return the slot as a dict of plain values."""
+        ret = {
+            "functor": self.functor,
+            "tids": self.tids,
+            "count": self.count
+        }
+        return ret
@@ -0,0 +1,242 @@
+# Reduction function for frames.
+# Input: list of Frame objects, output: list of Frame objects.
+# App uses reduce_0, 1 and 5
+
+from valency import Frame, Slot
+from copy import deepcopy as DC
+import logging
+
+log = logging.getLogger(__name__)
+
+SENSE_UNDEFINED = "nedefinirano"
+
+## TODO: use frame.py
+## TODO: build a list of [Frame] with lists of [Slot]
+
+
+def sorted_by_len_tids(frames):
+    """Return a new list of frames, those with the most tids first.
+
+    Frames with equally many tids keep their input order.
+    """
+    def tid_count(frame):
+        return len(frame.tids)
+
+    ordered = list(frames)
+    ordered.sort(key=tid_count, reverse=True)
+    return ordered
+
+
+def reduce_0(frames, vallex=None):
+    """Give every tid its own frame, ordered by the frames' functors.
+
+    The frames are first merged with reduce_1. Each merged frame is then
+    deep-copied once per tid, the copy keeping only that tid, and the
+    copies are sorted by the concatenation of their slot functors.
+    `vallex` is not used.
+    """
+    def functor_key(frame):
+        return "".join([slot.functor for slot in frame.slots])
+
+    per_tid_frames = []
+    for merged in reduce_1(frames):
+        for tid in merged.tids:
+            single = DC(merged)
+            single.tids = [tid]
+            per_tid_frames.append(single)
+    # sorted() is stable, so frames with equal keys stay in creation order
+    return sorted(per_tid_frames, key=functor_key)
+
+
+def reduce_1(frames, vallex=None):
+    """Merge frames that have the same set of functors.
+
+    The order of functors inside a frame does not matter. Every merged
+    frame collects the tids of its members, and each of its slots
+    collects the tids of the members' slots with that functor. The
+    result is ordered by sorted_by_len_tids. `vallex` is not used.
+    """
+    groups = []          # [functors of first member, members], first-seen order
+    by_functor_set = {}  # frozenset(functors) -> its entry in `groups`
+    for frame in frames:
+        functors = [slot.functor for slot in frame.slots]
+        signature = frozenset(functors)
+        group = by_functor_set.get(signature)
+        if group is None:
+            group = [functors, []]
+            by_functor_set[signature] = group
+            groups.append(group)
+        group[1].append(frame)
+
+    merged_frames = []
+    for functors, members in groups:
+        # All possible slots in this frame.
+        merged_slots = {
+            functor: Slot(functor=functor) for functor in functors
+        }
+        all_tids = []
+        # Reduce slots from all frames. (Merge ACT from all frames, ...)
+        for member in members:
+            all_tids += member.tids
+            for slot in member.slots:
+                merged_slots[slot.functor].tids += slot.tids
+        merged = Frame(tids=all_tids, slots=list(merged_slots.values()))
+        merged.sort_slots()
+        merged_frames.append(merged)
+    return sorted_by_len_tids(merged_frames)
+
+
+def reduce_3(raw_frames, vallex):
+    """Group frames by the sense stored in vallex.db.sskj_simple_lesk.
+
+    Args:
+        raw_frames: Frame objects; the first tid of each is used as its
+            ssj_id for the lookup.
+        vallex: object exposing the database as `vallex.db`.
+
+    Returns:
+        The frames merged by frames_from_sense_ids; when a description
+        was stored for a frame's sense it is added to its sense_info
+        as "sense_desc".
+    """
+    # sskj simple lesk ids
+    ssj_ids = [frame.tids[0] for frame in raw_frames]
+    db_results = list(vallex.db.sskj_simple_lesk.find(
+        {"ssj_id": {"$in": ssj_ids}}))
+    id_map = {}
+    sense_descs = {}
+    for entry in db_results:
+        sense_id = entry.get("sense_id")
+        # frames_from_sense_ids uses the mapped value as a dict key, so
+        # it has to be the plain sense id, not a (unhashable) dict.
+        id_map[entry["ssj_id"]] = sense_id
+        sense_descs[sense_id] = entry.get("sense_desc")
+    ret_frames = frames_from_sense_ids(raw_frames, id_map)
+    for frame in ret_frames:
+        sense_id = frame.sense_info.get("sense_id")
+        if sense_id in sense_descs:
+            frame.sense_info["sense_desc"] = sense_descs[sense_id]
+    return ret_frames
+
+
+def reduce_4(raw_frames, vallex):
+    """Group frames by the sense stored in vallex.db.kmeans.
+
+    Args:
+        raw_frames: Frame objects; the first tid of each is used as its
+            ssj_id for the lookup.
+        vallex: object exposing the database as `vallex.db`.
+
+    Returns:
+        The frames merged by frames_from_sense_ids.
+    """
+    # kmeans ids
+    ssj_ids = [frame.tids[0] for frame in raw_frames]
+    db_results = list(vallex.db.kmeans.find(
+        {"ssj_id": {"$in": ssj_ids}}))
+    # frames_from_sense_ids uses the mapped value as a dict key, so it
+    # has to be the plain sense id, not a (unhashable) dict.
+    id_map = {
+        entry["ssj_id"]: entry["sense_id"] for entry in db_results
+    }
+    return frames_from_sense_ids(raw_frames, id_map)
+
+
+def reduce_5(raw_frames, vallex):
+    """Group frames by user-assigned senses, frames with a sense first.
+
+    Args:
+        raw_frames: Frame objects of one headword; the headword is read
+            from the first frame and the first tid of each frame is used
+            as its ssj_id.
+        vallex: object exposing the database as `vallex.db`.
+
+    Returns:
+        The frames merged by frames_from_sense_ids, those whose sense is
+        SENSE_UNDEFINED moved to the end. An empty input gives [].
+    """
+    if not raw_frames:
+        # nothing to look up; raw_frames[0] below would raise IndexError
+        return []
+    # NOTE(review): app.py in this commit renames the sense-map
+    # collection to "sensemap" -- confirm this name is still correct
+    # for vallex.db.
+    USER_SENSE_COLL = "v2_sense_map"
+    headword = raw_frames[0].hw
+    ssj_ids_full = [frame.tids[0] for frame in raw_frames]
+    # v2_sense_map stores only sentence half of ssj_id
+    ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full]
+    db_results = list(vallex.db[USER_SENSE_COLL].find({
+        "ssj_id": {"$in": ssj_ids},
+        "hw": headword,
+    }))
+    id_map = {entry["ssj_id"]: entry["sense_id"] for entry in db_results}
+
+    ret_frames = frames_from_sense_ids(raw_frames, id_map)
+
+    # sort: frames with senses to top
+    senses_defined = [
+        frame for frame in ret_frames
+        if frame.sense_info["sense_id"] != SENSE_UNDEFINED
+    ]
+    senses_undefined = [
+        frame for frame in ret_frames
+        if frame.sense_info["sense_id"] == SENSE_UNDEFINED
+    ]
+    return senses_defined + senses_undefined
+
+
+def frames_from_sense_ids(raw_frames, id_map):
+    """Merge frames that were assigned the same sense.
+
+    Args:
+        raw_frames: Frame objects; the first tid of each frame is its
+            ssj_id in the long form (S123.t12).
+        id_map: dict {ssj_id: sense_id}, keyed by the long ssj_id
+            (S123.t12) or by its short form (S123).
+
+    Returns:
+        One Frame per sense id, holding the tids of all its frames and
+        one Slot per functor (with `count` = number of merged slots).
+        Frames without a sense are grouped under SENSE_UNDEFINED. Each
+        result has sense_info = {"sense_id": <its sense id>}.
+    """
+    # id_dict = dict {
+    #   sense_id: [frame, ...]
+    # }
+    id_dict = {}
+    for frame in raw_frames:
+        # long version ssj_id (S123.t12)
+        frame_ssj_id = frame.tids[0]
+        frame_sense_id = id_map.get(frame_ssj_id)
+        if frame_sense_id is None:
+            # try short version ssj_id (S123)
+            frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1])
+            frame_sense_id = id_map.get(frame_ssj_id)
+
+        # set default if sense_id not found
+        if frame_sense_id is None:
+            frame_sense_id = SENSE_UNDEFINED
+        id_dict.setdefault(frame_sense_id, []).append(DC(frame))
+
+    ret_frames = []
+    for sense_id, frames in id_dict.items():
+        tids = []
+        reduced_slots = {}  # functor -> merged Slot
+        for frame in frames:
+            tids.extend(frame.tids)
+            for slot in frame.slots:
+                # if functor not in reduced slots,
+                # add new slot; else increase count
+                rslot = reduced_slots.get(slot.functor)
+                if rslot is None:
+                    reduced_slots[slot.functor] = Slot(
+                        functor=slot.functor,
+                        # own list, so later extends don't touch `slot`
+                        tids=list(slot.tids),
+                        count=1
+                    )
+                else:
+                    rslot.count += 1
+                    rslot.tids.extend(slot.tids)
+        reduced_frame = Frame(tids, slots=list(reduced_slots.values()))
+        # The grouping key already is the resolved sense id (or
+        # SENSE_UNDEFINED); looking it up again could disagree with the
+        # grouping for falsy ids.
+        reduced_frame.sense_info = {
+            "sense_id": sense_id
+        }
+        reduced_frame.sort_slots()
+        ret_frames.append(reduced_frame)
+    return ret_frames
+
+
+# Registry of the reduction functions, keyed by name.
+# Each entry holds:
+#   "f":           the reduction function itself
+#   "desc":        a longer description (Slovenian text)
+#   "simple_name": a short label (Slovenian text)
+reduce_functions = {
+    "reduce_0": {
+        "f": reduce_0,
+        "desc":
+        "Vsaka pojavitev glagola dobi svoj stavčni vzorec.",
+        "simple_name": "posamezni stavki"
+    },
+    "reduce_1": {
+        "f": reduce_1,
+        "desc":
+        "Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog.",
+        "simple_name": "združeni stavki"
+    },
+    "reduce_3": {
+        "f": reduce_3,
+        "desc":
+        "Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. "
+        "Pomeni so dodeljeni s pomočjo algoritma Simple Lesk.",
+        "simple_name": "SSKJ_pomeni"
+    },
+    "reduce_4": {
+        "f": reduce_4,
+        "desc":
+        "Združevanje stavčnih vzorcev na osnovi pomenov povedi "
+        "s pomočjo algoritma K-Means. Število predvidenih pomenov "
+        "podano na osnovi SSKJ.",
+        "simple_name": "KMeans_pomeni"
+    },
+    "reduce_5": {
+        "f": reduce_5,
+        "desc":
+        "Uporabniško dodeljeni pomeni povedi.",
+        "simple_name": "po meri"
+    }
+}