forked from kristjan/cjvt-valency
modifying frames api
This commit is contained in:
parent
d84ad9e163
commit
1f83f96267
1
Makefile
1
Makefile
|
@ -48,6 +48,7 @@ python-env:
|
||||||
# inside the container, install our packages
|
# inside the container, install our packages
|
||||||
python-env-install:
|
python-env-install:
|
||||||
pip3 install -e src/pkg/cjvt-corpusparser/.
|
pip3 install -e src/pkg/cjvt-corpusparser/.
|
||||||
|
pip3 install -e src/pkg/valency/.
|
||||||
|
|
||||||
# from inside python-env container:
|
# from inside python-env container:
|
||||||
data/samples:
|
data/samples:
|
||||||
|
|
|
@ -44,7 +44,9 @@ If all goes well, we should be able to inspect the database, filled with corpora
|
||||||
### Flask backend (1 container)
|
### Flask backend (1 container)
|
||||||
Relies heavily on the database. Set that up first.
|
Relies heavily on the database. Set that up first.
|
||||||
```bash
|
```bash
|
||||||
# $ make backend=dev # development
|
$ make python-env
|
||||||
|
|
||||||
|
# $ make backend-dev # development
|
||||||
$ make backend-prod
|
$ make backend-prod
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -73,7 +73,7 @@ class Slot():
|
||||||
# It consists of different tokens.
|
# It consists of different tokens.
|
||||||
def __init__(self, functor, tids=None, count=None):
|
def __init__(self, functor, tids=None, count=None):
|
||||||
self.functor = functor
|
self.functor = functor
|
||||||
self.tids = tids or []
|
self.tids = tids or [] # combining multiple sentences vertically
|
||||||
self.count = count or 1
|
self.count = count or 1
|
||||||
|
|
||||||
def to_string(self):
|
def to_string(self):
|
||||||
|
|
|
@ -10,6 +10,9 @@ log = logging.getLogger(__name__)
|
||||||
|
|
||||||
SENSE_UNDEFINED = "nedefinirano"
|
SENSE_UNDEFINED = "nedefinirano"
|
||||||
|
|
||||||
|
## TODO: use frame.py
|
||||||
|
## TODO: build a list of [Frame] with lists of [Slot]
|
||||||
|
|
||||||
|
|
||||||
def sorted_by_len_tids(frames):
|
def sorted_by_len_tids(frames):
|
||||||
return sorted(
|
return sorted(
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from flask import Flask, render_template, request, url_for, redirect
|
from flask import Flask, render_template, request, url_for, redirect
|
||||||
|
from valency import Frame, Slot
|
||||||
|
from valency.reduce_functions import reduce_functions
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from valency import k_utils
|
from valency import k_utils
|
||||||
|
@ -26,12 +28,19 @@ from pathlib import Path
|
||||||
from pymongo import MongoClient
|
from pymongo import MongoClient
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
# some db collections
|
||||||
|
USERS_COLL = "users"
|
||||||
|
TOKENS_COLL = "usertokens"
|
||||||
|
SENSES_COLL = "senses"
|
||||||
|
SENSEMAP_COLL = "sensemap"
|
||||||
|
|
||||||
|
# pre-generated data (gui leftside word index)
|
||||||
CORPORA = ["ssj", "kres"]
|
CORPORA = ["ssj", "kres"]
|
||||||
|
app_index = {c: {} for c in CORPORA}
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
app_index = {c: {} for c in CORPORA}
|
|
||||||
|
|
||||||
# when running vuejs via webpack
|
# when running vuejs via webpack
|
||||||
# CORS(app)
|
# CORS(app)
|
||||||
|
@ -41,23 +50,7 @@ app_index = {c: {} for c in CORPORA}
|
||||||
CORS(app)
|
CORS(app)
|
||||||
|
|
||||||
|
|
||||||
# for testing functions
|
# INDEX SELECTION -------------------.
|
||||||
@app.route("/test_dev")
|
|
||||||
def test_dev():
|
|
||||||
ret = vallex.test_dev()
|
|
||||||
return(str(ret) or "edit val_struct.py: test_dev()")
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/")
|
|
||||||
def index():
|
|
||||||
return(render_template("index.html"))
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/home", defaults={"pathname": ""})
|
|
||||||
@app.route("/home/<path:pathname>")
|
|
||||||
def home(pathname):
|
|
||||||
return redirect(url_for("index"), code=302)
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/words/<corpus>")
|
@app.route("/api/words/<corpus>")
|
||||||
def api_words(corpus):
|
def api_words(corpus):
|
||||||
|
@ -69,10 +62,13 @@ def api_words(corpus):
|
||||||
def api_functors(corpus):
|
def api_functors(corpus):
|
||||||
return json.dumps(app_index[corpus]["functors"])
|
return json.dumps(app_index[corpus]["functors"])
|
||||||
|
|
||||||
|
# INDEX SELECTION -------------------^
|
||||||
|
|
||||||
|
|
||||||
|
# AUTH ------------------------------.
|
||||||
|
|
||||||
@app.route("/api/register", methods=["POST"])
|
@app.route("/api/register", methods=["POST"])
|
||||||
def api_register():
|
def api_register():
|
||||||
USERS_COLL = "v2_users"
|
|
||||||
b = request.get_data()
|
b = request.get_data()
|
||||||
data = json.loads(b.decode())
|
data = json.loads(b.decode())
|
||||||
username = data["username"]
|
username = data["username"]
|
||||||
|
@ -84,7 +80,7 @@ def api_register():
|
||||||
email == ""
|
email == ""
|
||||||
):
|
):
|
||||||
return "ERR"
|
return "ERR"
|
||||||
existing = list(vallex.db[USERS_COLL].find({
|
existing = list(valdb[USERS_COLL].find({
|
||||||
"$or": [{"username": username}, {"email": email}]
|
"$or": [{"username": username}, {"email": email}]
|
||||||
}))
|
}))
|
||||||
if len(existing) > 0:
|
if len(existing) > 0:
|
||||||
|
@ -96,21 +92,19 @@ def api_register():
|
||||||
"email": hashlib.sha256(
|
"email": hashlib.sha256(
|
||||||
email.encode("utf-8")).hexdigest()
|
email.encode("utf-8")).hexdigest()
|
||||||
}
|
}
|
||||||
vallex.db[USERS_COLL].insert(entry)
|
valdb[USERS_COLL].insert(entry)
|
||||||
return "OK"
|
return "OK"
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/login", methods=["POST"])
|
@app.route("/api/login", methods=["POST"])
|
||||||
def api_login():
|
def api_login():
|
||||||
USERS_COLL = "v2_users"
|
|
||||||
TOKENS_COLL = "v2_user_tokens"
|
|
||||||
b = request.get_data()
|
b = request.get_data()
|
||||||
data = json.loads(b.decode())
|
data = json.loads(b.decode())
|
||||||
username = data["username"]
|
username = data["username"]
|
||||||
password = data["password"]
|
password = data["password"]
|
||||||
hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()
|
hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
db_user = list(vallex.db[USERS_COLL].find({
|
db_user = list(valdb[USERS_COLL].find({
|
||||||
"username": username,
|
"username": username,
|
||||||
"hpass": hpass
|
"hpass": hpass
|
||||||
}))
|
}))
|
||||||
|
@ -124,7 +118,7 @@ def api_login():
|
||||||
"date": datetime.datetime.utcnow(),
|
"date": datetime.datetime.utcnow(),
|
||||||
"token": token
|
"token": token
|
||||||
}
|
}
|
||||||
vallex.db[TOKENS_COLL].update(
|
valdb[TOKENS_COLL].update(
|
||||||
{"username": token_entry["username"]},
|
{"username": token_entry["username"]},
|
||||||
token_entry,
|
token_entry,
|
||||||
upsert=True
|
upsert=True
|
||||||
|
@ -167,7 +161,7 @@ def api_new_pass():
|
||||||
username = data["username"]
|
username = data["username"]
|
||||||
email = data["email"]
|
email = data["email"]
|
||||||
hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
|
hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
|
||||||
db_res = list(vallex.db.v2_users.find({
|
db_res = list(valdb[USERS_COLL].find({
|
||||||
"username": username,
|
"username": username,
|
||||||
"email": hemail
|
"email": hemail
|
||||||
}))
|
}))
|
||||||
|
@ -179,7 +173,7 @@ def api_new_pass():
|
||||||
string.ascii_letters + string.digits) for i in range(10)])
|
string.ascii_letters + string.digits) for i in range(10)])
|
||||||
# update locally
|
# update locally
|
||||||
hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
|
hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
|
||||||
vallex.db.v2_users.update(
|
valdb[USERS_COLL].update(
|
||||||
{
|
{
|
||||||
"username": username,
|
"username": username,
|
||||||
"email": hemail
|
"email": hemail
|
||||||
|
@ -193,6 +187,39 @@ def api_new_pass():
|
||||||
return json.dumps({"confirmation": True})
|
return json.dumps({"confirmation": True})
|
||||||
|
|
||||||
|
|
||||||
|
def token_to_username(token):
    """Return the username that owns ``token``, or None if it is not valid.

    A token is accepted only when exactly one document in the tokens
    collection carries it. On success the document's ``date`` field is set
    to the current UTC time (presumably the timestamp an expiry job looks
    at -- verify against the DB setup).
    """
    query = {"token": token}
    matches = list(valdb[TOKENS_COLL].find(query))
    if len(matches) != 1:
        return None
    username = matches[0]["username"]
    # update deletion interval
    refreshed = {"$set": {"date": datetime.datetime.utcnow()}}
    valdb[TOKENS_COLL].update(query, refreshed)
    return username
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/token", methods=["POST"])
def api_token():
    """Check whether the token in the POSTed JSON body is valid.

    Responds with a JSON object holding ``confirmation`` (bool) and
    ``username`` (the token's owner, or null when the token is unknown).
    """
    payload = json.loads(request.get_data().decode())
    username = token_to_username(payload.get("token"))
    response = {
        "confirmation": username is not None,
        "username": username
    }
    return json.dumps(response)
|
||||||
|
|
||||||
|
# AUTH ------------------------------^
|
||||||
|
|
||||||
|
|
||||||
|
# FRAMES ----------------------------.
|
||||||
|
|
||||||
def prepare_frames(ret_frames):
|
def prepare_frames(ret_frames):
|
||||||
# append sentences
|
# append sentences
|
||||||
for frame in ret_frames:
|
for frame in ret_frames:
|
||||||
|
@ -218,19 +245,21 @@ def prepare_frames(ret_frames):
|
||||||
return json.dumps(json_ret)
|
return json.dumps(json_ret)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/frames")
|
# input: hw, reduct_function
|
||||||
|
@app.route("/api/hw-frames")
|
||||||
def api_get_frames():
|
def api_get_frames():
|
||||||
hw = request.args.get("hw")
|
hw = request.args.get("hw")
|
||||||
if hw is None:
|
if hw is None:
|
||||||
return json.dumps({"error": "Headword not found."})
|
return json.dumps({"error": "Required argument: hw (headword)."})
|
||||||
|
|
||||||
rf_name = request.args.get("rf", "reduce_0") # 2nd is default
|
rf_name = request.args.get("rf", "reduce_0") # 2nd is default
|
||||||
RF = reduce_functions[rf_name]["f"]
|
RF = reduce_functions[rf_name]["f"]
|
||||||
entry = vallex.entries[hw]
|
entry = vallex.entries[hw] # TODO hw -> [Frame,]
|
||||||
ret_frames = RF(entry.raw_frames, vallex)
|
ret_frames = RF(entry.raw_frames, vallex)
|
||||||
return prepare_frames(ret_frames)
|
return prepare_frames(ret_frames)
|
||||||
|
|
||||||
|
|
||||||
|
# input: functor, reduce_function
|
||||||
@app.route("/api/functor-frames")
|
@app.route("/api/functor-frames")
|
||||||
def api_get_functor_frames():
|
def api_get_functor_frames():
|
||||||
functor = request.args.get("functor")
|
functor = request.args.get("functor")
|
||||||
|
@ -238,49 +267,23 @@ def api_get_functor_frames():
|
||||||
return json.dumps({"error": "Missing argument: functor."})
|
return json.dumps({"error": "Missing argument: functor."})
|
||||||
rf_name = request.args.get("rf", "reduce_0") # 2nd is default
|
rf_name = request.args.get("rf", "reduce_0") # 2nd is default
|
||||||
RF = reduce_functions[rf_name]["f"]
|
RF = reduce_functions[rf_name]["f"]
|
||||||
raw_frames = vallex.functors_index[functor]
|
raw_frames = vallex.functors_index[functor] # TODO
|
||||||
ret_frames = RF(raw_frames, vallex)
|
ret_frames = RF(raw_frames, vallex)
|
||||||
return prepare_frames(ret_frames)
|
return prepare_frames(ret_frames)
|
||||||
|
|
||||||
|
# FRAMES ----------------------------^
|
||||||
def token_to_username(token):
|
|
||||||
COLLNAME = "v2_user_tokens"
|
|
||||||
key = {
|
|
||||||
"token": token
|
|
||||||
}
|
|
||||||
res = list(vallex.db[COLLNAME].find(key))
|
|
||||||
if len(res) != 1:
|
|
||||||
return None
|
|
||||||
username = res[0]["username"]
|
|
||||||
# update deletion interval
|
|
||||||
vallex.db[COLLNAME].update(
|
|
||||||
key, {"$set": {"date": datetime.datetime.utcnow()}})
|
|
||||||
return username
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/token", methods=["POST"])
|
# SENSES ----------------------------.
|
||||||
def api_token():
|
|
||||||
# check if token is valid
|
|
||||||
b = request.get_data()
|
|
||||||
data = json.loads(b.decode())
|
|
||||||
token = data.get("token")
|
|
||||||
# user = data.get("user")
|
|
||||||
user = token_to_username(token)
|
|
||||||
confirm = (user is not None)
|
|
||||||
return json.dumps({
|
|
||||||
"confirmation": confirm,
|
|
||||||
"username": user
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/senses/get")
|
@app.route("/api/senses/get")
|
||||||
def api_senses_get():
|
def api_senses_get():
|
||||||
# returns senses and mapping for hw
|
# returns senses and mapping for hw
|
||||||
hw = request.args.get("hw")
|
hw = request.args.get("hw")
|
||||||
senses = list(vallex.db["v2_senses"].find({
|
senses = list(valdb[SENSES_COLL].find({
|
||||||
"hw": hw
|
"hw": hw
|
||||||
}))
|
}))
|
||||||
sense_map_query = list(vallex.db["v2_sense_map"].find({
|
sense_map_query = list(valdb[SENSEMAP_COLL].find({
|
||||||
"hw": hw
|
"hw": hw
|
||||||
}))
|
}))
|
||||||
# aggregation by max date possible on DB side
|
# aggregation by max date possible on DB side
|
||||||
|
@ -358,7 +361,7 @@ def api_senses_update():
|
||||||
id_map[frontend_sense_id] = new_sense_id
|
id_map[frontend_sense_id] = new_sense_id
|
||||||
|
|
||||||
# insert into db
|
# insert into db
|
||||||
vallex.db["v2_senses"].insert(ns)
|
valdb[SENSES_COLL].insert(ns)
|
||||||
|
|
||||||
# replace tmp_id with mongo's _id
|
# replace tmp_id with mongo's _id
|
||||||
for ssj_id, el in sense_map.items():
|
for ssj_id, el in sense_map.items():
|
||||||
|
@ -373,9 +376,14 @@ def api_senses_update():
|
||||||
"date": datetime.datetime.utcnow()
|
"date": datetime.datetime.utcnow()
|
||||||
}
|
}
|
||||||
# vallex.db["v2_sense_map"].update(key, data, upsert=True)
|
# vallex.db["v2_sense_map"].update(key, data, upsert=True)
|
||||||
vallex.db["v2_sense_map"].insert(data)
|
valdb[SENSEMAP_COLL].insert(data)
|
||||||
return "OK"
|
return "OK"
|
||||||
|
|
||||||
|
# SENSES ----------------------------^
|
||||||
|
|
||||||
|
|
||||||
|
# APP PREFLIGHT ---------------------.
|
||||||
|
|
||||||
def prepare_db():
|
def prepare_db():
|
||||||
def helper_tid_to_token(tid, tokens):
|
def helper_tid_to_token(tid, tokens):
|
||||||
for t in tokens:
|
for t in tokens:
|
||||||
|
@ -384,7 +392,7 @@ def prepare_db():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# update entries (add headwords and fuctors for indexing)
|
# update entries (add headwords and fuctors for indexing)
|
||||||
for corpus in ["ssj", "kres"]:
|
for corpus in CORPORA:
|
||||||
for e in valdb[corpus].find({}):
|
for e in valdb[corpus].find({}):
|
||||||
if e["srl_links"] is None:
|
if e["srl_links"] is None:
|
||||||
continue
|
continue
|
||||||
|
@ -435,6 +443,8 @@ def prepare_db():
|
||||||
functors = sorted(functors, key=lambda x: x[0])
|
functors = sorted(functors, key=lambda x: x[0])
|
||||||
app_index[corpus]["functors"] = functors
|
app_index[corpus]["functors"] = functors
|
||||||
|
|
||||||
|
# APP PREFLIGHT ---------------------^
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("Starting app.py main()")
|
print("Starting app.py main()")
|
||||||
|
|
12
src/pkg/valency/setup.py
Normal file
12
src/pkg/valency/setup.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
"""Packaging script for the ``valency`` distribution."""
from setuptools import setup

# Single pure-Python package, no runtime dependencies declared.
setup(
    name="valency",
    version="0.1.1",
    description="Objects and functions for handling valency frames.",
    author="Kristjan Voje",
    author_email="kristjan.voje@gmail.com",
    license="MIT",
    packages=["valency"],
    install_requires=[],
)
|
96
src/pkg/valency/valency/Frame.py
Normal file
96
src/pkg/valency/valency/Frame.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Frame():
    """A valency frame: the token ids of a headword plus its slots.

    Attributes:
        hw: headword the frame belongs to (may be None).
        tids: list of token ids with the same hw_lemma. Each tid looks like
            "S123.t123"; the sentence can be fetched with
            vallex.get_sentence(S123).
        slots: list of Slot objects.
        sense_info: dict with sense data, filled in by reduce functions.
        sentences: used for passing to view in app.py, get_frames.
        aggr_sent: dictionary { hw: self.sentences idx }.
    """

    def __init__(self, tids, deep_links=None, slots=None, hw=None):
        """Build a frame from explicit ``slots`` or from ``deep_links``.

        Args:
            tids: list of token ids.
            deep_links: iterable of dicts with "functor" and "to" keys; only
                used when ``slots`` is None. None yields a frame without
                slots.
            slots: ready-made list of Slot objects; takes precedence over
                ``deep_links``.
            hw: headword.
        """
        self.hw = hw
        self.tids = tids
        if slots is None:
            self.slots = self.init_slots(deep_links)
        else:
            self.slots = slots
        self.sense_info = {}
        self.sentences = None
        self.aggr_sent = None

    def to_json(self):
        """Return the frame as a dict of JSON-serializable parts."""
        return {
            "hw": self.hw,
            "tids": self.tids,
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
        }

    def init_slots(self, deep):
        """Create one single-token Slot per deep link.

        ``deep`` may be None (no links), which gives an empty list instead
        of failing on iteration.
        """
        return [
            Slot(functor=link["functor"], tids=[link["to"]])
            for link in (deep or [])
        ]

    def sort_slots(self):
        """Order slots in place: ACT and PAT first, then the rest; each
        group alphabetically by functor."""
        # The sort is stable, so slots sharing a functor keep their
        # relative order.
        self.slots = sorted(
            self.slots,
            key=lambda slot: (slot.functor not in ("ACT", "PAT"),
                              slot.functor)
        )

    def to_string(self):
        """Return a multi-line, human-readable dump of the frame."""
        parts = [
            "Frame:\n",
            "sense_info: {}\n".format(str(self.sense_info)),
            "tids: [",
        ]
        parts.extend(str(t) + ", " for t in self.tids)
        parts.append("]\n")
        if self.slots is not None:
            parts.append("slots:\n")
            parts.extend(sl.to_string() + "\n" for sl in self.slots)
        return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class Slot():
    """One argument position of a frame.

    Each slot is identified by its functor (ACT, PAT, ...).
    It consists of different tokens.
    """

    def __init__(self, functor, tids=None, count=None):
        """Create a slot.

        Args:
            functor: functor label of the slot.
            tids: list of token ids (combining multiple sentences
                vertically); None or an empty list gives a fresh empty list.
            count: occurrence counter; None (or 0) falls back to 1.
        """
        self.functor = functor
        self.tids = tids or []
        self.count = count or 1

    def to_string(self):
        """Return a multi-line, human-readable dump of the slot."""
        parts = [
            "---- Slot:\n",
            "functor: {}\n".format(self.functor),
            "tids: [",
        ]
        parts.extend(str(t) + ", " for t in self.tids)
        # Close the tids list exactly once.
        parts.append("]\n")
        parts.append("----\n")
        return "".join(parts)

    def to_json(self):
        """Return the slot as a dict of JSON-serializable parts."""
        return {
            "functor": self.functor,
            "tids": self.tids,
            "count": self.count
        }
|
1
src/pkg/valency/valency/__init__.py
Normal file
1
src/pkg/valency/valency/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
from valency.Frame import Frame, Slot
|
96
src/pkg/valency/valency/frame.py
Normal file
96
src/pkg/valency/valency/frame.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Frame():
    """A valency frame: the token ids of a headword plus its slots.

    Attributes:
        hw: headword the frame belongs to (may be None).
        tids: list of token ids with the same hw_lemma. Each tid looks like
            "S123.t123"; the sentence can be fetched with
            vallex.get_sentence(S123).
        slots: list of Slot objects.
        sense_info: dict with sense data, filled in by reduce functions.
        sentences: used for passing to view in app.py, get_frames.
        aggr_sent: dictionary { hw: self.sentences idx }.
    """

    def __init__(self, tids, deep_links=None, slots=None, hw=None):
        """Build a frame from explicit ``slots`` or from ``deep_links``.

        Args:
            tids: list of token ids.
            deep_links: iterable of dicts with "functor" and "to" keys; only
                used when ``slots`` is None. None yields a frame without
                slots.
            slots: ready-made list of Slot objects; takes precedence over
                ``deep_links``.
            hw: headword.
        """
        self.hw = hw
        self.tids = tids
        if slots is None:
            self.slots = self.init_slots(deep_links)
        else:
            self.slots = slots
        self.sense_info = {}
        self.sentences = None
        self.aggr_sent = None

    def to_json(self):
        """Return the frame as a dict of JSON-serializable parts."""
        return {
            "hw": self.hw,
            "tids": self.tids,
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
        }

    def init_slots(self, deep):
        """Create one single-token Slot per deep link.

        ``deep`` may be None (no links), which gives an empty list instead
        of failing on iteration.
        """
        return [
            Slot(functor=link["functor"], tids=[link["to"]])
            for link in (deep or [])
        ]

    def sort_slots(self):
        """Order slots in place: ACT and PAT first, then the rest; each
        group alphabetically by functor."""
        # The sort is stable, so slots sharing a functor keep their
        # relative order.
        self.slots = sorted(
            self.slots,
            key=lambda slot: (slot.functor not in ("ACT", "PAT"),
                              slot.functor)
        )

    def to_string(self):
        """Return a multi-line, human-readable dump of the frame."""
        parts = [
            "Frame:\n",
            "sense_info: {}\n".format(str(self.sense_info)),
            "tids: [",
        ]
        parts.extend(str(t) + ", " for t in self.tids)
        parts.append("]\n")
        if self.slots is not None:
            parts.append("slots:\n")
            parts.extend(sl.to_string() + "\n" for sl in self.slots)
        return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class Slot():
    """One argument position of a frame.

    Each slot is identified by its functor (ACT, PAT, ...).
    It consists of different tokens.
    """

    def __init__(self, functor, tids=None, count=None):
        """Create a slot.

        Args:
            functor: functor label of the slot.
            tids: list of token ids (combining multiple sentences
                vertically); None or an empty list gives a fresh empty list.
            count: occurrence counter; None (or 0) falls back to 1.
        """
        self.functor = functor
        self.tids = tids or []
        self.count = count or 1

    def to_string(self):
        """Return a multi-line, human-readable dump of the slot."""
        parts = [
            "---- Slot:\n",
            "functor: {}\n".format(self.functor),
            "tids: [",
        ]
        parts.extend(str(t) + ", " for t in self.tids)
        # Close the tids list exactly once.
        parts.append("]\n")
        parts.append("----\n")
        return "".join(parts)

    def to_json(self):
        """Return the slot as a dict of JSON-serializable parts."""
        return {
            "functor": self.functor,
            "tids": self.tids,
            "count": self.count
        }
|
242
src/pkg/valency/valency/reduce_functions.py
Normal file
242
src/pkg/valency/valency/reduce_functions.py
Normal file
|
@ -0,0 +1,242 @@
|
||||||
|
# Reduction function for frames.
|
||||||
|
# Input: list of Frame objects, output: list of Frame objects.
|
||||||
|
# App uses reduce_0, 1 and 5
|
||||||
|
|
||||||
|
from valency import Frame, Slot
|
||||||
|
from copy import deepcopy as DC
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
SENSE_UNDEFINED = "nedefinirano"
|
||||||
|
|
||||||
|
## TODO: use frame.py
|
||||||
|
## TODO: build a list of [Frame] with lists of [Slot]
|
||||||
|
|
||||||
|
|
||||||
|
def sorted_by_len_tids(frames):
    """Return a new list of ``frames``, those with the most tids first.

    The input is left untouched; frames with equally many tids keep their
    relative order.
    """
    def tid_count(frame):
        return len(frame.tids)

    ordered = list(frames)
    ordered.sort(key=tid_count, reverse=True)
    return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_0(frames, vallex=None):
    """Give every token occurrence its own frame, sorted by functor list.

    The frames are first merged with reduce_1 (same set of functors), then
    each merged frame is split again into one deep copy per tid. The result
    is ordered by the concatenation of each frame's slot functors; frames
    with the same functor string keep the order in which they were split.
    ``vallex`` is accepted for a uniform reduce-function signature and is
    not used.
    """
    per_sentence = []
    for merged in reduce_1(frames):
        for tid in merged.tids:
            single = DC(merged)
            single.tids = [tid]
            per_sentence.append(single)

    def functor_string(frame):
        return "".join([slot.functor for slot in frame.slots])

    # Stable sort: ties stay in split order.
    return sorted(per_sentence, key=functor_string)
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_1(frames, vallex=None):
    """Merge frames that share the same set of functors.

    The order of functors inside a frame is not important. Each group of
    frames becomes one Frame whose tids are the concatenation of the
    group's tids and whose slots collect, per functor, the tids of all
    matching slots. The result is sorted by number of tids, largest first.
    ``vallex`` is accepted for a uniform reduce-function signature and is
    not used.
    """
    # Each group: [functor list of its first frame, member frames]
    groups = []
    for frame in frames:
        functors = [slot.functor for slot in frame.slots]
        wanted = set(functors)
        home = next((g for g in groups if set(g[0]) == wanted), None)
        if home is None:
            groups.append([functors, [frame]])
        else:
            home[1].append(frame)

    merged_frames = []
    for group_functors, members in groups:
        # All possible slots in this frame.
        slot_by_functor = {}
        for functor in group_functors:
            slot_by_functor[functor] = Slot(functor=functor)
        # Reduce slots from all frames. (Merge ACT from all frames, ...)
        all_tids = []
        for member in members:
            all_tids += member.tids
            for slot in member.slots:
                slot_by_functor[slot.functor].tids += slot.tids
        merged = Frame(tids=all_tids, slots=list(slot_by_functor.values()))
        merged.sort_slots()
        merged_frames.append(merged)
    return sorted_by_len_tids(merged_frames)
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_3(raw_frames, vallex):
    """Group frames by the sense stored in the sskj_simple_lesk collection.

    Looks up the first tid of every raw frame in
    ``vallex.db.sskj_simple_lesk`` and merges frames with the same
    ``sense_id`` via frames_from_sense_ids. When the collection provides a
    ``sense_desc`` for a sense, it is added to the merged frame's
    ``sense_info``.

    Args:
        raw_frames: list of Frame objects, each with at least one tid.
        vallex: object exposing the Mongo database as ``vallex.db``.

    Returns:
        list of merged Frame objects.
    """
    # sskj simple lesk ids
    ssj_ids = [frame.tids[0] for frame in raw_frames]
    db_results = list(vallex.db.sskj_simple_lesk.find(
        {"ssj_id": {"$in": ssj_ids}}))
    # frames_from_sense_ids uses the id_map values as dict keys, so they
    # must be the plain sense ids, not dicts.
    id_map = {}
    sense_descs = {}
    for entry in db_results:
        sense_id = entry.get("sense_id")
        id_map[entry["ssj_id"]] = sense_id
        if sense_id is not None:
            sense_descs[sense_id] = entry.get("sense_desc")
    ret_frames = frames_from_sense_ids(raw_frames, id_map)
    for frame in ret_frames:
        desc = sense_descs.get(frame.sense_info["sense_id"])
        if desc is not None:
            frame.sense_info["sense_desc"] = desc
    return ret_frames
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_4(raw_frames, vallex):
    """Group frames by the sense id stored in the kmeans collection.

    Looks up the first tid of every raw frame in ``vallex.db.kmeans`` and
    merges frames with the same ``sense_id`` via frames_from_sense_ids.

    Args:
        raw_frames: list of Frame objects, each with at least one tid.
        vallex: object exposing the Mongo database as ``vallex.db``.

    Returns:
        list of merged Frame objects.
    """
    # kmeans ids
    ssj_ids = [frame.tids[0] for frame in raw_frames]
    db_results = list(vallex.db.kmeans.find(
        {"ssj_id": {"$in": ssj_ids}}))
    # frames_from_sense_ids uses the id_map values as dict keys, so they
    # must be the plain sense ids, not dicts.
    id_map = {}
    for entry in db_results:
        id_map[entry["ssj_id"]] = entry["sense_id"]
    return frames_from_sense_ids(raw_frames, id_map)
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_5(raw_frames, vallex):
    """Group frames by user-assigned senses.

    Reads the sense mapping for the frames' headword from the
    "v2_sense_map" collection of ``vallex.db``, merges frames with the same
    sense via frames_from_sense_ids and puts frames with a defined sense
    before the undefined ones.

    Args:
        raw_frames: list of Frame objects sharing one headword; may be
            empty.
        vallex: object exposing the Mongo database as ``vallex.db``.

    Returns:
        list of merged Frame objects ([] for empty input).
    """
    if not raw_frames:
        # Nothing to group; also avoids indexing raw_frames[0] below.
        return []

    # NOTE(review): app.py in this commit writes sense maps to a collection
    # named "sensemap" -- confirm this name still matches the data source.
    USER_SENSE_COLL = "v2_sense_map"
    headword = raw_frames[0].hw
    ssj_ids_full = [frame.tids[0] for frame in raw_frames]
    # v2_sense_map stores only sentence half of ssj_id
    ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full]
    db_results = list(vallex.db[USER_SENSE_COLL].find({
        "ssj_id": {"$in": ssj_ids},
        "hw": headword,
    }))
    id_map = {}
    for entry in db_results:
        id_map[entry["ssj_id"]] = entry["sense_id"]

    ret_frames = frames_from_sense_ids(raw_frames, id_map)

    # sort: frames with senses to top
    senses_defined = [
        frame for frame in ret_frames
        if frame.sense_info["sense_id"] != SENSE_UNDEFINED
    ]
    senses_undefined = [
        frame for frame in ret_frames
        if frame.sense_info["sense_id"] == SENSE_UNDEFINED
    ]
    return senses_defined + senses_undefined
|
||||||
|
|
||||||
|
|
||||||
|
def frames_from_sense_ids(raw_frames, id_map):
    """Merge raw frames that map to the same sense id.

    Args:
        raw_frames: list of Frame objects, each with at least one tid.
        id_map: dict { ssj_id: sense_id }. The key is either the long id of
            a frame's first tid (S123.t12) or the short, sentence-only
            version (S123). Sense ids must be hashable.

    Returns:
        One Frame per sense id. Its tids are the concatenated tids of the
        group; its slots hold, per functor, the collected tids and the
        number of merged slots in ``count``. ``sense_info["sense_id"]`` is
        the sense id, or SENSE_UNDEFINED for frames without a mapping.
    """
    # id_dict = dict { sense_id: [frame, ...] }
    id_dict = {}
    for frame in raw_frames:
        # long version ssj_id (S123.t12)
        frame_ssj_id = frame.tids[0]
        frame_sense_id = id_map.get(frame_ssj_id)
        if frame_sense_id is None:
            # try short version ssj_id (S123)
            frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1])
            frame_sense_id = id_map.get(frame_ssj_id)
        # set default if sense_id not found
        if frame_sense_id is None:
            frame_sense_id = SENSE_UNDEFINED
        id_dict.setdefault(frame_sense_id, []).append(frame)

    ret_frames = []
    for sense_id, frames in id_dict.items():
        tids = []
        slot_by_functor = {}
        for frame in frames:
            tids.extend(frame.tids)
            for slot in frame.slots:
                # if functor not in reduced slots,
                # add new slot; else increase count
                rslot = slot_by_functor.get(slot.functor)
                if rslot is None:
                    # Copy the tids so the raw frame's slot is never
                    # mutated by later extends.
                    slot_by_functor[slot.functor] = Slot(
                        functor=slot.functor,
                        tids=list(slot.tids),
                        count=1
                    )
                else:
                    rslot.count += 1
                    rslot.tids.extend(slot.tids)
        reduced_frame = Frame(tids, slots=list(slot_by_functor.values()))
        # The grouping key already is the resolved sense id (or
        # SENSE_UNDEFINED); reuse it so falsy ids are labelled correctly.
        reduced_frame.sense_info = {
            "sense_id": sense_id
        }
        reduced_frame.sort_slots()
        ret_frames.append(reduced_frame)
    return ret_frames
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of the available reduce functions, keyed by the name used in
# the API's "rf" argument. Each entry carries the callable ("f"), a
# description ("desc") and a short display name ("simple_name"); the
# texts are Slovene runtime strings.
reduce_functions = {
    # one frame per occurrence
    "reduce_0": {
        "f": reduce_0,
        "desc": "Vsaka pojavitev glagola dobi svoj stavčni vzorec.",
        "simple_name": "posamezni stavki"
    },
    # frames merged by functor set
    "reduce_1": {
        "f": reduce_1,
        "desc": (
            "Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog."
        ),
        "simple_name": "združeni stavki"
    },
    # senses from the sskj_simple_lesk collection
    "reduce_3": {
        "f": reduce_3,
        "desc": (
            "Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. "
            "Pomeni so dodeljeni s pomočjo algoritma Simple Lesk."
        ),
        "simple_name": "SSKJ_pomeni"
    },
    # senses from the kmeans collection
    "reduce_4": {
        "f": reduce_4,
        "desc": (
            "Združevanje stavčnih vzorcev na osnovi pomenov povedi "
            "s pomočjo algoritma K-Means. Število predvidenih pomenov "
            "podano na osnovi SSKJ."
        ),
        "simple_name": "KMeans_pomeni"
    },
    # user-assigned senses
    "reduce_5": {
        "f": reduce_5,
        "desc": "Uporabniško dodeljeni pomeni povedi.",
        "simple_name": "po meri"
    }
}
|
Loading…
Reference in New Issue
Block a user