forked from kristjan/cjvt-valency

commit 9e3254de8e: Merge branch 'master' of gitea.cjvt.si:kristjan/cjvt-valency

Makefile (30 lines changed)
@@ -8,13 +8,13 @@ MAKE_ROOT = $(shell pwd)
 # kres is composed of many .xml files
 # I generated srl tags for kres in separate .json files
 # (for each kres.xml file there is a kres.json file with srl tags)
-SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
-KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
-# KRES_SRL_FOLDER = "/home/kristjan/kres_srl/final_json/" # t420
-KRES_SRL_FOLDER = "/home/voje/work_data/final_json" # work-pc
+SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
+KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
+KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
 
 OUTPUT = "db"
-OUTDIR = "/home/voje/workdir/test_out"
+# OUTPUT = "file"
+OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
 DBADDR = "0.0.0.0:27017" # don't use localhost
 
 MONGOEXPRESS_USER = mxuser
@@ -23,6 +23,8 @@ DB_ADM_USER = valadmin
 DB_ADM_PASS = valadminpass
 DB_USR_USER = valuser
 DB_USR_PASS = valuserpass
 
+N_CORES = 1
+
 export
 
 .PHONY: python-env fill-database
@@ -59,14 +61,17 @@ data/samples:
 	cd data; tar xzvf samples.tar.gz
 
 # from inside python-env container:
+# you can set OUTPUT = "file" and a valid OUTDIR to test writing to json files instead of DB
 fill-database: data/samples
 	python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(KRES_FOLDER) \
 		--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) \
 		--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
-		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS)
+		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
+		--cores $(N_CORES)
 
 
 ## Frontend
 
 ## Run from host
 ## See src/frontend_vue/README.md for port settings etc.
 frontend-dev:
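Review note: the new --cores flag implies parallel parsing inside cjvt-corpusparser, whose main.py is not part of this diff. A minimal sketch of how such a flag is usually wired to a process pool, with all names hypothetical:

```python
# Hypothetical sketch only; cjvt-corpusparser's actual main.py is not shown here.
# Illustrates the common pattern behind a --cores flag: fan per-file work out
# to a multiprocessing pool sized by the argument.
import argparse
from glob import glob
from multiprocessing import Pool

def parse_one(path):
    # stand-in for parsing one kres .xml file and its .json SRL companion
    return path, "ok"

if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--kres-folder", required=True)
    ap.add_argument("--cores", type=int, default=1)
    args = ap.parse_args()
    files = sorted(glob(args.kres_folder + "/*.xml"))
    with Pool(processes=args.cores) as pool:
        for path, status in pool.imap_unordered(parse_one, files):
            print(path, status)
```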
@@ -77,12 +82,15 @@ frontend-prod:
 
 
 ## Backend
-backend-dev-init: python-env-install
-	cd ./src/backend_flask; python3 app.py \
-		--config-file ./conf_files/dev_conf_init.yaml \
-		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
-
-backend-dev: python-env-install
+# runs once and exits before the app starts
+backend-prepare-db:
+	cd ./src/backend_flask; python3 app.py \
+		--config-file ./conf_files/dev_conf.yaml \
+		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
+		--prepare-db
+
+backend-dev:
 	cd ./src/backend_flask; python3 app.py \
 		--config-file ./conf_files/dev_conf.yaml \
 		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
 
README.md (13 lines changed)

@@ -37,6 +37,7 @@ $ make python-env
 $ make python-env-install
 
 # run the code
+# beforehand, set the data files in Makefile
 $ make fill-database
 ```
 
@@ -46,11 +47,17 @@ If all goes well, we should be able to inspect the database, filled with corpora
 ### Flask backend (1 container)
 Relies heavily on the database. Set that up first.
 ```bash
 # spin up container
 $ make python-env
 
-# development:
-$ make backend-dev-init # run the first time, to prepare the db, then kill
-$ make backend-dev # debug with this one
+# install our packages
+$ make python-env-install
+
+# needs to be run once to modify a new database
+$ make backend-prepare-db
+
+# with debugger
+$ make backend-dev
 
 # production
 $ make backend-prod
(file name not shown in this view; deleted)
@@ -1 +0,0 @@
-/home/kristjan/kres_srl/final_json/

(file name not shown in this view; deleted)
@@ -1 +0,0 @@
-/home/voje/work_data/final_json

(file name not shown in this view; deleted)
@@ -1 +0,0 @@
-/home/kristjan/kres_srl/final_json/

Binary file not shown.
src/backend_flask/app.py

@@ -235,28 +235,6 @@ def api_token():
 
 # FRAMES ----------------------------.
 
-def prepare_frames(ret_frames):
-    # append sentences
-    for frame in ret_frames:
-        unique_sids = {".".join(x.split(".")[:-1]): x for x in frame.tids}
-        # frame.sentences = []
-        frame.aggr_sent = {}
-        # sid, tid==hw
-        for sid, tid in unique_sids.items():
-            # hwl = vallex.get_token(tid)["lemma"]
-            hwl = frame.hw
-            tmp_idx = len(frame.sentences)
-            if hwl not in frame.aggr_sent:
-                frame.aggr_sent[hwl] = []
-            frame.aggr_sent[hwl].append(tmp_idx)
-    # return (n-frames, rendered template)
-    # json frames
-    json_ret = {"frames": []}
-    for frame in ret_frames:
-        json_ret["frames"].append(DC(frame.to_json()))
-    return json.dumps(json_ret)
-
-
 # input: hw, reduct_function
 @app.route("/api/frames")
 def api_get_frames():
@@ -280,8 +258,34 @@ def api_get_frames():
     frames = [x for x in frames if x.hw == hw]
 
     ret_frames = RF(frames, valdb[SENSEMAP_COLL])
-    return prepare_frames(ret_frames)
+
+    json_ret = {"frames": []}
+    for frame in ret_frames:
+        json_ret["frames"].append(frame.to_json())
+    return json.dumps(json_ret)
+    # return prepare_frames(ret_frames)
+
+
+def _aggregate_by_hw(ret_frames):
+
+    def _tid_to_lemma(tid, sentence):
+        # slow and hackish
+        for pair in sentence:
+            if pair[0] == tid:
+                return pair[1]["lemma"]
+        return None
+
+    # append sentences
+    for frame in ret_frames:
+        # unique_sids = {".".join(x.split(".")[:-1]): x for x in frame.tids}
+        frame.aggr_sent = {} # map of headword: [sentence indexes]
+        # sid, tid==hw
+        for i, tid in enumerate(frame.tids):
+            # hwl = vallex.get_token(tid)["lemma"]
+            hwl = _tid_to_lemma(tid, frame.sentences[i])
+            if hwl not in frame.aggr_sent:
+                frame.aggr_sent[hwl] = []
+            frame.aggr_sent[hwl].append(i)
+    return ret_frames
+
+
 # input: functor, reduce_function
 @app.route("/api/functor-frames")
@@ -302,12 +306,17 @@ def api_get_functor_frames():
     for ent in cur:
         frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
 
+    for f in frames:
+        print(f.to_json())
     # filter by relevant functor
     frames = [x for x in frames if functor in x.get_functors()]
 
     # raw_frames = vallex.functors_index[functor] # TODO
     ret_frames = RF(frames, valdb[SENSEMAP_COLL])
-    return prepare_frames(ret_frames)
+    ret_frames = _aggregate_by_hw(ret_frames)
+
+    json_ret = {"frames": []}
+    for frame in ret_frames:
+        json_ret["frames"].append(DC(frame.to_json()))
+    return json.dumps(json_ret)
 
 # FRAMES ----------------------------^
 
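Review note: a quick smoke test for the reworked endpoint against a dev instance. Host and port come from dev_conf.yaml below; the query-parameter name and the sample functor "ACT" are assumptions, since the handler's argument parsing sits outside this hunk:

```python
# Hypothetical smoke test; "functor" as the parameter name and "ACT" as a
# sample value are assumptions, not confirmed by this diff.
import json
import urllib.request

url = "http://localhost:5004/api/functor-frames?functor=ACT"
with urllib.request.urlopen(url) as resp:
    frames = json.loads(resp.read())["frames"]
print(len(frames), "frames returned")
```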
@@ -422,33 +431,8 @@ def api_senses_update():
 
 # APP PREFLIGHT ---------------------.
 
-def prepare_db():
-    def helper_tid_to_token(tid, tokens):
-        for t in tokens:
-            if t["tid"] == tid:
-                return t
-        return None
-
-    # update entries (add headwords and fuctors for indexing)
-    for corpus in CORPORA:
-        for e in valdb[corpus].find({}):
-            if e["srl_links"] is None:
-                e["headwords"] = []
-                e["functors"] = []
-            else:
-                hw_tids = list(set([x["from"] for x in e["srl_links"]]))
-                hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
-                headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
-                e["headwords"] = headwords
-
-                functors = list(set([x["afun"] for x in e["srl_links"]]))
-                e["functors"] = functors
-
-                valdb[corpus].save(e)
-
-        valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
-        valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
-
 def prepare_app_index():
     log.info("[*] preparing app_index")
     # create app_index (used in frontend, left side word index)
     tmp_app_index = {c: {} for c in CORPORA}
     for corpus in CORPORA:
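Review note: the headword rule removed here reappears in the new preprocess.py below. Demonstrated in isolation, assuming Slovene MSD tags where a leading "G" marks a verb (the sample tokens are invented):

```python
# The prepare_db() headword rule, standalone: verbs keep the bare lemma,
# every other part of speech gets "_" appended. MSD strings are illustrative.
tokens = [
    {"lemma": "hoditi", "msd": "Ggnn"},  # "G..." = verb -> "hoditi"
    {"lemma": "miza", "msd": "Sozei"},   # non-verb -> "miza_"
]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in tokens]
print(headwords)  # ['hoditi', 'miza_']
```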
@@ -495,6 +479,7 @@ if __name__ == "__main__":
     print("Starting app.py main()")
     aparser = argparse.ArgumentParser(description="Arguments for app.py")
     aparser.add_argument("--config-file", type=str, help="check ./conf_files/")
+    aparser.add_argument('--prepare-db', action="store_true", default=False)
     aparser.add_argument("--dbuser", type=str)
     aparser.add_argument("--dbpass", type=str)
     aparser.add_argument("--dbaddr", type=str)
@@ -521,8 +506,9 @@ if __name__ == "__main__":
     )
     valdb = client.valdb
 
-    if bool(config["prepare_db"]):
-        prepare_db()
+    if args.prepare_db:
         prepare_app_index()
+        sys.exit()
 
     # app index from db
     app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"]
src/backend_flask/conf_files/dev_conf.yaml

@@ -3,5 +3,4 @@ debug: True
 port: 5004
 host: localhost
 logfile: "/var/log/valency_backend.log"
-prepare_db: False
 ---

deleted file (presumably src/backend_flask/conf_files/dev_conf_init.yaml, dropped from the Makefile above)

@@ -1,7 +0,0 @@
----
-debug: True
-port: 5004
-host: localhost
-logfile: "/var/log/valency_backend.log"
-prepare_db: True
----
src/backend_flask/preprocess.py (new file, 70 lines)

@@ -0,0 +1,70 @@
+CORPORA = ["kres", "ssj"]
+
+if __name__ == "__main__":
+
+    valdb = None
+
+    def helper_tid_to_token(tid, tokens):
+        for t in tokens:
+            if t["tid"] == tid:
+                return t
+        return None
+
+    # update entries (add headwords and functors for indexing)
+    for corpus in CORPORA:
+        for e in valdb[corpus].find({}):
+            if e["srl_links"] is None:
+                e["headwords"] = []
+                e["functors"] = []
+            else:
+                hw_tids = list(set([x["from"] for x in e["srl_links"]]))
+                hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
+                headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
+                e["headwords"] = headwords
+
+                functors = list(set([x["afun"] for x in e["srl_links"]]))
+                e["functors"] = functors
+
+                valdb[corpus].save(e)
+
+        valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
+        valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
+
+    # create app_index (used in frontend, left side word index)
+    tmp_app_index = {c: {} for c in CORPORA}
+    for corpus in CORPORA:
+        res_hws = {}
+        res_fns = {}
+        for e in valdb[corpus].find({}):
+            if "headwords" not in e:
+                continue
+            for hw in e["headwords"]:
+                if hw in res_hws:
+                    res_hws[hw] += 1
+                else:
+                    res_hws[hw] = 1
+            if "functors" not in e:
+                continue
+            for fn in e["functors"]:
+                if fn in res_fns:
+                    res_fns[fn] += 1
+                else:
+                    res_fns[fn] = 1
+
+        alphabetical = {}
+        for k, e in res_hws.items():
+            fst = k[0].lower()
+            if fst in alphabetical:
+                alphabetical[fst].append((k, e))
+            else:
+                alphabetical[fst] = [(k, e)]
+
+        for k, e in alphabetical.items():
+            alphabetical[k] = sorted(e, key=lambda x: x[0])
+        tmp_app_index[corpus]["words"] = alphabetical
+
+        functors = [(k, e) for (k, e) in res_fns.items()]
+        functors = sorted(functors, key=lambda x: x[0])
+        tmp_app_index[corpus]["functors"] = functors
+
+    valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)
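Review note: as committed, preprocess.py sets valdb = None and then dereferences it, and it uses pymongo.ASCENDING without importing pymongo, so it cannot run yet. A minimal sketch of the missing wiring, reusing the Makefile's default credentials and address (an assumption about the intended deployment):

```python
# Hypothetical setup for preprocess.py's valdb handle. User, password, and
# address mirror the Makefile defaults (DB_USR_USER, DB_USR_PASS, DBADDR);
# adjust for a real deployment.
import pymongo

client = pymongo.MongoClient(
    "0.0.0.0",
    27017,
    username="valuser",
    password="valuserpass",
)
valdb = client.valdb
```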
(Vue component under src/frontend_vue; file name not shown in this view)

@@ -169,7 +169,7 @@ export default {
       }
       this.sentences = {}
       for (var fi in this.frames) {
-        console.log(this.frames[fi].sentences)
+        // console.log(this.frames[fi].sentences)
         for (var si in this.frames[fi].sentences) {
           var sentence = this.frames[fi].sentences[si]
           // get ssj_id without .t123
submodule pointer (likely src/pkg/cjvt-corpusparser, referenced by the Makefile)

@@ -1 +1 @@
-Subproject commit 2b7339ac5abb52958f7875a3e0a0eb1899728730
+Subproject commit f0b0abac1bd32ad6e9e29e7b737e4162e28568c2
(valency package module defining Frame; file name not shown in this view)

@@ -50,6 +50,10 @@ class Frame():
         self.sentences = sentences
         self.aggr_sent = None # Dictionary { hw: self.sentences idx }
 
+    def get_functors(self):
+        return [slot.functor for slot in self.slots]
+
+
     def to_json(self):
         ret = {
             "hw": self.hw,