moved db_preparation logic to fill-database phase

This commit is contained in:
voje 2019-04-15 02:35:50 +02:00
parent 1494d4dfed
commit c2dea0bf70
8 changed files with 107 additions and 72 deletions

View File

@ -12,8 +12,8 @@ SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# OUTPUT = "db" OUTPUT = "db"
OUTPUT = "file" # OUTPUT = "file"
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017" # don't use localhost DBADDR = "0.0.0.0:27017" # don't use localhost
@ -21,6 +21,8 @@ DB_ADM_USER = valadmin
DB_ADM_PASS = valadminpass DB_ADM_PASS = valadminpass
DB_USR_USER = valuser DB_USR_USER = valuser
DB_USR_PASS = valuserpass DB_USR_PASS = valuserpass
N_CORES = 1
export export
.PHONY: python-env fill-database .PHONY: python-env fill-database
@ -63,9 +65,11 @@ fill-database: data/samples
--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) \ --ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) \
--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \ --output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
--cores 2 --cores $(N_CORES)
## Frontend ## Frontend
## Run from host ## Run from host
## See src/frontend_vue/README.md for port settings etc. ## See src/frontend_vue/README.md for port settings etc.
frontend-dev: frontend-dev:
@ -76,12 +80,15 @@ frontend-prod:
## Backend ## Backend
backend-dev-init: python-env-install
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf_init.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
backend-dev: python-env-install # runs once and exits before the app starts
backend-prepare-db:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--prepare-db
backend-dev:
cd ./src/backend_flask; python3 app.py \ cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \ --config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)

View File

@ -47,14 +47,17 @@ If all goes well, we should be able to inspect the database, filled with corpora
### Flask backend (1 container) ### Flask backend (1 container)
Relies heavily on the database. Set that up first. Relies heavily on the database. Set that up first.
```bash ```bash
# spin up container
$ make python-env $ make python-env
# development: # install our packages
# run the first time, to prepare the db, then kill $ make python-env-install
# it runs a few minutes, there should be a new collection in the db when finished
$ make backend-dev-init
$ make backend-dev # debug with this one # needs to be run once to modify a new database
$ make backend-prepare-db
# with debugger
$ make backend-dev
# production # production
$ make backend-prod $ make backend-prod

View File

@ -431,33 +431,8 @@ def api_senses_update():
# APP PREFLIGHT ---------------------. # APP PREFLIGHT ---------------------.
def prepare_db(): def prepare_app_index():
def helper_tid_to_token(tid, tokens): log.info("[*] preparing app_index")
for t in tokens:
if t["tid"] == tid:
return t
return None
# update entries (add headwords and fuctors for indexing)
for corpus in CORPORA:
for e in valdb[corpus].find({}):
if e["srl_links"] is None:
e["headwords"] = []
e["functors"] = []
else:
hw_tids = list(set([x["from"] for x in e["srl_links"]]))
hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
e["headwords"] = headwords
functors = list(set([x["afun"] for x in e["srl_links"]]))
e["functors"] = functors
valdb[corpus].save(e)
valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
# create app_index (used in frontend, left side word index) # create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in CORPORA} tmp_app_index = {c: {} for c in CORPORA}
for corpus in CORPORA: for corpus in CORPORA:
@ -504,6 +479,7 @@ if __name__ == "__main__":
print("Starting app.py main()") print("Starting app.py main()")
aparser = argparse.ArgumentParser(description="Arguments for app.py") aparser = argparse.ArgumentParser(description="Arguments for app.py")
aparser.add_argument("--config-file", type=str, help="check ./conf_files/") aparser.add_argument("--config-file", type=str, help="check ./conf_files/")
aparser.add_argument('--prepare-db', action="store_true", default=False)
aparser.add_argument("--dbuser", type=str) aparser.add_argument("--dbuser", type=str)
aparser.add_argument("--dbpass", type=str) aparser.add_argument("--dbpass", type=str)
aparser.add_argument("--dbaddr", type=str) aparser.add_argument("--dbaddr", type=str)
@ -530,8 +506,9 @@ if __name__ == "__main__":
) )
valdb = client.valdb valdb = client.valdb
if bool(config["prepare_db"]): if args.prepare_db:
prepare_db() prepare_app_index()
sys.exit()
# app index from db # app index from db
app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"] app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"]

View File

@ -3,5 +3,4 @@ debug: True
port: 5004 port: 5004
host: localhost host: localhost
logfile: "/var/log/valency_backend.log" logfile: "/var/log/valency_backend.log"
prepare_db: False
--- ---

View File

@ -1,7 +0,0 @@
---
debug: True
port: 5004
host: localhost
logfile: "/var/log/valency_backend.log"
prepare_db: True
---

View File

@ -0,0 +1,70 @@
from collections import Counter

# Names of the corpus collections that get processed.
CORPORA = ["kres", "ssj"]

# Same numeric value as pymongo.ASCENDING. Spelled out here because this file
# never imports pymongo, so referencing pymongo.ASCENDING raised NameError.
ASCENDING = 1


def headwords_and_functors(entry):
    """Derive the ``headwords`` and ``functors`` lists for one corpus entry.

    Args:
        entry: dict with an ``"srl_links"`` value (``None`` or a list of dicts
            carrying ``"from"`` and ``"afun"``) and a ``"tokens"`` list of
            dicts carrying ``"tid"``, ``"lemma"`` and ``"msd"``.

    Returns:
        ``(headwords, functors)``. Both are empty lists when ``srl_links`` is
        ``None``. A headword is the token's lemma when its ``msd`` starts with
        ``"G"`` and ``lemma + "_"`` otherwise.
        NOTE(review): "G" presumably marks verbs in the MSD tagset - confirm.
    """
    links = entry["srl_links"]
    if links is None:
        return [], []

    # One pass over the tokens instead of a linear scan per link. setdefault
    # keeps the first token for a given tid, like the original scan did.
    first_token_by_tid = {}
    for token in entry["tokens"]:
        first_token_by_tid.setdefault(token["tid"], token)

    headwords = []
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    for tid in dict.fromkeys(link["from"] for link in links):
        token = first_token_by_tid.get(tid)
        if token is None:
            # The link points at a tid that is not among the entry's tokens.
            # Previously this crashed with "'NoneType' is not subscriptable".
            continue
        msd = token["msd"]
        # Guard against an empty msd before looking at its first element.
        if msd and msd[0] == "G":
            headwords.append(token["lemma"])
        else:
            headwords.append(token["lemma"] + "_")

    functors = list(dict.fromkeys(link["afun"] for link in links))
    return headwords, functors


def update_entries(valdb, corpora=CORPORA):
    """Store ``headwords``/``functors`` on every entry and index both fields.

    Args:
        valdb: database handle; ``valdb[corpus]`` must provide ``find``,
            ``update_one`` and ``create_index``.
        corpora: collection names to process.
    """
    for corpus in corpora:
        collection = valdb[corpus]
        for entry in collection.find({}):
            headwords, functors = headwords_and_functors(entry)
            # Only the two derived fields are written back, instead of
            # re-saving the whole document with the deprecated save().
            collection.update_one(
                {"_id": entry["_id"]},
                {"$set": {"headwords": headwords, "functors": functors}},
            )
        # create_index replaces the deprecated ensure_index.
        collection.create_index([("headwords", ASCENDING)])
        collection.create_index([("functors", ASCENDING)])


def build_corpus_index(entries):
    """Count headwords and functors over ``entries`` for one corpus.

    Args:
        entries: iterable of entry dicts; entries without ``"headwords"`` are
            skipped entirely, entries without ``"functors"`` contribute only
            their headwords.

    Returns:
        dict with
        ``"words"``: ``{first_letter_lowercased: [(headword, count), ...]}``,
        each list sorted by headword, and
        ``"functors"``: ``[(functor, count), ...]`` sorted by functor.
    """
    headword_counts = Counter()
    functor_counts = Counter()
    for entry in entries:
        if "headwords" not in entry:
            continue
        headword_counts.update(entry["headwords"])
        if "functors" not in entry:
            continue
        functor_counts.update(entry["functors"])

    alphabetical = {}
    for headword, count in headword_counts.items():
        if not headword:
            # An empty headword has no first letter to file it under
            # (the original raised IndexError on k[0]).
            continue
        alphabetical.setdefault(headword[0].lower(), []).append((headword, count))
    for letter, pairs in alphabetical.items():
        alphabetical[letter] = sorted(pairs, key=lambda pair: pair[0])

    functors = sorted(functor_counts.items(), key=lambda pair: pair[0])
    return {"words": alphabetical, "functors": functors}


def build_app_index(valdb, corpora=CORPORA):
    """Build the app_index (used in frontend, left side word index)."""
    return {corpus: build_corpus_index(valdb[corpus].find({})) for corpus in corpora}


def prepare_db(valdb, corpora=CORPORA):
    """Run the whole preparation: update entries, then upsert the app index.

    The index is stored in ``valdb.appindex`` as a single document with
    ``"dockey": "appindex"``.
    """
    update_entries(valdb, corpora)
    app_index = build_app_index(valdb, corpora)
    # replace_one(..., upsert=True) replaces the deprecated update(..., upsert=True).
    valdb.appindex.replace_one(
        {"dockey": "appindex"},
        {"dockey": "appindex", "data": app_index},
        upsert=True,
    )


if __name__ == "__main__":
    # TODO: obtain a real database handle here; this file does not show how
    # the connection is meant to be established.
    valdb = None
    if valdb is None:
        # Fail with an explanation instead of a TypeError on valdb[corpus].
        raise SystemExit("no database connection configured (valdb is None)")
    prepare_db(valdb)

View File

@ -3632,14 +3632,12 @@
"balanced-match": { "balanced-match": {
"version": "1.0.0", "version": "1.0.0",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"brace-expansion": { "brace-expansion": {
"version": "1.1.11", "version": "1.1.11",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"balanced-match": "^1.0.0", "balanced-match": "^1.0.0",
"concat-map": "0.0.1" "concat-map": "0.0.1"
@ -3654,20 +3652,17 @@
"code-point-at": { "code-point-at": {
"version": "1.1.0", "version": "1.1.0",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"concat-map": { "concat-map": {
"version": "0.0.1", "version": "0.0.1",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"console-control-strings": { "console-control-strings": {
"version": "1.1.0", "version": "1.1.0",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"core-util-is": { "core-util-is": {
"version": "1.0.2", "version": "1.0.2",
@ -3784,8 +3779,7 @@
"inherits": { "inherits": {
"version": "2.0.3", "version": "2.0.3",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"ini": { "ini": {
"version": "1.3.5", "version": "1.3.5",
@ -3797,7 +3791,6 @@
"version": "1.0.0", "version": "1.0.0",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"number-is-nan": "^1.0.0" "number-is-nan": "^1.0.0"
} }
@ -3812,7 +3805,6 @@
"version": "3.0.4", "version": "3.0.4",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"brace-expansion": "^1.1.7" "brace-expansion": "^1.1.7"
} }
@ -3820,14 +3812,12 @@
"minimist": { "minimist": {
"version": "0.0.8", "version": "0.0.8",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"minipass": { "minipass": {
"version": "2.2.4", "version": "2.2.4",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"safe-buffer": "^5.1.1", "safe-buffer": "^5.1.1",
"yallist": "^3.0.0" "yallist": "^3.0.0"
@ -3846,7 +3836,6 @@
"version": "0.5.1", "version": "0.5.1",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"minimist": "0.0.8" "minimist": "0.0.8"
} }
@ -3927,8 +3916,7 @@
"number-is-nan": { "number-is-nan": {
"version": "1.0.1", "version": "1.0.1",
"bundled": true, "bundled": true,
"dev": true, "dev": true
"optional": true
}, },
"object-assign": { "object-assign": {
"version": "4.1.1", "version": "4.1.1",
@ -3940,7 +3928,6 @@
"version": "1.4.0", "version": "1.4.0",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"wrappy": "1" "wrappy": "1"
} }
@ -4062,7 +4049,6 @@
"version": "1.0.2", "version": "1.0.2",
"bundled": true, "bundled": true,
"dev": true, "dev": true,
"optional": true,
"requires": { "requires": {
"code-point-at": "^1.0.0", "code-point-at": "^1.0.0",
"is-fullwidth-code-point": "^1.0.0", "is-fullwidth-code-point": "^1.0.0",

@ -1 +1 @@
Subproject commit 86e56767ddb72b83adcb144c32373b3e92e215dc Subproject commit f0b0abac1bd32ad6e9e29e7b737e4162e28568c2