From c2dea0bf70740dc1bdc036b6515c11585f4d99c4 Mon Sep 17 00:00:00 2001 From: voje Date: Mon, 15 Apr 2019 02:35:50 +0200 Subject: [PATCH] moved db_preparation logic to fill-database phase --- Makefile | 21 ++++-- README.md | 13 ++-- src/backend_flask/app.py | 35 ++-------- src/backend_flask/conf_files/dev_conf.yaml | 1 - .../conf_files/dev_conf_init.yaml | 7 -- src/backend_flask/preprocess.py | 70 +++++++++++++++++++ src/frontend_vue/package-lock.json | 28 ++------ src/pkg/cjvt-corpusparser | 2 +- 8 files changed, 106 insertions(+), 71 deletions(-) delete mode 100644 src/backend_flask/conf_files/dev_conf_init.yaml create mode 100644 src/backend_flask/preprocess.py diff --git a/Makefile b/Makefile index 750f1a4..bca5d6c 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,8 @@ SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml" KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" -# OUTPUT = "db" -OUTPUT = "file" +OUTPUT = "db" +# OUTPUT = "file" OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume DBADDR = "0.0.0.0:27017" # don't use localhost @@ -21,6 +21,8 @@ DB_ADM_USER = valadmin DB_ADM_PASS = valadminpass DB_USR_USER = valuser DB_USR_PASS = valuserpass + +N_CORES = 1 export .PHONY: python-env fill-database @@ -63,9 +65,11 @@ fill-database: data/samples --ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) \ --output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \ - --cores 2 + --cores $(N_CORES) + ## Frontend + ## Run from host ## See src/frontend_vue/README.md for port settings etc. 
frontend-dev: @@ -76,12 +80,15 @@ frontend-prod: ## Backend -backend-dev-init: python-env-install + +# runs once and exits before the app starts +backend-prepare-db: cd ./src/backend_flask; python3 app.py \ - --config-file ./conf_files/dev_conf_init.yaml \ - --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) + --config-file ./conf_files/dev_conf.yaml \ + --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ + --prepare-db -backend-dev: python-env-install +backend-dev: cd ./src/backend_flask; python3 app.py \ --config-file ./conf_files/dev_conf.yaml \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) diff --git a/README.md b/README.md index ea9b10a..7915c6e 100644 --- a/README.md +++ b/README.md @@ -47,14 +47,17 @@ If all goes well, we should be able to inspect the database, filled with corpora ### Flask backend (1 container) Relies heavily on the database. Set that up first. ```bash +# spin up container $ make python-env -# development: -# run the first time, to prepare the db, then kill -# it runs a few minutes, there should be a new collection in the db when finished -$ make backend-dev-init +# install our packages +$ make python-env-install + +# needs to be run once to modify a new database +$ make backend-prepare-db -$ make backend-dev # debug with this one +# with debugger +$ make backend-dev # production $ make backend-prod diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py index 62024d0..e73c7bc 100644 --- a/src/backend_flask/app.py +++ b/src/backend_flask/app.py @@ -431,33 +431,8 @@ def api_senses_update(): # APP PREFLIGHT ---------------------. 
-def prepare_db(): - def helper_tid_to_token(tid, tokens): - for t in tokens: - if t["tid"] == tid: - return t - return None - - # update entries (add headwords and fuctors for indexing) - for corpus in CORPORA: - for e in valdb[corpus].find({}): - if e["srl_links"] is None: - e["headwords"] = [] - e["functors"] = [] - else: - hw_tids = list(set([x["from"] for x in e["srl_links"]])) - hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] - headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] - e["headwords"] = headwords - - functors = list(set([x["afun"] for x in e["srl_links"]])) - e["functors"] = functors - - valdb[corpus].save(e) - - valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) - valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) - +def prepare_app_index(): + log.info("[*] preparing app_index") # create app_index (used in frontend, left side word index) tmp_app_index = {c: {} for c in CORPORA} for corpus in CORPORA: @@ -504,6 +479,7 @@ if __name__ == "__main__": print("Starting app.py main()") aparser = argparse.ArgumentParser(description="Arguments for app.py") aparser.add_argument("--config-file", type=str, help="check ./conf_files/") + aparser.add_argument('--prepare-db', action="store_true", default=False) aparser.add_argument("--dbuser", type=str) aparser.add_argument("--dbpass", type=str) aparser.add_argument("--dbaddr", type=str) @@ -530,8 +506,9 @@ if __name__ == "__main__": ) valdb = client.valdb - if bool(config["prepare_db"]): - prepare_db() + if args.prepare_db: + prepare_app_index() + sys.exit() # app index from db app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"] diff --git a/src/backend_flask/conf_files/dev_conf.yaml b/src/backend_flask/conf_files/dev_conf.yaml index 7dd4f53..31dd233 100644 --- a/src/backend_flask/conf_files/dev_conf.yaml +++ b/src/backend_flask/conf_files/dev_conf.yaml @@ -3,5 +3,4 @@ debug: True port: 5004 host: localhost 
logfile: "/var/log/valency_backend.log" -prepare_db: False --- diff --git a/src/backend_flask/conf_files/dev_conf_init.yaml b/src/backend_flask/conf_files/dev_conf_init.yaml deleted file mode 100644 index 3753cc7..0000000 --- a/src/backend_flask/conf_files/dev_conf_init.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -debug: True -port: 5004 -host: localhost -logfile: "/var/log/valency_backend.log" -prepare_db: True ---- diff --git a/src/backend_flask/preprocess.py b/src/backend_flask/preprocess.py new file mode 100644 index 0000000..4f26474 --- /dev/null +++ b/src/backend_flask/preprocess.py @@ -0,0 +1,70 @@ +CORPORA = ["kres", "ssj"] + +if __name__ == "__main__": + + valdb = None + + def helper_tid_to_token(tid, tokens): + for t in tokens: + if t["tid"] == tid: + return t + return None + + # update entries (add headwords and functors for indexing) + for corpus in CORPORA: + for e in valdb[corpus].find({}): + if e["srl_links"] is None: + e["headwords"] = [] + e["functors"] = [] + else: + hw_tids = list(set([x["from"] for x in e["srl_links"]])) + hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] + headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] + e["headwords"] = headwords + + functors = list(set([x["afun"] for x in e["srl_links"]])) + e["functors"] = functors + + valdb[corpus].save(e) + + valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) + valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) + + # create app_index (used in frontend, left side word index) + tmp_app_index = {c: {} for c in CORPORA} + for corpus in CORPORA: + res_hws = {} + res_fns = {} + for e in valdb[corpus].find({}): + if "headwords" not in e: + continue + for hw in e["headwords"]: + if hw in res_hws: + res_hws[hw] += 1 + else: + res_hws[hw] = 1 + if "functors" not in e: + continue + for fn in e["functors"]: + if fn in res_fns: + res_fns[fn] += 1 + else: + res_fns[fn] = 1 + + alphabetical = {} + for k, e in 
res_hws.items(): + fst = k[0].lower() + if fst in alphabetical: + alphabetical[fst].append((k, e)) + else: + alphabetical[fst] = [(k, e)] + + for k, e in alphabetical.items(): + alphabetical[k] = sorted(e, key=lambda x: x[0]) + tmp_app_index[corpus]["words"] = alphabetical + + functors = [(k, e) for (k, e) in res_fns.items()] + functors = sorted(functors, key=lambda x: x[0]) + tmp_app_index[corpus]["functors"] = functors + + valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True) \ No newline at end of file diff --git a/src/frontend_vue/package-lock.json b/src/frontend_vue/package-lock.json index aedaf92..a2a9f2f 100644 --- a/src/frontend_vue/package-lock.json +++ b/src/frontend_vue/package-lock.json @@ -3632,14 +3632,12 @@ "balanced-match": { "version": "1.0.0", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, "dev": true, - "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -3654,20 +3652,17 @@ "code-point-at": { "version": "1.1.0", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "concat-map": { "version": "0.0.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "console-control-strings": { "version": "1.1.0", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "core-util-is": { "version": "1.0.2", @@ -3784,8 +3779,7 @@ "inherits": { "version": "2.0.3", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "ini": { "version": "1.3.5", @@ -3797,7 +3791,6 @@ "version": "1.0.0", "bundled": true, "dev": true, - "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -3812,7 +3805,6 @@ "version": "3.0.4", "bundled": true, "dev": true, - "optional": true, "requires": { "brace-expansion": "^1.1.7" } @@ -3820,14 +3812,12 @@ "minimist": { "version": "0.0.8", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "minipass": 
{ "version": "2.2.4", "bundled": true, "dev": true, - "optional": true, "requires": { "safe-buffer": "^5.1.1", "yallist": "^3.0.0" @@ -3846,7 +3836,6 @@ "version": "0.5.1", "bundled": true, "dev": true, - "optional": true, "requires": { "minimist": "0.0.8" } @@ -3927,8 +3916,7 @@ "number-is-nan": { "version": "1.0.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "object-assign": { "version": "4.1.1", @@ -3940,7 +3928,6 @@ "version": "1.4.0", "bundled": true, "dev": true, - "optional": true, "requires": { "wrappy": "1" } @@ -4062,7 +4049,6 @@ "version": "1.0.2", "bundled": true, "dev": true, - "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", diff --git a/src/pkg/cjvt-corpusparser b/src/pkg/cjvt-corpusparser index 86e5676..f0b0aba 160000 --- a/src/pkg/cjvt-corpusparser +++ b/src/pkg/cjvt-corpusparser @@ -1 +1 @@ -Subproject commit 86e56767ddb72b83adcb144c32373b3e92e215dc +Subproject commit f0b0abac1bd32ad6e9e29e7b737e4162e28568c2