diff --git a/Dockerfile-backend-flask b/Dockerfile-backend-flask new file mode 100644 index 0000000..fe9349b --- /dev/null +++ b/Dockerfile-backend-flask @@ -0,0 +1,18 @@ +FROM cjvt-python-env + +RUN pip3 install gunicorn + +RUN mkdir -p /project/src/backend_flask +RUN mkdir -p /project/src/pkg +RUN mkdir -p /project/data + +COPY src/backend_flask /project/src/backend_flask +COPY src/pkg /project/src/pkg + +COPY data/appindex.json /project/data + +COPY src/backend_flask/entrypoint.sh /. + +COPY src/backend_flask/conf_files/prod_conf.yaml /project + +ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] \ No newline at end of file diff --git a/Makefile b/Makefile index a1ec4e8..2e74b6e 100644 --- a/Makefile +++ b/Makefile @@ -124,12 +124,15 @@ backend-dev: --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ --appindex-json $(APPINDEX_PATH) -backend-prod: +backend-prod-old: cd ./src/backend_flask; python3 app.py \ --config-file ./conf_files/prod_conf.yaml \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ --appindex-json $(APPINDEX_PATH) +backend-prod: + cd ./src/backend_flask; $(MAKE) prod + ## add sskj senses to db (generated with pkg/seqparser) sskj-senses: python3 ./src/pkg/seqparser/seqparser/main.py \ diff --git a/dockerfiles/python-env/Dockerfile b/dockerfiles/python-env/Dockerfile index 266d331..0249115 100644 --- a/dockerfiles/python-env/Dockerfile +++ b/dockerfiles/python-env/Dockerfile @@ -1,26 +1,25 @@ -FROM ubuntu:16.04 +FROM ubuntu:18.04 RUN apt-get update --fix-missing RUN apt-get install -y \ vim \ python3 \ python3-pip \ -sshfs +sshfs \ +curl + +RUN pip3 install --upgrade pip RUN pip3 install \ lxml \ pandas \ sklearn \ argparse \ + pyyaml \ pathlib \ + flask \ + flask_cors \ pymongo \ - flask - -RUN apt-get install -y \ - curl + flask-pymongo ENV PYTHONIOENCODING UTF-8 - -RUN pip3 install \ - pyyaml \ - flask_cors diff --git a/dockerfiles/python-env/Makefile b/dockerfiles/python-env/Makefile index e9b30d4..9596535 100644 
--- a/dockerfiles/python-env/Makefile +++ b/dockerfiles/python-env/Makefile @@ -1,4 +1,4 @@ -IMAGE_NAME="cjvt-python-env" +IMAGE_NAME="cjvt-python-env" # don't change, used in backend_flask/Makefile CNNAME="python-env" all: build run diff --git a/dockerfiles/python-env/entrypoint.sh b/dockerfiles/python-env/entrypoint.sh new file mode 100755 index 0000000..7d1cf5f --- /dev/null +++ b/dockerfiles/python-env/entrypoint.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "testing entrypoint." +$(exit 1) +exit 0 \ No newline at end of file diff --git a/src/backend_flask/Makefile b/src/backend_flask/Makefile new file mode 100644 index 0000000..d1a5da9 --- /dev/null +++ b/src/backend_flask/Makefile @@ -0,0 +1,16 @@ +IMG="backend-flask" +CNT="backend_flask" + +clean: + - docker rm -f $(CNT) + +run: clean build + docker run -d --net host --name $(CNT) $(IMG) + docker logs -f $(CNT) + +build: build-cjvt-python-env + # docker build . -f ../../Dockerfile-backend-flask -t $(IMG) + cd ../..; docker build . -f Dockerfile-backend-flask -t $(IMG) + +build-cjvt-python-env: + cd ../../dockerfiles/python-env; $(MAKE) build \ No newline at end of file diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py index 670ff4a..36f5513 100644 --- a/src/backend_flask/app.py +++ b/src/backend_flask/app.py @@ -26,26 +26,21 @@ from email.mime.text import MIMEText from copy import deepcopy as DC from pathlib import Path from pymongo import MongoClient +from flask_pymongo import PyMongo import pymongo import argparse -# some db collections -USERS_COLL = "users" -TOKENS_COLL = "usertokens" -SENSES_COLL = "senses" -SENSEMAP_COLL = "sensemap" - -# pre-generated data (gui leftside word index) -CORPORA = ["ssj", "kres"] -app_index = None -sskj_wordlist = None # used by _is_banned(hw) -BANNED_HEADWORDS = ["biti"] -QUERY_LIMIT = 1000 # Some headwords contain thousands of examples - not practical for the app log = logging.getLogger(__name__) -valdb = None app = Flask(__name__) 
+app.config.from_object("db_config") +mongo = PyMongo(app) + +app.config["CORPORA"] = ["ssj", "kres"] +app.config["BANNED_HEADWORDS"] = ["biti"] +app.config["QUERY_LIMIT"] = 1000 + # when running vuejs via webpack # CORS(app) @@ -60,7 +55,7 @@ CORS(app) @app.route("/api/dev") def api_dev(): print("DEV") - cur = valdb.kres.find({"headwords": "nagovarjati"}) + cur = mongo.db.kres.find({"headwords": "nagovarjati"}) frames = [] for ent in cur: frames += frames_from_db_entry(ent) @@ -73,12 +68,12 @@ def api_dev(): @app.route("/api/words/") def api_words(corpus): return json.dumps({ - "sorted_words": app_index[corpus]["words"], # todo - make corpus as arg + "sorted_words": app.config["app_index"][corpus]["words"], # todo - make corpus as arg }) @app.route("/api/functors/") def api_functors(corpus): - return json.dumps(app_index[corpus]["functors"]) + return json.dumps(app.config["app_index"][corpus]["functors"]) # INDEX SELECTION -------------------^ @@ -99,7 +94,7 @@ def api_register(): ): return "ERR" email_hash = hashlib.sha256(email.encode("utf-8")).hexdigest() - existing = list(valdb[USERS_COLL].find({ + existing = list(mongo.db.users.find({ "$or": [{"username": username}, {"email": email_hash}] })) if len(existing) > 0: @@ -110,7 +105,7 @@ def api_register(): password.encode("utf-8")).hexdigest(), "email": email_hash } - valdb[USERS_COLL].insert(entry) + mongo.db.users.insert(entry) return "OK" @@ -122,7 +117,7 @@ def api_login(): password = data["password"] hpass = hashlib.sha256(password.encode("utf-8")).hexdigest() - db_user = list(valdb[USERS_COLL].find({ + db_user = list(mongo.db.users.find({ "username": username, "hpass": hpass })) @@ -136,7 +131,7 @@ def api_login(): "date": datetime.datetime.utcnow(), "token": token } - valdb[TOKENS_COLL].update( + mongo.db.usertokens.update( {"username": token_entry["username"]}, token_entry, upsert=True @@ -179,7 +174,7 @@ def api_new_pass(): username = data["username"] email = data["email"] hemail = 
hashlib.sha256(email.encode("utf-8")).hexdigest() - db_res = list(valdb[USERS_COLL].find({ + db_res = list(mongo.db.users.find({ "username": username, "email": hemail })) @@ -191,7 +186,7 @@ def api_new_pass(): string.ascii_letters + string.digits) for i in range(10)]) # update locally hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest() - valdb[USERS_COLL].update( + mongo.db.users.update( { "username": username, "email": hemail @@ -209,12 +204,12 @@ def token_to_username(token): key = { "token": token } - res = list(valdb[TOKENS_COLL].find(key)) + res = list(mongo.db.usertokens.find(key)) if len(res) != 1: return None username = res[0]["username"] # update deletion interval - valdb[TOKENS_COLL].update( + mongo.db.usertokens.update( key, {"$set": {"date": datetime.datetime.utcnow()}}) return username @@ -249,19 +244,19 @@ def api_get_frames(): RF = reduce_functions[rf_name]["f"] corpus = request.args.get("cor") - if corpus not in CORPORA: + if corpus not in app.config["CORPORA"]: return json.dumps({"error": "cor={kres,ssj}"}) - cur = valdb[corpus].find({"headwords": hw}) + cur = mongo.db[corpus].find({"headwords": hw}) frames = [] - for ent in cur[:QUERY_LIMIT]: + for ent in cur[:app.config["QUERY_LIMIT"]]: frames += frames_from_db_entry(ent) # pre-process this step for prod TODO cur.close() # filter by relevant hw frames = [x for x in frames if x.hw == hw] - ret_frames = RF(frames, valdb[SENSEMAP_COLL]) + ret_frames = RF(frames, mongo.db.sensemap) json_ret = {"frames": []} for frame in ret_frames: @@ -302,12 +297,12 @@ def api_get_functor_frames(): RF = reduce_functions[rf_name]["f"] corpus = request.args.get("cor") - if corpus not in CORPORA: + if corpus not in app.config["CORPORA"]: return json.dumps({"error": "cor={kres,ssj}"}) - cur = valdb[corpus].find({"functors": functor}) + cur = mongo.db[corpus].find({"functors": functor}) frames = [] - for ent in cur[:QUERY_LIMIT]: + for ent in cur[:app.config["QUERY_LIMIT"]]: frames += frames_from_db_entry(ent) 
# pre-process this step for prod TODO cur.close() @@ -315,7 +310,7 @@ def api_get_functor_frames(): frames = [x for x in frames if functor in x.get_functors()] # raw_frames = vallex.functors_index[functor] # TODO - ret_frames = RF(frames, valdb[SENSEMAP_COLL]) + ret_frames = RF(frames, mongo.db.sensemap) ret_frames = _aggregate_by_hw(ret_frames) json_ret = {"frames": []} @@ -334,10 +329,10 @@ def api_get_functor_frames(): def api_senses_get(): # returns senses and mapping for hw hw = request.args.get("hw") - senses = list(valdb[SENSES_COLL].find({ + senses = list(mongo.db.senses.find({ "hw": hw })) - sense_map_query = list(valdb[SENSEMAP_COLL].find({ + sense_map_query = list(mongo.db.sensemap.find({ "hw": hw })) # aggregation by max date possible on DB side @@ -417,7 +412,7 @@ def api_senses_update(): print(ns) # insert into db - valdb[SENSES_COLL].insert(ns) + mongo.db.senses.insert(ns) # replace tmp_id with mongo's _id for ssj_id, el in sense_map.items(): @@ -432,7 +427,7 @@ def api_senses_update(): "date": datetime.datetime.utcnow() } # vallex.db["v2_sense_map"].update(key, data, upsert=True) - valdb[SENSEMAP_COLL].insert(data) + mongo.db.sensemap.insert(data) return "OK" # SENSES ----------------------------^ @@ -441,7 +436,7 @@ def api_senses_update(): # APP PREFLIGHT ---------------------. 
def _is_banned(hw): banned = True - if hw in BANNED_HEADWORDS: + if hw in app.config["BANNED_HEADWORDS"]: banned = True elif hw in sskj_wordlist["wordlist"]: banned = False @@ -449,17 +444,17 @@ def _is_banned(hw): banned = False return banned -def prepare_app_index(appindex_json): +def prepare_app_index(appindex_json, sskj_wordlist): log.info("[*] preparing app_index") # create app_index (used in frontend, left side word index) - tmp_app_index = {c: {} for c in CORPORA} - for corpus in CORPORA: + tmp_app_index = {c: {} for c in app.config["CORPORA"]} + for corpus in app.config["CORPORA"]: res_hws = {} res_fns = {} - nentries = valdb[corpus].count() + nentries = mongo.db[corpus].count() idx = 0 - for e in valdb[corpus].find({}): + for e in mongo.db[corpus].find({}): if "headwords" not in e: continue for hw in e["headwords"]: @@ -504,6 +499,33 @@ def prepare_app_index(appindex_json): # APP PREFLIGHT ---------------------^ +def init_wsgi(app): + config = None + with Path("/project/prod_conf.yaml").open("r") as fp: + config = list(yaml.safe_load_all(fp))[0] + + app.debug = False + logfile = config["logfile"] + if app.debug: + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + else: + logging.basicConfig(filename=logfile, level=logging.INFO) + + # db login + # app index from db + with Path(config["appindex"]).open("r") as fp: + # a dirty hack but ok + app.config["app_index"] = json.load(fp) + + # log.info("[*] Starting app.py with config:\n%s".format(config)) + log.info("[*] Starting app.py with config:\n{}".format(config)) + + +# if we don't pass arguments, assume production environment (gunicorn) +if len(sys.argv) == 1: + init_wsgi(app) + + if __name__ == "__main__": print("Starting app.py main()") aparser = argparse.ArgumentParser(description="Arguments for app.py") @@ -516,7 +538,6 @@ if __name__ == "__main__": aparser.add_argument("--appindex-json", type=str) args = aparser.parse_args() - config = None with Path(args.config_file).open("r") as fp: config =
list(yaml.safe_load_all(fp))[0] @@ -527,28 +548,31 @@ if __name__ == "__main__": else: logging.basicConfig(filename=logfile, level=logging.INFO) + """ # db login client = MongoClient( "mongodb://{}".format(args.dbaddr), username=args.dbuser, password=args.dbpass, - authSource="valdb", + authSource="mongo.db", authMechanism='SCRAM-SHA-1' ) - valdb = client.valdb + valdb = client.mongo.db + """ if args.prepare_db: with Path(args.sskj_wordlist).open("r") as fp: sskj_wordlist = json.load(fp) - prepare_app_index(args.appindex_json) + prepare_app_index(args.appindex_json, sskj_wordlist) sys.exit() # app index from db with Path(args.appindex_json).open("r") as fp: - app_index = json.load(fp) + app.config["app_index"] = json.load(fp) # log.info("[*] Starting app.py with config:\n%s".format(config)) log.info("[*] Starting app.py with config:\n{}".format(config)) app.run(host=str(config["host"]), port=int(config["port"])) + diff --git a/src/backend_flask/conf_files/dev_conf.yaml b/src/backend_flask/conf_files/dev_conf.yaml index 1f21a14..f023ce2 100644 --- a/src/backend_flask/conf_files/dev_conf.yaml +++ b/src/backend_flask/conf_files/dev_conf.yaml @@ -4,3 +4,4 @@ port: 8084 host: localhost logfile: "/var/log/valency_backend.log" --- + diff --git a/src/backend_flask/conf_files/prod_conf.yaml b/src/backend_flask/conf_files/prod_conf.yaml index c5aaa24..754e6cb 100644 --- a/src/backend_flask/conf_files/prod_conf.yaml +++ b/src/backend_flask/conf_files/prod_conf.yaml @@ -1,6 +1,10 @@ --- -debug: True port: 8084 host: 0.0.0.0 logfile: "/var/log/valency_backend.log" ---- +appindex: /project/data/appindex.json + +# Same as in root Makefile +dbaddr: 0.0.0.0:27017 +dbuser: valuser +dbpass: valuserpass diff --git a/src/backend_flask/db_config.py b/src/backend_flask/db_config.py new file mode 100644 index 0000000..ccd4bce --- /dev/null +++ b/src/backend_flask/db_config.py @@ -0,0 +1,2 @@ +MONGO_URI = "mongodb://valuser:valuserpass@127.0.0.1:27017/valdb" +MONGO_AUTH_SOURCE = 'admin' 
diff --git a/src/backend_flask/entrypoint.sh b/src/backend_flask/entrypoint.sh new file mode 100755 index 0000000..b924e73 --- /dev/null +++ b/src/backend_flask/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +pip3 install -e /project/src/pkg/cjvt-corpusparser/. +pip3 install -e /project/src/pkg/valency/. +pip3 install -e /project/src/pkg/seqparser/. + +sleep 10000 diff --git a/src/backend_flask/preprocess.py b/src/backend_flask/preprocess.py deleted file mode 100644 index 4057411..0000000 --- a/src/backend_flask/preprocess.py +++ /dev/null @@ -1,73 +0,0 @@ -# Deprecated: headword creation moved to be part of corpusparser, -# index creation moved to app.py as a preprocessing (with exit) step - -CORPORA = ["kres", "ssj"] - -if __name__ == "__main__": - - valdb = None - - def helper_tid_to_token(tid, tokens): - for t in tokens: - if t["tid"] == tid: - return t - return None - - # update entries (add headwords and fuctors for indexing) - for corpus in CORPORA: - for e in valdb[corpus].find({}): - if e["srl_links"] is None: - e["headwords"] = [] - e["functors"] = [] - else: - hw_tids = list(set([x["from"] for x in e["srl_links"]])) - hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] - headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] - e["headwords"] = headwords - - functors = list(set([x["afun"] for x in e["srl_links"]])) - e["functors"] = functors - - valdb[corpus].save(e) - - valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) - valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) - - # create app_index (used in frontend, left side word index) - tmp_app_index = {c: {} for c in CORPORA} - for corpus in CORPORA: - res_hws = {} - res_fns = {} - for e in valdb[corpus].find({}): - if "headwords" not in e: - continue - for hw in e["headwords"]: - if hw in res_hws: - res_hws[hw] += 1 - else: - res_hws[hw] = 1 - if "functors" not in e: - continue - for fn in e["functors"]: - if fn in res_fns: - 
res_fns[fn] += 1 - else: - res_fns[fn] = 1 - - alphabetical = {} - for k, e in res_hws.items(): - fst = k[0].lower() - if fst in alphabetical: - alphabetical[fst].append((k, e)) - else: - alphabetical[fst] = [(k, e)] - - for k, e in alphabetical.items(): - alphabetical[k] = sorted(e, key=lambda x: x[0]) - tmp_app_index[corpus]["words"] = alphabetical - - functors = [(k, e) for (k, e) in res_fns.items()] - functors = sorted(functors, key=lambda x: x[0]) - tmp_app_index[corpus]["functors"] = functors - - valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True) \ No newline at end of file