diff --git a/Makefile b/Makefile
index 7e98361..62af9b5 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
 
 # This file comes with the source code. Make sure you unpack it and name it right.
 SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
+SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
 
 OUTPUT = "db"
 # OUTPUT = "file"
@@ -105,6 +106,7 @@ frontend-prod:
 ## Backend
 
 # runs once and exits before the app starts
+# need to extract ./data/sskj_data.tar.gz first
 backend-prepare-db:
 	cd ./src/backend_flask; python3 app.py \
 		--config-file ./conf_files/dev_conf.yaml \
@@ -121,3 +123,12 @@ backend-prod:
 	cd ./src/backend_flask; python3 app.py \
 		--config-file ./conf_files/prod_conf.yaml \
 		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
+
+## add sskj senses to db (generated with pkg/seqparser)
+sskj-senses:
+	python3 ./src/pkg/seqparser/seqparser/main.py \
+		--sskj-json $(SSKJ_JSON) \
+		--operation "senses_to_db" \
+		--dbaddr $(DBADDR) \
+		--dbuser $(DB_USR_USER) \
+		--dbpass $(DB_USR_PASS)
diff --git a/data/sskj_data.tar.gz b/data/sskj_data.tar.gz
new file mode 100644
index 0000000..5027233
Binary files /dev/null and b/data/sskj_data.tar.gz differ
diff --git a/src/pkg/seqparser/seqparser/sskj.json b/data/sskj_senses.json
similarity index 100%
rename from src/pkg/seqparser/seqparser/sskj.json
rename to data/sskj_senses.json
diff --git a/data/wordlist.tar.gz b/data/wordlist.tar.gz
deleted file mode 100644
index 3aac3f8..0000000
Binary files a/data/wordlist.tar.gz and /dev/null differ
diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py
index baa3499..16bd0d6 100644
--- a/src/backend_flask/app.py
+++ b/src/backend_flask/app.py
@@ -324,6 +324,8 @@ def api_get_functor_frames():
 
 
 # SENSES ----------------------------.
+# ssj_id is legacy notation, read
+# it as general sentence_id
 
 @app.route("/api/senses/get")
 def api_senses_get():
@@ -409,6 +411,8 @@ def api_senses_update():
         ns["date"] = tmp_dt
         id_map[frontend_sense_id] = new_sense_id
 
+        print(ns)
+
         # insert into db
         valdb[SENSES_COLL].insert(ns)
 
@@ -441,8 +445,8 @@ def _is_banned(hw):
     elif (hw + " se") in sskj_wordlist["wordlist"]:
         banned = False
 
-    if banned:
-        log.debug("Banned headword: {}".format(hw))
+    if hw[-1] == "_":
+        log.debug("hw: {}, banned: {}".format(hw, banned))
     return banned
 
 def prepare_app_index():
diff --git a/src/pkg/seqparser/Makefile b/src/pkg/seqparser/Makefile
index 5dedf8c..ca847a4 100644
--- a/src/pkg/seqparser/Makefile
+++ b/src/pkg/seqparser/Makefile
@@ -1,5 +1,5 @@
 SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
-SSKJ_JSON = "./sskj.json"
+SSKJ_JSON = "./sskj_senses.json"
 WORDLIST = "./wordlist.json"
 
 gen_json_files:
diff --git a/src/pkg/seqparser/seqparser/main.py b/src/pkg/seqparser/seqparser/main.py
index 6f5ef42..e613a22 100644
--- a/src/pkg/seqparser/seqparser/main.py
+++ b/src/pkg/seqparser/seqparser/main.py
@@ -1,15 +1,68 @@
 from Seqparser import Seqparser
 import argparse
+import sys
+from pathlib import Path
+import json
+import datetime
+import hashlib
+from pymongo import MongoClient
+
+SSKJ_USER = "sskj2"
 
 
 if __name__ == "__main__":
-	aparser = argparse.ArgumentParser()
-	aparser.add_argument("--sskj-html", type=str)
-	aparser.add_argument("--sskj-json", type=str)
-	aparser.add_argument("--wordlist", type=str)
-	args = aparser.parse_args()
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("--sskj-html", type=str)
+    aparser.add_argument("--sskj-json", type=str)
+    aparser.add_argument("--wordlist", type=str)
+    aparser.add_argument("--operation", type=str)
+    aparser.add_argument("--dbaddr", type=str)
+    aparser.add_argument("--dbuser", type=str)
+    aparser.add_argument("--dbpass", type=str)
+    args = aparser.parse_args()
+
+    if args.operation == "gen_sskj_json":
+        sqp = Seqparser()
+        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
+        sys.exit()
 
-	sqp = Seqparser()
-	# sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
+    if args.operation == "gen_wordlist":
+        sqp = Seqparser()
+        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
+        sys.exit()
 
-	sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
+    if args.operation == "senses_to_db":
+        db_entries = []
+        tmp_dt = datetime.datetime.utcnow()
+        with Path(args.sskj_json).open("r") as fp:
+            jdata = json.load(fp)
+            # print(jdata[list(jdata.keys())[201]])
+            for hw, entry in jdata.items():
+                for key, sense in entry[0]["senses"].items():
+                    desc = sense[0][1]
+                    if sense[0][0] == "razl":
+                        desc = desc[:-1]  # for some reason, descriptions contain a ':'
+                    else:
+                        desc = sense[0][0] + ": " + desc
+                    tmp_entry = {
+                        "desc": desc,
+                        "hw": hw,
+                        "author": SSKJ_USER
+                    }
+                    tmp_entry["sense_id"] = "{}-{}".format(
+                        SSKJ_USER,
+                        hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
+                    )
+                    tmp_entry["date"] = tmp_dt
+                    db_entries.append(tmp_entry)
+        print(len(db_entries))
+        # db login
+        client = MongoClient(
+            "mongodb://{}".format(args.dbaddr),
+            username=args.dbuser,
+            password=args.dbpass,
+            authSource="valdb",
+            authMechanism='SCRAM-SHA-1'
+        )
+        valdb = client.valdb
+        valdb.senses.insert_many(db_entries)