diff --git a/.gitignore b/.gitignore index 34728fc..12061fb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ data/samples/ +data/wordlist.json *egg-info/ *.pyc src/frontend_vue/node_modules/ diff --git a/Makefile b/Makefile index cd395f5..7e98361 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,14 @@ MAKE_ROOT = $(shell pwd) # SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml" SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link" # KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" -KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link" +# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link" +KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml" # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" -KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link" +# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link" +KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json" + +# This file comes with the source code. Make sure you unpack it and name it right. +SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json" OUTPUT = "db" # OUTPUT = "file" @@ -24,7 +29,7 @@ DBADDR = "0.0.0.0:27017" # don't use localhost # create it from env.default include env.local -N_CORES = 5 +N_CORES = 3 # insert kres files into database in chunks, for fewer connections KRES_CHUNK_SIZE = 30 @@ -59,6 +64,7 @@ python-env: python-env-install: pip3 install -e src/pkg/cjvt-corpusparser/. pip3 install -e src/pkg/valency/. + pip3 install -e src/pkg/seqparser/. 
# from inside python-env container: data/samples: @@ -103,6 +109,7 @@ backend-prepare-db: cd ./src/backend_flask; python3 app.py \ --config-file ./conf_files/dev_conf.yaml \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ + --sskj-wordlist $(SSKJ_WORDLIST) \ --prepare-db backend-dev: diff --git a/data/kres_json_folder_link_t420 b/data/kres_json_folder_link_t420 deleted file mode 120000 index 965aba4..0000000 --- a/data/kres_json_folder_link_t420 +++ /dev/null @@ -1 +0,0 @@ -/home/kristjan/kres_data/payload/kres_json/ \ No newline at end of file diff --git a/data/kres_xml_folder_link_proc b/data/kres_xml_folder_link_proc deleted file mode 120000 index b5b31cd..0000000 --- a/data/kres_xml_folder_link_proc +++ /dev/null @@ -1 +0,0 @@ -/home/kristjan/kres_mount/kres_parsed/tei/ \ No newline at end of file diff --git a/data/wordlist.tar.gz b/data/wordlist.tar.gz new file mode 100644 index 0000000..3aac3f8 Binary files /dev/null and b/data/wordlist.tar.gz differ diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py index 98f3be6..baa3499 100644 --- a/src/backend_flask/app.py +++ b/src/backend_flask/app.py @@ -38,6 +38,8 @@ SENSEMAP_COLL = "sensemap" # pre-generated data (gui leftside word index) CORPORA = ["ssj", "kres"] app_index = None +sskj_wordlist = None # used by _is_banned(hw) +BANNED_HEADWORDS = ["biti"] log = logging.getLogger(__name__) valdb = None @@ -430,6 +432,18 @@ def api_senses_update(): # APP PREFLIGHT ---------------------. 
+def _is_banned(hw):
+    """Return True if headword `hw` must be left out of the word index.
+
+    A headword is kept only when it is not in BANNED_HEADWORDS and either
+    `hw` or `hw + " se"` is found in sskj_wordlist["wordlist"].
+    """
+    banned = hw in BANNED_HEADWORDS or not any(
+        form in sskj_wordlist["wordlist"] for form in (hw, hw + " se"))
+
+    if banned:
+        log.debug("Banned headword: {}".format(hw))
+    return banned
 
 def prepare_app_index():
     log.info("[*] preparing app_index")
@@ -462,8 +476,10 @@ def prepare_app_index():
             else:
                 alphabetical[fst] = [(k, e)]
 
-        for k, e in alphabetical.items():
-            alphabetical[k] = sorted(e, key=lambda x: x[0])
+        for letter, words in alphabetical.items():
+            filtered_words = [x for x in words if not _is_banned(x[0])]
+            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
+
         tmp_app_index[corpus]["words"] = alphabetical
 
         functors = [(k, e) for (k, e) in res_fns.items()]
@@ -483,6 +499,7 @@ if __name__ == "__main__":
     aparser.add_argument("--dbuser", type=str)
     aparser.add_argument("--dbpass", type=str)
     aparser.add_argument("--dbaddr", type=str)
+    aparser.add_argument("--sskj-wordlist", type=str)
     args = aparser.parse_args()
 
     config = None
@@ -507,6 +524,11 @@ if __name__ == "__main__":
     valdb = client.valdb
 
     if args.prepare_db:
+        if args.sskj_wordlist is None:
+            aparser.error("--prepare-db requires --sskj-wordlist")
+        # JSON is UTF-8 by spec; do not depend on the locale's default encoding.
+        with Path(args.sskj_wordlist).open("r", encoding="utf-8") as fp:
+            sskj_wordlist = json.load(fp)
         prepare_app_index()
         sys.exit()
 
diff --git a/src/frontend_vue/src/components/Home.vue b/src/frontend_vue/src/components/Home.vue
index e27c717..c115b79 100644
--- a/src/frontend_vue/src/components/Home.vue
+++ b/src/frontend_vue/src/components/Home.vue
@@ -9,7 +9,7 @@
-

+

{{ this.$root.store.api_error }}

diff --git a/src/frontend_vue/src/components/MainDispl.vue b/src/frontend_vue/src/components/MainDispl.vue index 359ba04..f1db514 100644 --- a/src/frontend_vue/src/components/MainDispl.vue +++ b/src/frontend_vue/src/components/MainDispl.vue @@ -1,6 +1,11 @@