diff --git a/Makefile b/Makefile index 0f148d4..801f95a 100644 --- a/Makefile +++ b/Makefile @@ -8,13 +8,13 @@ MAKE_ROOT = $(shell pwd) # kres is composed of many .xml files # I generated srl tags for kres in separate .json files # (for each kres.xml file there is a kres.json file with srl tags) -SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml" -KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example" -# KRES_SRL_FOLDER = "/home/kristjan/kres_srl/final_json/" # t420 -KRES_SRL_FOLDER = "/home/voje/work_data/final_json" # work-pc +SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml" +KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" +KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" OUTPUT = "db" -OUTDIR = "/home/voje/workdir/test_out" +# OUTPUT = "file" +OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume DBADDR = "0.0.0.0:27017" # don't use localhost MONGOEXPRESS_USER = mxuser @@ -23,6 +23,8 @@ DB_ADM_USER = valadmin DB_ADM_PASS = valadminpass DB_USR_USER = valuser DB_USR_PASS = valuserpass + +N_CORES = 1 export .PHONY: python-env fill-database @@ -59,14 +61,17 @@ data/samples: cd data; tar xzvf samples.tar.gz # from inside python-env container: +# you can set OUTPUT = "file" and a valid OUTDIR to test writing to json files instead of DB fill-database: data/samples python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(KRES_FOLDER) \ --ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) \ --output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \ - --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) + --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \ + --cores $(N_CORES) ## Frontend + ## Run from host ## See src/frontend_vue/README.md for port settings etc. 
frontend-dev: @@ -77,12 +82,15 @@ frontend-prod: ## Backend -backend-dev-init: python-env-install + +# runs once and exits before the app starts +backend-prepare-db: cd ./src/backend_flask; python3 app.py \ - --config-file ./conf_files/dev_conf_init.yaml \ - --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) + --config-file ./conf_files/dev_conf.yaml \ + --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ + --prepare-db -backend-dev: python-env-install +backend-dev: cd ./src/backend_flask; python3 app.py \ --config-file ./conf_files/dev_conf.yaml \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) diff --git a/README.md b/README.md index df65013..7915c6e 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ $ make python-env $ make python-env-install # run the code +# beforehand, set the data files in Makefile $ make fill-database ``` @@ -46,11 +47,17 @@ If all goes well, we should be able to inspect the database, filled with corpora ### Flask backend (1 container) Relies heavily on the database. Set that up first. 
```bash +# spin up container $ make python-env -# development: -$ make backend-dev-init # run the first time, to prepare the db, then kill -$ make backend-dev # debug with this one +# install our packages +$ make python-env-install + +# needs to be run once to modify a new database +$ make backend-prepare-db + +# with debugger +$ make backend-dev # production $ make backend-prod diff --git a/data/kres_srl b/data/kres_srl deleted file mode 120000 index f1acfc4..0000000 --- a/data/kres_srl +++ /dev/null @@ -1 +0,0 @@ -/home/kristjan/kres_srl/final_json/ \ No newline at end of file diff --git a/data/kres_srl_ikt b/data/kres_srl_ikt deleted file mode 120000 index 465d987..0000000 --- a/data/kres_srl_ikt +++ /dev/null @@ -1 +0,0 @@ -/home/voje/work_data/final_json \ No newline at end of file diff --git a/data/kres_srl_t420 b/data/kres_srl_t420 deleted file mode 120000 index f1acfc4..0000000 --- a/data/kres_srl_t420 +++ /dev/null @@ -1 +0,0 @@ -/home/kristjan/kres_srl/final_json/ \ No newline at end of file diff --git a/data/samples.tar.gz b/data/samples.tar.gz index ba89432..f50aa75 100644 Binary files a/data/samples.tar.gz and b/data/samples.tar.gz differ diff --git a/src/backend_flask/app.py b/src/backend_flask/app.py index fb05a2c..e73c7bc 100644 --- a/src/backend_flask/app.py +++ b/src/backend_flask/app.py @@ -235,28 +235,6 @@ def api_token(): # FRAMES ----------------------------. 
-def prepare_frames(ret_frames): - # append sentences - for frame in ret_frames: - unique_sids = {".".join(x.split(".")[:-1]): x for x in frame.tids} - # frame.sentences = [] - frame.aggr_sent = {} - # sid, tid==hw - for sid, tid in unique_sids.items(): - # hwl = vallex.get_token(tid)["lemma"] - hwl = frame.hw - tmp_idx = len(frame.sentences) - if hwl not in frame.aggr_sent: - frame.aggr_sent[hwl] = [] - frame.aggr_sent[hwl].append(tmp_idx) - # return (n-frames, rendered template) - # json frames - json_ret = {"frames": []} - for frame in ret_frames: - json_ret["frames"].append(DC(frame.to_json())) - return json.dumps(json_ret) - - # input: hw, reduct_function @app.route("/api/frames") def api_get_frames(): @@ -280,8 +258,34 @@ def api_get_frames(): frames = [x for x in frames if x.hw == hw] ret_frames = RF(frames, valdb[SENSEMAP_COLL]) - return prepare_frames(ret_frames) + json_ret = {"frames": []} + for frame in ret_frames: + json_ret["frames"].append(frame.to_json()) + return json.dumps(json_ret) + # return prepare_frames(ret_frames) + +def _aggregate_by_hw(ret_frames): + + def _tid_to_lemma(tid, sentence): + # slow and hackish + for pair in sentence: + if pair[0] == tid: + return pair[1]["lemma"] + return None + + # append sentences + for frame in ret_frames: + # unique_sids = {".".join(x.split(".")[:-1]): x for x in frame.tids} + frame.aggr_sent = {} # map of headword: [sentence indexes] + # sid, tid==hw + for i, tid in enumerate(frame.tids): + # hwl = vallex.get_token(tid)["lemma"] + hwl = _tid_to_lemma(tid, frame.sentences[i]) + if hwl not in frame.aggr_sent: + frame.aggr_sent[hwl] = [] + frame.aggr_sent[hwl].append(i) + return ret_frames # input: functor, reduce_function @app.route("/api/functor-frames") @@ -302,12 +306,17 @@ def api_get_functor_frames(): for ent in cur: frames += frames_from_db_entry(ent) # pre-process this step for prod TODO - for f in frames: - print(f.to_json()) + # filter by relevant functor + frames = [x for x in frames if functor in 
x.get_functors()] # raw_frames = vallex.functors_index[functor] # TODO ret_frames = RF(frames, valdb[SENSEMAP_COLL]) - return prepare_frames(ret_frames) + ret_frames = _aggregate_by_hw(ret_frames) + + json_ret = {"frames": []} + for frame in ret_frames: + json_ret["frames"].append(DC(frame.to_json())) + return json.dumps(json_ret) # FRAMES ----------------------------^ @@ -422,33 +431,8 @@ def api_senses_update(): # APP PREFLIGHT ---------------------. -def prepare_db(): - def helper_tid_to_token(tid, tokens): - for t in tokens: - if t["tid"] == tid: - return t - return None - - # update entries (add headwords and fuctors for indexing) - for corpus in CORPORA: - for e in valdb[corpus].find({}): - if e["srl_links"] is None: - e["headwords"] = [] - e["functors"] = [] - else: - hw_tids = list(set([x["from"] for x in e["srl_links"]])) - hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] - headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] - e["headwords"] = headwords - - functors = list(set([x["afun"] for x in e["srl_links"]])) - e["functors"] = functors - - valdb[corpus].save(e) - - valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) - valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) - +def prepare_app_index(): + log.info("[*] preparing app_index") # create app_index (used in frontend, left side word index) tmp_app_index = {c: {} for c in CORPORA} for corpus in CORPORA: @@ -495,6 +479,7 @@ if __name__ == "__main__": print("Starting app.py main()") aparser = argparse.ArgumentParser(description="Arguments for app.py") aparser.add_argument("--config-file", type=str, help="check ./conf_files/") + aparser.add_argument('--prepare-db', action="store_true", default=False) aparser.add_argument("--dbuser", type=str) aparser.add_argument("--dbpass", type=str) aparser.add_argument("--dbaddr", type=str) @@ -521,8 +506,9 @@ if __name__ == "__main__": ) valdb = client.valdb - if 
bool(config["prepare_db"]): - prepare_db() + if args.prepare_db: + prepare_app_index() + sys.exit() # app index from db app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"] diff --git a/src/backend_flask/conf_files/dev_conf.yaml b/src/backend_flask/conf_files/dev_conf.yaml index 7dd4f53..31dd233 100644 --- a/src/backend_flask/conf_files/dev_conf.yaml +++ b/src/backend_flask/conf_files/dev_conf.yaml @@ -3,5 +3,4 @@ debug: True port: 5004 host: localhost logfile: "/var/log/valency_backend.log" -prepare_db: False --- diff --git a/src/backend_flask/conf_files/dev_conf_init.yaml b/src/backend_flask/conf_files/dev_conf_init.yaml deleted file mode 100644 index 3753cc7..0000000 --- a/src/backend_flask/conf_files/dev_conf_init.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -debug: True -port: 5004 -host: localhost -logfile: "/var/log/valency_backend.log" -prepare_db: True ---- diff --git a/src/backend_flask/preprocess.py b/src/backend_flask/preprocess.py new file mode 100644 index 0000000..4f26474 --- /dev/null +++ b/src/backend_flask/preprocess.py @@ -0,0 +1,70 @@ +CORPORA = ["kres", "ssj"] + +if __name__ == "__main__": + + valdb = None + + def helper_tid_to_token(tid, tokens): + for t in tokens: + if t["tid"] == tid: + return t + return None + + # update entries (add headwords and functors for indexing) + for corpus in CORPORA: + for e in valdb[corpus].find({}): + if e["srl_links"] is None: + e["headwords"] = [] + e["functors"] = [] + else: + hw_tids = list(set([x["from"] for x in e["srl_links"]])) + hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] + headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] + e["headwords"] = headwords + + functors = list(set([x["afun"] for x in e["srl_links"]])) + e["functors"] = functors + + valdb[corpus].save(e) + + valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)]) + valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)]) + + # create app_index (used in 
frontend, left side word index) + tmp_app_index = {c: {} for c in CORPORA} + for corpus in CORPORA: + res_hws = {} + res_fns = {} + for e in valdb[corpus].find({}): + if "headwords" not in e: + continue + for hw in e["headwords"]: + if hw in res_hws: + res_hws[hw] += 1 + else: + res_hws[hw] = 1 + if "functors" not in e: + continue + for fn in e["functors"]: + if fn in res_fns: + res_fns[fn] += 1 + else: + res_fns[fn] = 1 + + alphabetical = {} + for k, e in res_hws.items(): + fst = k[0].lower() + if fst in alphabetical: + alphabetical[fst].append((k, e)) + else: + alphabetical[fst] = [(k, e)] + + for k, e in alphabetical.items(): + alphabetical[k] = sorted(e, key=lambda x: x[0]) + tmp_app_index[corpus]["words"] = alphabetical + + functors = [(k, e) for (k, e) in res_fns.items()] + functors = sorted(functors, key=lambda x: x[0]) + tmp_app_index[corpus]["functors"] = functors + + valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True) \ No newline at end of file diff --git a/src/frontend_vue/src/components/MainDispl.vue b/src/frontend_vue/src/components/MainDispl.vue index 2804fef..d007925 100644 --- a/src/frontend_vue/src/components/MainDispl.vue +++ b/src/frontend_vue/src/components/MainDispl.vue @@ -169,7 +169,7 @@ export default { } this.sentences = {} for (var fi in this.frames) { - console.log(this.frames[fi].sentences) + // console.log(this.frames[fi].sentences) for (var si in this.frames[fi].sentences) { var sentence = this.frames[fi].sentences[si] // get ssj_id without .t123 diff --git a/src/pkg/cjvt-corpusparser b/src/pkg/cjvt-corpusparser index 2b7339a..f0b0aba 160000 --- a/src/pkg/cjvt-corpusparser +++ b/src/pkg/cjvt-corpusparser @@ -1 +1 @@ -Subproject commit 2b7339ac5abb52958f7875a3e0a0eb1899728730 +Subproject commit f0b0abac1bd32ad6e9e29e7b737e4162e28568c2 diff --git a/src/pkg/valency/valency/Frame.py b/src/pkg/valency/valency/Frame.py index e02fe9f..26e099b 100644 --- a/src/pkg/valency/valency/Frame.py 
+++ b/src/pkg/valency/valency/Frame.py @@ -50,6 +50,10 @@ class Frame(): self.sentences = sentences self.aggr_sent = None # Dictionary { hw: self.sentences idx } + def get_functors(self): + return [slot.functor for slot in self.slots] + + def to_json(self): ret = { "hw": self.hw,