First commit on scripts branch

2020-09-15 14:08:16 +02:00
parent c803057164
commit 3d91251905
23 changed files with 2032 additions and 1256209 deletions
--- a/src/backend_flask/app.py
+++ b/src/backend_flask/app.py
@@ -37,8 +37,8 @@ app = Flask(__name__)
 app.config.from_object("db_config")
 mongo = PyMongo(app)

-# app.config["CORPORA"] = ["ssj", "kres"]
-app.config["CORPORA"] = ["ssj"]
+# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
+app.config["CORPORA"] = ["gigafida"]
 app.config["BANNED_HEADWORDS"] = ["biti"]
 app.config["QUERY_LIMIT"] = 1000

@@ -248,20 +248,23 @@ def api_get_frames():
    if corpus not in app.config["CORPORA"]:
        return json.dumps({"error": "cor={kres,ssj}"})

+    log.info("Test1")
    cur = mongo.db[corpus].find({"headwords": hw})
+    log.info("Test2")
    frames = []
    for ent in cur[:app.config["QUERY_LIMIT"]]:
        frames += frames_from_db_entry(ent)  # pre-process this step for prod TODO
    cur.close()
-
+    log.info("Test3")
    # filter by relevant hw
    frames = [x for x in frames if x.hw == hw]

    ret_frames = RF(frames, mongo.db.sensemap)
-
+    log.info("Test3")
    json_ret = {"frames": []}
    for frame in ret_frames:
        json_ret["frames"].append(frame.to_json())
+    log.info("Test4")
    return json.dumps(json_ret)
    # return prepare_frames(ret_frames)

@@ -445,7 +448,7 @@ def _is_banned(hw):
        banned = False
    return banned

-def prepare_app_index(appindex_json, sskj_wordlist):
+def prepare_app_index(appindex_json):
    log.info("[*] preparing app_index")
    # create app_index (used in frontend, left side word index)
    tmp_app_index = {c: {} for c in app.config["CORPORA"]}
@@ -453,18 +456,17 @@ def prepare_app_index(appindex_json, sskj_wordlist):
        res_hws = {}
        res_fns = {}

-        print('CORPUS...!!...')
-        print(corpus)
-        a = mongo.db[corpus]
-        print('TEST_OK')
-        print(a)
-        print(mongo.db)
-        a = mongo.db.list_collection_names()
-        print('TEST_OK2')
+        # print('CORPUS...!!...')
+        # print(corpus)
+        # a = mongo.db[corpus]
+        # print('TEST_OK')
+        # print(a)
+        # print(mongo.db)
+        # a = mongo.db.list_collection_names()
+        # print('TEST_OK2')
        nentries = mongo.db[corpus].count()
        idx = 0
        for e in mongo.db[corpus].find({}):
-            print('aaa')
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
@@ -494,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):

        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
+            # filtered_words = [x for x in words]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus]["words"] = alphabetical
@@ -570,12 +573,16 @@ if __name__ == "__main__":
    if args.prepare_db:
        with Path(args.sskj_wordlist).open("r") as fp:
            sskj_wordlist = json.load(fp)
-        prepare_app_index(args.appindex_json, sskj_wordlist)
+        prepare_app_index(args.appindex_json)
        sys.exit()

    # app index from db
    with Path(args.appindex_json).open("r") as fp:
        app.config["app_index"] = json.load(fp)
+        # a = app.config["app_index"]
+        # b = app.config["app_index"]["kres"]
+        # c = app.config["app_index"]["kres"]["words"]
+        # print('HERE')

    # log.info("[*] Starting app.py with config:\n%s".format(config))
    log.info("[*] Starting app.py with config:\n{}".format(config))
--- a/src/backend_flask/build_app_index.py
+++ b/src/backend_flask/build_app_index.py
@@ -0,0 +1,106 @@
+import argparse
+import json
+
+from flask import Flask
+from flask_pymongo import PyMongo
+from pathlib import Path
+
+app = Flask(__name__)
+# Connection settings (e.g. MONGO_URI) are read from the db_config module.
+app.config.from_object("db_config")
+mongo = PyMongo(app)
+# Headwords that _is_banned() rejects before it consults the wordlist.
+app.config["BANNED_HEADWORDS"] = ["biti"]
+
+def _is_banned(hw):
+    """Return True unless `hw` (or `hw + " se"`) is in the SSKJ wordlist.
+
+    Entries of BANNED_HEADWORDS are banned even when the wordlist has them.
+    """
+    if hw in app.config["BANNED_HEADWORDS"]:
+        return True
+    allowed = sskj_wordlist["wordlist"]
+    return not (hw in allowed or (hw + " se") in allowed)
+
+
+def prepare_app_index(appindex_json, corporas, previous_json=None):
+    """Build the per-corpus word/functor index and dump it to JSON.
+
+    For every corpus the output holds two entries:
+      * "words": {first letter: [(headword, count), ...]} sorted by
+        headword, with headwords rejected by _is_banned() removed;
+      * "functors": [(functor, count), ...] sorted by functor.
+
+    Args:
+        appindex_json: path of the JSON file to write.
+        corporas: names of the MongoDB collections (corpora) to index.
+        previous_json: optional path of an existing index to start from;
+            entries for corpora listed in `corporas` are rebuilt.
+
+    Note:
+        _is_banned() reads the module-level `sskj_wordlist`, which is only
+        set by the __main__ block of this script.
+    """
+    if previous_json:
+        # start from an existing index so corpora not listed here are kept
+        with Path(previous_json).open("r") as fp:
+            tmp_app_index = json.load(fp)
+    else:
+        tmp_app_index = {}
+
+    # create app_index (used in frontend, left side word index)
+    for corpus in corporas:
+        res_hws = {}
+        res_fns = {}
+
+        # NOTE: Collection.count() is deprecated since PyMongo 3.7; switch
+        # to count_documents({}) once the installed PyMongo allows it.
+        nentries = mongo.db[corpus].count()
+        # enumerate() counts every scanned entry, including ones without
+        # "headwords"/"functors", so the progress can reach nentries.
+        for idx, e in enumerate(mongo.db[corpus].find({}), start=1):
+            if idx % 10000 == 0:
+                print("indexing {}: {}/{}".format(
+                    corpus, idx, nentries))
+            if "headwords" not in e:
+                continue
+            for hw in e["headwords"]:
+                res_hws[hw] = res_hws.get(hw, 0) + 1
+            for fn in e.get("functors", ()):
+                res_fns[fn] = res_fns.get(fn, 0) + 1
+
+        # group (headword, count) pairs by lowercased first letter
+        alphabetical = {}
+        for k, e in res_hws.items():
+            if not k:
+                # an empty headword has no first letter; k[0] would raise
+                continue
+            alphabetical.setdefault(k[0].lower(), []).append((k, e))
+
+        for letter, words in alphabetical.items():
+            filtered_words = [x for x in words if not _is_banned(x[0])]
+            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
+
+        functors = sorted(res_fns.items(), key=lambda x: x[0])
+        tmp_app_index[corpus] = {
+            "words": alphabetical,
+            "functors": functors,
+        }
+
+    with Path(appindex_json).open("w") as fp:
+        json.dump(tmp_app_index, fp)
+
+if __name__ == "__main__":
+    print("Starting build_app_index.py main()")
+    aparser = argparse.ArgumentParser(
+        description="Arguments for build_app_index.py")
+    aparser.add_argument("--previous-json", type=str, default=None)
+    aparser.add_argument("--appindex-json", type=str, required=True)
+    aparser.add_argument("--sskj-wordlist", type=str, required=True)
+    args = aparser.parse_args()
+    corporas = ['gigafida']
+    # sskj_wordlist must stay a module-level name: _is_banned() reads it.
+    with Path(args.sskj_wordlist).open("r") as fp:
+        sskj_wordlist = json.load(fp)
+
+    prepare_app_index(args.appindex_json, corporas, args.previous_json)
--- a/src/backend_flask/db_config.py
+++ b/src/backend_flask/db_config.py
@@ -1,2 +1,2 @@
-MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
+MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
 MONGO_AUTH_SOURCE = 'admin'
--- a/src/backend_flask/get_sentence_ids.py
+++ b/src/backend_flask/get_sentence_ids.py
@@ -0,0 +1,18 @@
+import json
+import os
+
+input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
+output_file = "../../all_sentences.json"
+# Map each file-name prefix (text before the first '-') to that file's JSON keys.
+results = {}
+filenames = os.listdir(input_dir)
+total = len(filenames)
+for i, filename in enumerate(filenames, start=1):
+    if filename.endswith(".json"):
+        with open(os.path.join(input_dir, filename)) as json_file:
+            data = json.load(json_file)
+            results[filename.split('-')[0]] = list(data.keys())
+        print('Progress: %.2f %%' % (100.0 * i / total))  # i is 1-based
+
+with open(output_file, 'w') as f:
+    json.dump(results, f)