# cjvt-valency/src/backend_flask/build_app_index.py

import argparse
import json
from collections import Counter
from pathlib import Path

from flask import Flask
from flask_pymongo import PyMongo

# Flask application object. In this file it only carries configuration and
# is handed to PyMongo; no routes are registered here.
app = Flask(__name__)

# Load settings from the importable ``db_config`` object.
# NOTE(review): presumably this supplies the MongoDB connection settings that
# PyMongo reads from the app config -- confirm against db_config.
app.config.from_object("db_config")
mongo = PyMongo(app)

# Headwords that _is_banned() always rejects, so they never reach the index.
app.config["BANNED_HEADWORDS"] = ["biti"]

def _is_banned(hw):
    """Tell whether headword ``hw`` must be left out of the app index.

    A headword is accepted only when it is not listed in
    ``app.config["BANNED_HEADWORDS"]`` and either ``hw`` itself or
    ``hw + " se"`` occurs in ``sskj_wordlist["wordlist"]``. Everything
    else is reported as banned.

    Returns:
        ``True`` if the headword is to be excluded, ``False`` otherwise.
    """
    # The explicit ban list wins before the word list is consulted at all.
    if hw in app.config["BANNED_HEADWORDS"]:
        return True

    known_words = sskj_wordlist["wordlist"]
    is_known = hw in known_words or (hw + " se") in known_words
    return not is_known


def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Build the frontend word/functor index and write it to a JSON file.

    For every corpus in ``corporas`` the MongoDB collection of the same name
    is scanned and the occurrences of each headword and functor are counted.
    The result is stored per corpus as:

    * ``index[corpus]["words"]``: a mapping from the lower-cased first
      character of a headword to the sorted list of ``(headword, count)``
      pairs starting with it, with headwords rejected by ``_is_banned``
      removed;
    * ``index[corpus]["functors"]``: the sorted list of
      ``(functor, count)`` pairs.

    Entries without a ``"headwords"`` field are ignored completely; entries
    without a ``"functors"`` field contribute headwords only.

    Args:
        appindex_json: Path of the JSON file the index is written to.
        corporas: Names of the corpora (MongoDB collections) to index.
        previous_json: Optional path of an existing index file. Its content
            is the starting point, so corpora not listed in ``corporas``
            are carried over unchanged, while listed corpora are rebuilt
            from scratch.
    """
    if previous_json:
        with Path(previous_json).open("r", encoding="utf-8") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # Listed corpora are always rebuilt, even when the previous index
    # already holds data for them.
    for corpus in corporas:
        tmp_app_index[corpus] = {}

    for corpus in corporas:
        collection = mongo.db[corpus]
        headword_counts = Counter()
        functor_counts = Counter()

        # Collection.count() is deprecated since PyMongo 3.7 and removed in
        # 4.0; count_documents({}) is its replacement.
        nentries = collection.count_documents({})

        # Number every scanned entry so the progress report can actually
        # reach ``nentries``, including entries that are skipped below.
        for idx, entry in enumerate(collection.find({}), start=1):
            if "headwords" in entry:
                for hw in entry["headwords"]:
                    headword_counts[hw] += 1
                for fn in entry.get("functors", ()):
                    functor_counts[fn] += 1
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        # Group headwords by their lower-cased first character.
        by_letter = {}
        for hw, count in headword_counts.items():
            if not hw:
                # An empty headword has no first character to file it under.
                continue
            by_letter.setdefault(hw[0].lower(), []).append((hw, count))

        # Filtering happens after grouping on purpose: a letter whose
        # headwords are all banned still appears, with an empty list.
        tmp_app_index[corpus]["words"] = {
            letter: sorted(
                (item for item in words if not _is_banned(item[0])),
                key=lambda item: item[0],
            )
            for letter, words in by_letter.items()
        }

        tmp_app_index[corpus]["functors"] = sorted(
            functor_counts.items(), key=lambda item: item[0])

    with Path(appindex_json).open("w", encoding="utf-8") as fp:
        json.dump(tmp_app_index, fp)

# Script entry point: parse the command line, load the SSKJ word list into
# the module-level ``sskj_wordlist`` that _is_banned() reads, and build the
# index for the hard-coded corpus list.
if __name__ == "__main__":
    print("Starting build_app_index.py main()")
    aparser = argparse.ArgumentParser(
        description="Arguments for build_app_index.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are opened unconditionally below, so fail early with a
    # usage message instead of a TypeError from Path(None).
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    corporas = ['gigafida']

    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with Path(args.sskj_wordlist).open("r", encoding="utf-8") as fp:
        sskj_wordlist = json.load(fp)

    # _is_banned() probes the word list up to twice per headword; a set
    # makes each probe a hash lookup instead of a scan over a JSON list.
    if isinstance(sskj_wordlist, dict) and isinstance(
            sskj_wordlist.get("wordlist"), list):
        sskj_wordlist["wordlist"] = set(sskj_wordlist["wordlist"])

    prepare_app_index(args.appindex_json, corporas, args.previous_json)