# cjvt-valency/src/backend_flask/preprocess.py

# Keys used below to look up each corpus collection in `valdb`.
CORPORA = ["kres", "ssj"]

if __name__ == "__main__":

    # NOTE(review): placeholder -- no database handle is created in the code
    # visible here, so every `valdb[...]` access below fails on None.
    # Presumably this should be a pymongo database object; confirm and wire
    # it up before running the script.
    valdb = None

    def helper_tid_to_token(tid, tokens):
        """Return the first token in `tokens` whose "tid" equals `tid`, or None."""
        matches = (tok for tok in tokens if tok["tid"] == tid)
        return next(matches, None)

    # Update entries: add "headwords" and "functors" fields, then index them.
    # NOTE(review): `pymongo` is referenced below but no import is visible in
    # this view -- confirm it is imported at the top of the file.
    for corpus in CORPORA:
        collection = valdb[corpus]
        for e in collection.find({}):
            # A missing, None or empty "srl_links" all mean nothing to index.
            srl_links = e.get("srl_links")
            if not srl_links:
                e["headwords"] = []
                e["functors"] = []
            else:
                # dict.fromkeys de-duplicates while keeping first-seen order, so
                # re-running the script stores the same arrays every time
                # (iteration order of a set is arbitrary).
                hw_tids = list(dict.fromkeys(x["from"] for x in srl_links))
                hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
                # A link whose "from" tid matches no token resolves to None;
                # skip it rather than abort with the collection half-updated.
                # Lemmas of tokens whose msd starts with "G" are stored as-is,
                # all others get a trailing "_" (presumably "G" marks verbs --
                # confirm against the tagset).
                e["headwords"] = [
                    t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_"
                    for t in hw_tokens
                    if t is not None
                ]
                e["functors"] = list(dict.fromkeys(x["afun"] for x in srl_links))

            # Collection.save() is deprecated since PyMongo 3.0 and removed in
            # PyMongo 4; replace_one with upsert is its equivalent for a
            # document that already carries an _id.
            collection.replace_one({"_id": e["_id"]}, e, upsert=True)

        # Collection.ensure_index() was removed in PyMongo 4; create_index is
        # the replacement and is a no-op when the index already exists.
        collection.create_index([("headwords", pymongo.ASCENDING)])
        collection.create_index([("functors", pymongo.ASCENDING)])

    # Create app_index (used in the frontend for the left-side word index).
    # Per corpus it holds:
    #   "words":    {first letter (lower-cased): [(headword, count), ...]}
    #   "functors": [(functor, count), ...]
    # where count is the number of entries listing that headword/functor and
    # each list is sorted by name.
    tmp_app_index = {c: {} for c in CORPORA}
    for corpus in CORPORA:
        hw_counts = {}
        fn_counts = {}
        for entry in valdb[corpus].find({}):
            # Entries without "headwords" contribute nothing at all; their
            # functors (if any) are skipped as well.
            if "headwords" not in entry:
                continue
            for hw in entry["headwords"]:
                hw_counts[hw] = hw_counts.get(hw, 0) + 1
            for fn in entry.get("functors", ()):
                fn_counts[fn] = fn_counts.get(fn, 0) + 1

        # Walk the headwords in sorted order so every bucket ends up sorted
        # without a second pass.
        alphabetical = {}
        for hw, count in sorted(hw_counts.items(), key=lambda item: item[0]):
            if not hw:
                # An empty headword has no first character to bucket under.
                continue
            alphabetical.setdefault(hw[0].lower(), []).append((hw, count))
        tmp_app_index[corpus]["words"] = alphabetical

        tmp_app_index[corpus]["functors"] = sorted(
            fn_counts.items(), key=lambda item: item[0]
        )

    # Collection.update() was removed in PyMongo 4. The second argument is a
    # whole document (no $-operators), so replace_one is the equivalent call.
    valdb.appindex.replace_one(
        {"dockey": "appindex"},
        {"dockey": "appindex", "data": tmp_app_index},
        upsert=True,
    )