"""Build the JSON app index (per-corpus headword and functor counts).

The resulting file is used by the frontend for its left-side word index.
Run as a script:

    --appindex-json PATH    output file (required)
    --sskj-wordlist PATH    JSON file with a "wordlist" entry (required)
    --previous-json PATH    existing index to extend (optional)
"""
import argparse
import json
from collections import Counter
from pathlib import Path

from flask import Flask
from flask_pymongo import PyMongo

app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)

app.config["BANNED_HEADWORDS"] = ["biti"]

# Print a progress line once per this many scanned entries.
_PROGRESS_STEP = 10000


def _is_banned(hw):
    """Return True when headword ``hw`` must be left out of the index.

    A headword is banned when it is listed in
    ``app.config["BANNED_HEADWORDS"]``, or when neither ``hw`` nor
    ``hw + " se"`` occurs in ``sskj_wordlist["wordlist"]``.

    NOTE: ``sskj_wordlist`` is a module-level global that is only assigned
    in the ``__main__`` block below; it must exist before this is called.
    """
    if hw in app.config["BANNED_HEADWORDS"]:
        return True
    wordlist = sskj_wordlist["wordlist"]
    return not (hw in wordlist or (hw + " se") in wordlist)


def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Count headwords and functors per corpus and dump them as JSON.

    Args:
        appindex_json: Path of the JSON file to write.
        corporas: Iterable of MongoDB collection names to index.
        previous_json: Optional path of an existing index. Its content is
            loaded first; entries for the corpora in ``corporas`` are
            replaced, all other entries are written back unchanged.

    The written structure is::

        {corpus: {"words": {first_letter: [[headword, count], ...]},
                  "functors": [[functor, count], ...]}}

    Headwords are grouped by their lower-cased first character, filtered
    through ``_is_banned`` and sorted; functors are sorted by name.
    """
    if previous_json:
        with Path(previous_json).open("r", encoding="utf-8") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # Drop stale data for every corpus that is about to be rebuilt.
    for corpus in corporas:
        tmp_app_index[corpus] = {}

    for corpus in corporas:
        headword_counts = Counter()
        functor_counts = Counter()

        collection = mongo.db[corpus]
        # Collection.count() was removed in PyMongo 4; the total is only
        # shown in the progress output, so the cheap estimate suffices.
        nentries = collection.estimated_document_count()

        # enumerate() keeps the progress counter in step with the cursor
        # even for entries that contribute nothing to the index.
        for idx, entry in enumerate(collection.find({}), start=1):
            # Entries without headwords are ignored entirely (their
            # functors are not counted either).
            if "headwords" in entry:
                for hw in entry["headwords"]:
                    headword_counts[hw] += 1
                for fn in entry.get("functors", ()):
                    functor_counts[fn] += 1
            if idx % _PROGRESS_STEP == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        # Group (headword, count) pairs by lower-cased first character.
        grouped = {}
        for hw, count in headword_counts.items():
            if not hw:
                # An empty headword has no first character to file under.
                continue
            grouped.setdefault(hw[0].lower(), []).append((hw, count))

        # Filter after grouping so that a letter whose words are all
        # banned still appears in the index (with an empty list).
        alphabetical = {
            letter: sorted(
                (pair for pair in words if not _is_banned(pair[0])),
                key=lambda pair: pair[0],
            )
            for letter, words in grouped.items()
        }

        tmp_app_index[corpus]["words"] = alphabetical
        tmp_app_index[corpus]["functors"] = sorted(
            functor_counts.items(), key=lambda pair: pair[0])

    with Path(appindex_json).open("w", encoding="utf-8") as fp:
        json.dump(tmp_app_index, fp)


if __name__ == "__main__":
    print("Starting main()")
    aparser = argparse.ArgumentParser(description="Arguments for")
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are opened unconditionally below, so they are mandatory.
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    corporas = ['gigafida']

    with Path(args.sskj_wordlist).open("r", encoding="utf-8") as fp:
        sskj_wordlist = json.load(fp)

    # _is_banned() probes the word list up to twice per headword; a set
    # makes each probe O(1) instead of a linear scan of a JSON list.
    if (isinstance(sskj_wordlist, dict)
            and isinstance(sskj_wordlist.get("wordlist"), list)):
        sskj_wordlist["wordlist"] = set(sskj_wordlist["wordlist"])

    prepare_app_index(args.appindex_json, corporas, args.previous_json)