forked from kristjan/cjvt-valency
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
107 lines
3.1 KiB
107 lines
3.1 KiB
import argparse
|
|
import json
|
|
|
|
from flask import Flask
|
|
from flask_pymongo import PyMongo
|
|
from pathlib import Path
|
|
|
|
# Flask application object. In this script it serves as the holder of the
# configuration and as the handle PyMongo is bound to.
app = Flask(__name__)

# Pull settings from the object importable as "db_config".
# NOTE(review): presumably this supplies the MongoDB connection settings
# that PyMongo reads below -- confirm against db_config.
app.config.from_object("db_config")

# Headwords that are excluded from the index no matter what the word list
# says (checked first in _is_banned).
app.config["BANNED_HEADWORDS"] = ["biti"]

# Database handle; collections are accessed as mongo.db[<corpus name>].
mongo = PyMongo(app)
|
|
|
|
def _is_banned(hw):
    """Tell whether headword ``hw`` must be left out of the word index.

    A headword is banned when it is listed in
    ``app.config["BANNED_HEADWORDS"]``, or when neither ``hw`` itself nor
    ``hw + " se"`` is found in ``sskj_wordlist["wordlist"]``.

    ``sskj_wordlist`` is a module-level global that is only assigned when the
    file is run as a script, so it must exist before this function is called.

    Returns:
        True if the headword should be filtered out, False otherwise.
    """
    # The explicit ban list takes precedence over the word list.
    if hw in app.config["BANNED_HEADWORDS"]:
        return True

    known_words = sskj_wordlist["wordlist"]
    is_known = hw in known_words or (hw + " se") in known_words
    return not is_known
|
|
|
|
|
|
def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Build the frontend word/functor index and write it to a JSON file.

    For each corpus name in ``corporas`` the MongoDB collection of the same
    name is scanned and two frequency tables are collected:

    * ``"words"``: headwords grouped by their lower-cased first character;
      each group is a list of ``(headword, count)`` pairs sorted by headword,
      with headwords rejected by ``_is_banned`` removed.
    * ``"functors"``: a list of ``(functor, count)`` pairs sorted by functor.

    Entries without a ``"headwords"`` key are ignored entirely; entries
    without a ``"functors"`` key contribute headwords only.

    Args:
        appindex_json: Path of the JSON file the index is written to.
        corporas: Corpus names; each one is used both as the MongoDB
            collection name and as a top-level key of the index.
        previous_json: Optional path of an existing index file. Its content
            is the starting point; the entries of the corpora listed in
            ``corporas`` are rebuilt from scratch, other keys are kept.
    """
    if previous_json:
        with Path(previous_json).open("r", encoding="utf-8") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # create app_index (used in frontend, left side word index)
    for corpus in corporas:
        collection = mongo.db[corpus]
        res_hws = {}
        res_fns = {}

        # Collection.count() was removed in PyMongo 4; count_documents({})
        # is the supported way to get the exact number of documents.
        nentries = collection.count_documents({})

        for idx, entry in enumerate(collection.find({}), start=1):
            if "headwords" in entry:
                for hw in entry["headwords"]:
                    res_hws[hw] = res_hws.get(hw, 0) + 1
                # Functors are only counted for entries that have headwords.
                for fn in entry.get("functors", ()):
                    res_fns[fn] = res_fns.get(fn, 0) + 1

            # Count every scanned entry so the progress matches nentries.
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        # Group headwords by first letter. Grouping happens before filtering
        # so that a letter whose words are all banned still appears in the
        # index (with an empty list).
        grouped = {}
        for hw, count in res_hws.items():
            if not hw:
                # An empty headword has no first character to group by.
                continue
            grouped.setdefault(hw[0].lower(), []).append((hw, count))

        alphabetical = {
            letter: sorted(
                (pair for pair in words if not _is_banned(pair[0])),
                key=lambda pair: pair[0],
            )
            for letter, words in grouped.items()
        }

        functors = sorted(res_fns.items(), key=lambda pair: pair[0])

        tmp_app_index[corpus] = {
            "words": alphabetical,
            "functors": functors,
        }

    with Path(appindex_json).open("w", encoding="utf-8") as fp:
        json.dump(tmp_app_index, fp)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line, load the word list used by
    # _is_banned, and build the index for the configured corpora.
    print("Starting app.py main()")
    aparser = argparse.ArgumentParser(description="Arguments for app.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are mandatory; without required=True a missing option only
    # surfaced later as a TypeError from Path(None).
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    corporas = ['gigafida']

    # sskj_wordlist is deliberately a module-level global: _is_banned reads
    # it. JSON is read as UTF-8 so the result does not depend on the locale.
    with Path(args.sskj_wordlist).open("r", encoding="utf-8") as fp:
        sskj_wordlist = json.load(fp)

    # _is_banned performs membership tests for every headword; turn a list
    # into a set once so each test is a hash lookup instead of a linear scan.
    if (isinstance(sskj_wordlist, dict)
            and isinstance(sskj_wordlist.get("wordlist"), list)):
        sskj_wordlist["wordlist"] = frozenset(sskj_wordlist["wordlist"])

    prepare_app_index(args.appindex_json, corporas, args.previous_json)
|