# Forked from kristjan/cjvt-valency.
# Deprecated: headword creation has been moved into corpusparser, and
# index creation has been moved to app.py as a preprocessing (with exit) step.

from collections import Counter, defaultdict

# Names of the per-corpus collections this script processes.
CORPORA = ["kres", "ssj"]


def helper_tid_to_token(tid, tokens):
    """Return the first token in *tokens* whose "tid" equals *tid*.

    Args:
        tid: Token id to look for.
        tokens: Iterable of token dicts, each carrying a "tid" key.

    Returns:
        The matching token dict, or None when no token matches.
    """
    return next((token for token in tokens if token["tid"] == tid), None)


def helper_entry_headwords(entry):
    """Build the "headwords" list for one corpus entry.

    Every distinct "from" token id found in the entry's "srl_links" is
    resolved against the entry's "tokens". A resolved token contributes its
    lemma unchanged when its "msd" tag starts with "G", and its lemma with a
    trailing underscore otherwise.
    NOTE(review): "G" presumably marks verbs in the MSD tagset - confirm.

    Args:
        entry: Entry dict with "tokens" and (optionally) "srl_links".

    Returns:
        List of headword strings in first-seen link order; empty when the
        entry has no SRL links.
    """
    links = entry.get("srl_links")
    if not links:
        return []
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (set() would make the stored list order vary between runs).
    head_tids = dict.fromkeys(link["from"] for link in links)
    headwords = []
    for tid in head_tids:
        token = helper_tid_to_token(tid, entry["tokens"])
        if token is None:
            # Dangling link: the referenced token is not part of this entry.
            continue
        lemma = token["lemma"]
        # startswith() also copes with an empty msd string.
        headwords.append(lemma if token["msd"].startswith("G") else lemma + "_")
    return headwords


def helper_entry_functors(entry):
    """Return the distinct "afun" values of an entry's SRL links.

    Args:
        entry: Entry dict that may carry an "srl_links" list.

    Returns:
        List of functor labels in first-seen order; empty when the entry
        has no SRL links.
    """
    links = entry.get("srl_links")
    if not links:
        return []
    return list(dict.fromkeys(link["afun"] for link in links))


def helper_build_corpus_index(entries):
    """Aggregate headword and functor counts for one corpus.

    Entries without a "headwords" key are ignored entirely (their functors
    are not counted either). Empty headwords are counted nowhere in the
    letter index because they have no first character to file them under.

    Args:
        entries: Iterable of entry dicts carrying "headwords"/"functors".

    Returns:
        Dict with two keys:
        "words": maps a lower-cased first character to a list of
            (headword, count) pairs sorted by headword;
        "functors": list of (functor, count) pairs sorted by functor.
    """
    headword_counts = Counter()
    functor_counts = Counter()
    for entry in entries:
        if "headwords" not in entry:
            continue
        headword_counts.update(entry["headwords"])
        functor_counts.update(entry.get("functors", ()))

    by_letter = defaultdict(list)
    # Iterating in sorted order leaves every per-letter list sorted.
    for headword, count in sorted(headword_counts.items()):
        if not headword:
            continue
        by_letter[headword[0].lower()].append((headword, count))

    return {
        "words": dict(by_letter),
        "functors": sorted(functor_counts.items()),
    }


if __name__ == "__main__":
    # Imported here so the helpers above stay importable without pymongo.
    import pymongo

    # Placeholder left by the deprecation: this has to be a pymongo database
    # handle exposing one collection per corpus plus "appindex".
    valdb = None
    if valdb is None:
        raise SystemExit(
            "This script is deprecated and has no database configured: "
            "set `valdb` to a pymongo database handle before running it."
        )

    # update entries (add headwords and functors for indexing)
    for corpus in CORPORA:
        collection = valdb[corpus]
        for e in collection.find({}):
            # Only the two derived fields change, so $set them instead of
            # rewriting the whole document.
            collection.update_one(
                {"_id": e["_id"]},
                {
                    "$set": {
                        "headwords": helper_entry_headwords(e),
                        "functors": helper_entry_functors(e),
                    }
                },
            )

        collection.create_index([("headwords", pymongo.ASCENDING)])
        collection.create_index([("functors", pymongo.ASCENDING)])

    # create app_index (used in frontend, left side word index)
    tmp_app_index = {
        corpus: helper_build_corpus_index(valdb[corpus].find({}))
        for corpus in CORPORA
    }

    valdb.appindex.replace_one(
        {"dockey": "appindex"},
        {"dockey": "appindex", "data": tmp_app_index},
        upsert=True,
    )