forked from kristjan/cjvt-valency
First commit on scripts branch
This commit is contained in:
@@ -37,8 +37,8 @@ app = Flask(__name__)
|
||||
app.config.from_object("db_config")
|
||||
mongo = PyMongo(app)
|
||||
|
||||
# app.config["CORPORA"] = ["ssj", "kres"]
|
||||
app.config["CORPORA"] = ["ssj"]
|
||||
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
|
||||
app.config["CORPORA"] = ["gigafida"]
|
||||
app.config["BANNED_HEADWORDS"] = ["biti"]
|
||||
app.config["QUERY_LIMIT"] = 1000
|
||||
|
||||
@@ -248,20 +248,23 @@ def api_get_frames():
|
||||
if corpus not in app.config["CORPORA"]:
|
||||
return json.dumps({"error": "cor={kres,ssj}"})
|
||||
|
||||
log.info("Test1")
|
||||
cur = mongo.db[corpus].find({"headwords": hw})
|
||||
log.info("Test2")
|
||||
frames = []
|
||||
for ent in cur[:app.config["QUERY_LIMIT"]]:
|
||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
||||
cur.close()
|
||||
|
||||
log.info("Test3")
|
||||
# filter by relevant hw
|
||||
frames = [x for x in frames if x.hw == hw]
|
||||
|
||||
ret_frames = RF(frames, mongo.db.sensemap)
|
||||
|
||||
log.info("Test3")
|
||||
json_ret = {"frames": []}
|
||||
for frame in ret_frames:
|
||||
json_ret["frames"].append(frame.to_json())
|
||||
log.info("Test4")
|
||||
return json.dumps(json_ret)
|
||||
# return prepare_frames(ret_frames)
|
||||
|
||||
@@ -445,7 +448,7 @@ def _is_banned(hw):
|
||||
banned = False
|
||||
return banned
|
||||
|
||||
def prepare_app_index(appindex_json, sskj_wordlist):
|
||||
def prepare_app_index(appindex_json):
|
||||
log.info("[*] preparing app_index")
|
||||
# create app_index (used in frontend, left side word index)
|
||||
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
|
||||
@@ -453,18 +456,17 @@ def prepare_app_index(appindex_json, sskj_wordlist):
|
||||
res_hws = {}
|
||||
res_fns = {}
|
||||
|
||||
print('CORPUS...!!...')
|
||||
print(corpus)
|
||||
a = mongo.db[corpus]
|
||||
print('TEST_OK')
|
||||
print(a)
|
||||
print(mongo.db)
|
||||
a = mongo.db.list_collection_names()
|
||||
print('TEST_OK2')
|
||||
# print('CORPUS...!!...')
|
||||
# print(corpus)
|
||||
# a = mongo.db[corpus]
|
||||
# print('TEST_OK')
|
||||
# print(a)
|
||||
# print(mongo.db)
|
||||
# a = mongo.db.list_collection_names()
|
||||
# print('TEST_OK2')
|
||||
nentries = mongo.db[corpus].count()
|
||||
idx = 0
|
||||
for e in mongo.db[corpus].find({}):
|
||||
print('aaa')
|
||||
if "headwords" not in e:
|
||||
continue
|
||||
for hw in e["headwords"]:
|
||||
@@ -494,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):
|
||||
|
||||
for letter, words in alphabetical.items():
|
||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
||||
# filtered_words = [x for x in words]
|
||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
||||
|
||||
tmp_app_index[corpus]["words"] = alphabetical
|
||||
@@ -570,12 +573,16 @@ if __name__ == "__main__":
|
||||
if args.prepare_db:
|
||||
with Path(args.sskj_wordlist).open("r") as fp:
|
||||
sskj_wordlist = json.load(fp)
|
||||
prepare_app_index(args.appindex_json, sskj_wordlist)
|
||||
prepare_app_index(args.appindex_json)
|
||||
sys.exit()
|
||||
|
||||
# app index from db
|
||||
with Path(args.appindex_json).open("r") as fp:
|
||||
app.config["app_index"] = json.load(fp)
|
||||
# a = app.config["app_index"]
|
||||
# b = app.config["app_index"]["kres"]
|
||||
# c = app.config["app_index"]["kres"]["words"]
|
||||
# print('HERE')
|
||||
|
||||
# log.info("[*] Starting app.py with config:\n%s".format(config))
|
||||
log.info("[*] Starting app.py with config:\n{}".format(config))
|
||||
|
||||
106
src/backend_flask/build_app_index.py
Normal file
106
src/backend_flask/build_app_index.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from flask import Flask
|
||||
from flask_pymongo import PyMongo
|
||||
from pathlib import Path
|
||||
|
||||
# Flask application object; in this script it only carries configuration.
app = Flask(__name__)

# Load settings from the ``db_config`` module, then add the list of headwords
# that _is_banned() always rejects.
app.config.from_object("db_config")
app.config["BANNED_HEADWORDS"] = ["biti"]

# MongoDB handle bound to the app configuration loaded above.
mongo = PyMongo(app)
|
||||
|
||||
def _is_banned(hw):
    """Return True when headword ``hw`` must be left out of the word index.

    A headword is banned when it is listed in
    ``app.config["BANNED_HEADWORDS"]``. Otherwise it is kept only if ``hw``
    itself, or ``hw`` followed by " se", occurs in
    ``sskj_wordlist["wordlist"]``; everything else is banned.

    Reads the module-level ``sskj_wordlist``, which must be loaded before
    this function is called.
    """
    # Explicitly banned headwords win over the wordlist lookup.
    if hw in app.config["BANNED_HEADWORDS"]:
        return True

    known_words = sskj_wordlist["wordlist"]
    is_known = hw in known_words or (hw + " se") in known_words
    return not is_known
|
||||
|
||||
|
||||
def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Build the frontend word/functor index and write it to a JSON file.

    Each corpus name in ``corporas`` is used as a MongoDB collection name.
    The collection is scanned once; occurrences of every headword and every
    functor are counted. Headwords are grouped by their lower-cased first
    character, filtered with ``_is_banned`` and sorted by headword. Functors
    are sorted by name.

    Args:
        appindex_json: Path of the JSON file to write.
        corporas: Iterable of corpus (collection) names to index.
        previous_json: Optional path of an existing index file. Its content
            is loaded first; entries for corpora listed in ``corporas`` are
            rebuilt from scratch, all other entries are kept unchanged.

    The written object has the shape
    ``{corpus: {"words": {letter: [[hw, count], ...]},
                "functors": [[functor, count], ...]}}``.
    """
    # Local import: the file's import header is left untouched.
    from collections import Counter, defaultdict

    if previous_json:
        with Path(previous_json).open("r", encoding="utf-8") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # create app_index (used in frontend, left side word index)
    for corpus in corporas:
        collection = mongo.db[corpus]
        res_hws = Counter()
        res_fns = Counter()

        # Collection.count() no longer exists in PyMongo 4. The total is only
        # used for the progress message, so the cheap estimate is sufficient.
        nentries = collection.estimated_document_count()

        # enumerate() counts every scanned entry, including those skipped
        # below, so the reported progress matches the collection size.
        for idx, e in enumerate(collection.find({}), start=1):
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                res_hws[hw] += 1
            if "functors" not in e:
                continue
            for fn in e["functors"]:
                res_fns[fn] += 1

        # Group (headword, count) pairs by lower-cased first character.
        grouped = defaultdict(list)
        for hw, count in res_hws.items():
            if not hw:
                # An empty headword has no first character to file it under.
                continue
            grouped[hw[0].lower()].append((hw, count))

        # Filtering happens after grouping on purpose: a letter whose words
        # are all banned still appears in the index, with an empty list.
        alphabetical = {
            letter: sorted(
                (x for x in words if not _is_banned(x[0])),
                key=lambda x: x[0])
            for letter, words in grouped.items()
        }

        functors = sorted(res_fns.items(), key=lambda x: x[0])

        tmp_app_index[corpus] = {
            "words": alphabetical,
            "functors": functors,
        }

    with Path(appindex_json).open("w", encoding="utf-8") as fp:
        json.dump(tmp_app_index, fp)
|
||||
|
||||
# Script entry point: load the SSKJ wordlist and build the app index.
if __name__ == "__main__":
    print("Starting build_app_index.py main()")
    aparser = argparse.ArgumentParser(
        description="Arguments for build_app_index.py")
    # Optional existing index whose other corpora should be preserved.
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are passed straight to Path(); making them required turns a
    # missing option into a clear usage error instead of a TypeError.
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    corporas = ['gigafida']

    # Must stay a module-level name: _is_banned() reads it as a global.
    with Path(args.sskj_wordlist).open("r", encoding="utf-8") as fp:
        sskj_wordlist = json.load(fp)

    prepare_app_index(args.appindex_json, corporas, args.previous_json)
|
||||
@@ -1,2 +1,2 @@
|
||||
MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
|
||||
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
|
||||
MONGO_AUTH_SOURCE = 'admin'
|
||||
|
||||
18
src/backend_flask/get_sentence_ids.py
Normal file
18
src/backend_flask/get_sentence_ids.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
# Collects the top-level keys of every ``*.json`` file in ``input_dir`` and
# writes them to ``output_file`` as one JSON object, keyed by the part of the
# file name before the first '-'.
# NOTE(review): the keys are presumably sentence ids (going by the script
# name) -- confirm against the input data.
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"

results = {}
filenames = os.listdir(input_dir)
total = len(filenames)
for i, filename in enumerate(filenames, start=1):
    if filename.endswith(".json"):
        with open(os.path.join(input_dir, filename),
                  encoding="utf-8") as json_file:
            data = json.load(json_file)
        # Files sharing the same prefix before '-' overwrite each other here.
        results[filename.split('-')[0]] = list(data.keys())
    # Scale to a real percentage; the label says "%".
    print('Progress: %.2f %%' % (100 * i / total))

with open(output_file, 'w') as f:
    json.dump(results, f)
|
||||
Reference in New Issue
Block a user