First commit on scripts branch

This commit is contained in:
2020-09-15 14:08:16 +02:00
parent c803057164
commit 3d91251905
23 changed files with 2032 additions and 1256209 deletions

View File

@@ -37,8 +37,8 @@ app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
# app.config["CORPORA"] = ["ssj", "kres"]
app.config["CORPORA"] = ["ssj"]
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
app.config["CORPORA"] = ["gigafida"]
app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000
@@ -248,20 +248,23 @@ def api_get_frames():
if corpus not in app.config["CORPORA"]:
return json.dumps({"error": "cor={kres,ssj}"})
log.info("Test1")
cur = mongo.db[corpus].find({"headwords": hw})
log.info("Test2")
frames = []
for ent in cur[:app.config["QUERY_LIMIT"]]:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
log.info("Test3")
# filter by relevant hw
frames = [x for x in frames if x.hw == hw]
ret_frames = RF(frames, mongo.db.sensemap)
log.info("Test3")
json_ret = {"frames": []}
for frame in ret_frames:
json_ret["frames"].append(frame.to_json())
log.info("Test4")
return json.dumps(json_ret)
# return prepare_frames(ret_frames)
@@ -445,7 +448,7 @@ def _is_banned(hw):
banned = False
return banned
def prepare_app_index(appindex_json, sskj_wordlist):
def prepare_app_index(appindex_json):
log.info("[*] preparing app_index")
# create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
@@ -453,18 +456,17 @@ def prepare_app_index(appindex_json, sskj_wordlist):
res_hws = {}
res_fns = {}
print('CORPUS...!!...')
print(corpus)
a = mongo.db[corpus]
print('TEST_OK')
print(a)
print(mongo.db)
a = mongo.db.list_collection_names()
print('TEST_OK2')
# print('CORPUS...!!...')
# print(corpus)
# a = mongo.db[corpus]
# print('TEST_OK')
# print(a)
# print(mongo.db)
# a = mongo.db.list_collection_names()
# print('TEST_OK2')
nentries = mongo.db[corpus].count()
idx = 0
for e in mongo.db[corpus].find({}):
print('aaa')
if "headwords" not in e:
continue
for hw in e["headwords"]:
@@ -494,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
# filtered_words = [x for x in words]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
@@ -570,12 +573,16 @@ if __name__ == "__main__":
if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index(args.appindex_json, sskj_wordlist)
prepare_app_index(args.appindex_json)
sys.exit()
# app index from db
with Path(args.appindex_json).open("r") as fp:
app.config["app_index"] = json.load(fp)
# a = app.config["app_index"]
# b = app.config["app_index"]["kres"]
# c = app.config["app_index"]["kres"]["words"]
# print('HERE')
# log.info("[*] Starting app.py with config:\n%s".format(config))
log.info("[*] Starting app.py with config:\n{}".format(config))

View File

@@ -0,0 +1,106 @@
import argparse
import json
from flask import Flask
from flask_pymongo import PyMongo
from pathlib import Path
# Flask application object. No routes are registered in the visible part of
# this script; it serves as the configuration holder handed to PyMongo.
app = Flask(__name__)
# Load settings from the object importable as "db_config".
# NOTE(review): presumably the module that defines MONGO_URI — confirm.
app.config.from_object("db_config")
# PyMongo wrapper bound to the app; collections are accessed as mongo.db[name].
mongo = PyMongo(app)
# Headwords that _is_banned() rejects regardless of the SSKJ wordlist.
app.config["BANNED_HEADWORDS"] = ["biti"]
def _is_banned(hw):
    """Tell whether headword ``hw`` must be left out of the app index.

    A headword is banned when it is listed in
    ``app.config["BANNED_HEADWORDS"]``, or when neither ``hw`` nor
    ``hw + " se"`` occurs in ``sskj_wordlist["wordlist"]``.

    Relies on the module-level ``sskj_wordlist`` having been loaded before
    the first call.
    """
    # The explicit ban list wins over the wordlist lookup.
    if hw in app.config["BANNED_HEADWORDS"]:
        return True

    known_words = sskj_wordlist["wordlist"]
    # Allowed only if the bare form or the " se" form is in the wordlist.
    return hw not in known_words and (hw + " se") not in known_words
def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Build the word/functor index for the given corpora and save it as JSON.

    For every corpus name in ``corporas`` the matching MongoDB collection
    (``mongo.db[corpus]``) is scanned and two structures are produced:

    * ``"words"``: ``{first_letter: [(headword, count), ...]}`` with banned
      headwords (see ``_is_banned``) removed and each list sorted by headword;
    * ``"functors"``: ``[(functor, count), ...]`` sorted by functor.

    Args:
        appindex_json: Path of the JSON file to write the index to.
        corporas: Iterable of corpus (collection) names to index.
        previous_json: Optional path of an existing index file. When given,
            its content is loaded first; entries for corpora not listed in
            ``corporas`` are kept, entries for listed corpora are rebuilt.
    """
    if previous_json:
        with Path(previous_json).open("r") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # create app_index (used in frontend, left side word index)
    for c in corporas:
        tmp_app_index[c] = {}

    for corpus in corporas:
        res_hws = {}
        res_fns = {}

        collection = mongo.db[corpus]
        # Collection.count() is deprecated (removed in PyMongo 4). The total
        # is only used for the progress message, so the cheap metadata-based
        # estimate is sufficient.
        nentries = collection.estimated_document_count()

        # Count every scanned document for progress reporting, including the
        # ones skipped below, so that the counter can actually reach nentries.
        for idx, e in enumerate(collection.find({}), start=1):
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                res_hws[hw] = res_hws.get(hw, 0) + 1

            if "functors" not in e:
                continue
            for fn in e["functors"]:
                res_fns[fn] = res_fns.get(fn, 0) + 1

        # Group headwords by their lower-cased first character.
        alphabetical = {}
        for k, e in res_hws.items():
            if not k:
                # An empty headword has no first letter to file it under.
                continue
            alphabetical.setdefault(k[0].lower(), []).append((k, e))

        # Drop banned headwords and sort each letter bucket by headword.
        # A letter whose words are all banned is kept with an empty list.
        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus]["words"] = alphabetical
        tmp_app_index[corpus]["functors"] = sorted(
            res_fns.items(), key=lambda x: x[0])

    with Path(appindex_json).open("w") as fp:
        json.dump(tmp_app_index, fp)
if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, load the SSKJ wordlist into
    # the module-level ``sskj_wordlist`` (read by _is_banned) and build the
    # app index for the configured corpora.
    print("Starting app.py main()")
    aparser = argparse.ArgumentParser(description="Arguments for app.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are mandatory in practice: without them Path(None) raises a
    # TypeError -- for --appindex-json only after the whole indexing run has
    # finished. Let argparse reject the invocation up front instead.
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    # Corpora (MongoDB collection names) to index.
    corporas = ['gigafida']

    # Must stay a module-level name: _is_banned() looks it up as a global.
    with Path(args.sskj_wordlist).open("r") as fp:
        sskj_wordlist = json.load(fp)

    prepare_app_index(args.appindex_json, corporas, args.previous_json)

View File

@@ -1,2 +1,2 @@
MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
MONGO_AUTH_SOURCE = 'admin'

View File

@@ -0,0 +1,18 @@
import json
import os
# Collect the top-level keys of every ``*.json`` file in ``input_dir`` and
# write them to ``output_file`` as one mapping:
#     {<file name up to the first '-'>: [<top-level keys of that file>]}
# Files sharing the same prefix before the first '-' overwrite each other.
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"

results = {}
filenames = os.listdir(input_dir)
total = len(filenames)

for i, filename in enumerate(filenames, start=1):
    if filename.endswith(".json"):
        with open(os.path.join(input_dir, filename)) as json_file:
            data = json.load(json_file)
        # .keys() requires the top level of each file to be a JSON object.
        results[filename.split('-')[0]] = list(data.keys())
    # Scale to an actual percentage; counting from 1 lets the last file
    # report 100 %.
    print('Progress: %.2f %%' % (100 * i / total))

with open(output_file, 'w') as f:
    json.dump(results, f)

View File

@@ -1,3 +1,3 @@
{
"api_addr": "http://193.2.76.103:8084"
"api_addr": "http://0.0.0.0:8084"
}

View File

@@ -3513,14 +3513,12 @@
"balanced-match": {
"version": "1.0.0",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"brace-expansion": {
"version": "1.1.11",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
@@ -3535,20 +3533,17 @@
"code-point-at": {
"version": "1.1.0",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"concat-map": {
"version": "0.0.1",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"console-control-strings": {
"version": "1.1.0",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"core-util-is": {
"version": "1.0.2",
@@ -3665,8 +3660,7 @@
"inherits": {
"version": "2.0.3",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"ini": {
"version": "1.3.5",
@@ -3678,7 +3672,6 @@
"version": "1.0.0",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"number-is-nan": "^1.0.0"
}
@@ -3693,7 +3686,6 @@
"version": "3.0.4",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"brace-expansion": "^1.1.7"
}
@@ -3701,14 +3693,12 @@
"minimist": {
"version": "0.0.8",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"minipass": {
"version": "2.3.5",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"safe-buffer": "^5.1.2",
"yallist": "^3.0.0"
@@ -3727,7 +3717,6 @@
"version": "0.5.1",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"minimist": "0.0.8"
}
@@ -3808,8 +3797,7 @@
"number-is-nan": {
"version": "1.0.1",
"bundled": true,
"dev": true,
"optional": true
"dev": true
},
"object-assign": {
"version": "4.1.1",
@@ -3821,7 +3809,6 @@
"version": "1.4.0",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"wrappy": "1"
}
@@ -3943,7 +3930,6 @@
"version": "1.0.2",
"bundled": true,
"dev": true,
"optional": true,
"requires": {
"code-point-at": "^1.0.0",
"is-fullwidth-code-point": "^1.0.0",

View File

@@ -62,7 +62,7 @@ export default {
name: "Nav",
props: ["appState"],
data() {return {
optCorpora: ["kres", "ssj"],
optCorpora: ["kres", "ssj", "gigafida"],
optIndexes: [
{key: "besede", val: "words"},
{key: "udeleženske vloge", val: "functors"},