make sskj-senses

This commit is contained in:
voje 2019-04-28 01:03:44 +02:00
parent 81395890ab
commit bf0970a90a
7 changed files with 79 additions and 11 deletions

View File

@ -19,6 +19,7 @@ KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
# This file comes with the source code. Make sure you unpack it and name it right. # This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json" SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
# SSKJ senses JSON; passed as --sskj-json by the sskj-senses target.
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
OUTPUT = "db" OUTPUT = "db"
# OUTPUT = "file" # OUTPUT = "file"
@ -105,6 +106,7 @@ frontend-prod:
## Backend ## Backend
# runs once and exits before the app starts # runs once and exits before the app starts
# need to extract ./data/sskj_data.tar.gz first
backend-prepare-db: backend-prepare-db:
cd ./src/backend_flask; python3 app.py \ cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \ --config-file ./conf_files/dev_conf.yaml \
@ -121,3 +123,12 @@ backend-prod:
cd ./src/backend_flask; python3 app.py \ cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/prod_conf.yaml \ --config-file ./conf_files/prod_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
## add sskj senses to db (generated with pkg/seqparser)
# Runs seqparser's main.py with the "senses_to_db" operation, reading
# $(SSKJ_JSON) and connecting to $(DBADDR) as $(DB_USR_USER).
# This target runs a command and produces no file, so it is declared phony;
# otherwise a file named "sskj-senses" would make it appear up to date.
.PHONY: sskj-senses
sskj-senses:
	python3 ./src/pkg/seqparser/seqparser/main.py \
		--sskj-json $(SSKJ_JSON) \
		--operation "senses_to_db" \
		--dbaddr $(DBADDR) \
		--dbuser $(DB_USR_USER) \
		--dbpass $(DB_USR_PASS)

BIN
data/sskj_data.tar.gz Normal file

Binary file not shown.

Binary file not shown.

View File

@ -324,6 +324,8 @@ def api_get_functor_frames():
# SENSES ----------------------------. # SENSES ----------------------------.
# ssj_id is legacy notation, read
# it as general sentence_id
@app.route("/api/senses/get") @app.route("/api/senses/get")
def api_senses_get(): def api_senses_get():
@ -409,6 +411,8 @@ def api_senses_update():
ns["date"] = tmp_dt ns["date"] = tmp_dt
id_map[frontend_sense_id] = new_sense_id id_map[frontend_sense_id] = new_sense_id
print(ns)
# insert into db # insert into db
valdb[SENSES_COLL].insert(ns) valdb[SENSES_COLL].insert(ns)
@ -441,8 +445,8 @@ def _is_banned(hw):
elif (hw + " se") in sskj_wordlist["wordlist"]: elif (hw + " se") in sskj_wordlist["wordlist"]:
banned = False banned = False
if banned: if hw[-1] == "_":
log.debug("Banned headword: {}".format(hw)) log.debug("hw: {}, banned: {}".format(hw, banned))
return banned return banned
def prepare_app_index(): def prepare_app_index():

View File

@ -1,5 +1,5 @@
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
SSKJ_JSON = "./sskj.json" SSKJ_JSON = "./sskj_senses.json"
WORDLIST = "./wordlist.json" WORDLIST = "./wordlist.json"
gen_json_files: gen_json_files:

View File

@ -1,15 +1,68 @@
from Seqparser import Seqparser from Seqparser import Seqparser
import argparse import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient
SSKJ_USER = "sskj2"
def _sense_description(sense):
    """Build the description string for one raw sense record.

    Only ``sense[0][0]`` (a label) and ``sense[0][1]`` (the description text)
    are read. Records labelled "razl" keep just the text, minus a trailing
    ':'; every other record is rendered as "<label>: <text>".
    """
    label, text = sense[0][0], sense[0][1]
    if label == "razl":
        # These descriptions arrive with a trailing ':'. Strip it only when it
        # is really there, so a description without it does not lose a letter.
        return text[:-1] if text.endswith(":") else text
    return label + ": " + text


def _build_sense_entries(jdata, timestamp):
    """Turn the parsed SSKJ senses JSON into documents for the senses collection.

    Args:
        jdata: mapping of headword -> list; only ``entry[0]["senses"]`` of each
            value is read, and only the values of that mapping are used.
        timestamp: value stored under "date" in every document.

    Returns:
        A list of dicts with the keys "desc", "hw", "author", "sense_id", "date".
    """
    entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            doc = {
                "desc": _sense_description(sense),
                "hw": hw,
                "author": SSKJ_USER,
            }
            # The id is derived from exactly these three fields (in this
            # order), so it has to be computed before "sense_id"/"date" exist.
            digest = hashlib.sha256(str(doc).encode("utf-8")).hexdigest()[:10]
            doc["sense_id"] = "{}-{}".format(SSKJ_USER, digest)
            doc["date"] = timestamp
            entries.append(doc)
    return entries


if __name__ == "__main__":
    # Command-line entry point; --operation selects one of three jobs:
    #   gen_sskj_json  --sskj-html -> --sskj-json
    #   gen_wordlist   --sskj-json -> --wordlist
    #   senses_to_db   --sskj-json -> "senses" collection of the "valdb" database
    # Any other value (or none) does nothing.
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The parser defines --sskj-json (args.sskj_json); there is no
        # "sskj_senses" attribute on the parsed namespace.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    if args.operation == "senses_to_db":
        tmp_dt = datetime.datetime.utcnow()
        # Read explicitly as UTF-8 instead of relying on the locale default.
        with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
            jdata = json.load(fp)

        db_entries = _build_sense_entries(jdata, tmp_dt)
        print(len(db_entries))
        if not db_entries:
            # insert_many() rejects an empty document list; nothing to store.
            sys.exit()

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        # NOTE(review): plain insert with no uniqueness check visible here, so
        # running this operation twice presumably stores every sense twice.
        valdb.senses.insert_many(db_entries)