make sskj-senses
This commit is contained in:
parent
81395890ab
commit
bf0970a90a
11
Makefile
11
Makefile
|
@ -19,6 +19,7 @@ KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
|
|||
|
||||
# This file comes with the source code. Make sure you unpack it and name it right.
|
||||
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
||||
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
||||
|
||||
OUTPUT = "db"
|
||||
# OUTPUT = "file"
|
||||
|
@ -105,6 +106,7 @@ frontend-prod:
|
|||
## Backend
|
||||
|
||||
# runs once and exits before the app starts
|
||||
# need to extract ./data/sskj_data.tar.gz first
|
||||
backend-prepare-db:
|
||||
cd ./src/backend_flask; python3 app.py \
|
||||
--config-file ./conf_files/dev_conf.yaml \
|
||||
|
@ -121,3 +123,12 @@ backend-prod:
|
|||
cd ./src/backend_flask; python3 app.py \
|
||||
--config-file ./conf_files/prod_conf.yaml \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
||||
|
||||
## add sskj senses to db (generated with pkg/seqparser)
|
||||
sskj-senses:
|
||||
python3 ./src/pkg/seqparser/seqparser/main.py \
|
||||
--sskj-json $(SSKJ_JSON) \
|
||||
--operation "senses_to_db" \
|
||||
--dbaddr $(DBADDR) \
|
||||
--dbuser $(DB_USR_USER) \
|
||||
--dbpass $(DB_USR_PASS)
|
||||
|
|
BIN
data/sskj_data.tar.gz
Normal file
BIN
data/sskj_data.tar.gz
Normal file
Binary file not shown.
Binary file not shown.
|
@ -324,6 +324,8 @@ def api_get_functor_frames():
|
|||
|
||||
|
||||
# SENSES ----------------------------.
|
||||
# ssj_id is legacy notation, read
|
||||
# it as general sentence_id
|
||||
|
||||
@app.route("/api/senses/get")
|
||||
def api_senses_get():
|
||||
|
@ -409,6 +411,8 @@ def api_senses_update():
|
|||
ns["date"] = tmp_dt
|
||||
id_map[frontend_sense_id] = new_sense_id
|
||||
|
||||
print(ns)
|
||||
|
||||
# insert into db
|
||||
valdb[SENSES_COLL].insert(ns)
|
||||
|
||||
|
@ -441,8 +445,8 @@ def _is_banned(hw):
|
|||
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
||||
banned = False
|
||||
|
||||
if banned:
|
||||
log.debug("Banned headword: {}".format(hw))
|
||||
if hw[-1] == "_":
|
||||
log.debug("hw: {}, banned: {}".format(hw, banned))
|
||||
return banned
|
||||
|
||||
def prepare_app_index():
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
|
||||
SSKJ_JSON = "./sskj.json"
|
||||
SSKJ_JSON = "./sskj_senses.json"
|
||||
WORDLIST = "./wordlist.json"
|
||||
|
||||
gen_json_files:
|
||||
|
|
|
@ -1,15 +1,68 @@
|
|||
from Seqparser import Seqparser
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import json
|
||||
import datetime
|
||||
import hashlib
|
||||
from pymongo import MongoClient
|
||||
|
||||
SSKJ_USER = "sskj2"
|
||||
|
||||
if __name__ == "__main__":
|
||||
aparser = argparse.ArgumentParser()
|
||||
aparser.add_argument("--sskj-html", type=str)
|
||||
aparser.add_argument("--sskj-json", type=str)
|
||||
aparser.add_argument("--wordlist", type=str)
|
||||
args = aparser.parse_args()
|
||||
aparser = argparse.ArgumentParser()
|
||||
aparser.add_argument("--sskj-html", type=str)
|
||||
aparser.add_argument("--sskj-json", type=str)
|
||||
aparser.add_argument("--wordlist", type=str)
|
||||
aparser.add_argument("--operation", type=str)
|
||||
aparser.add_argument("--dbaddr", type=str)
|
||||
aparser.add_argument("--dbuser", type=str)
|
||||
aparser.add_argument("--dbpass", type=str)
|
||||
args = aparser.parse_args()
|
||||
|
||||
sqp = Seqparser()
|
||||
# sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
|
||||
if args.operation == "gen_sskj_json":
|
||||
sqp = Seqparser()
|
||||
sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
|
||||
sys.exit()
|
||||
|
||||
sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
|
||||
# Operation "gen_wordlist": build the SSKJ wordlist file from the SSKJ JSON
# and exit without running any later operation.
if args.operation == "gen_wordlist":
    sqp = Seqparser()
    # The parser only defines --sskj-json (stored as args.sskj_json); there is
    # no "sskj_senses" attribute, so reading it raised AttributeError.
    sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
    sys.exit()
|
||||
|
||||
# Operation "senses_to_db": read the SSKJ senses JSON and insert one document
# per sense into the "senses" collection of the "valdb" Mongo database.
if args.operation == "senses_to_db":
    db_entries = []
    # One timestamp shared by every document created in this run.
    tmp_dt = datetime.datetime.utcnow()

    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding.
    with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
        jdata = json.load(fp)

    for hw, entry in jdata.items():
        # Only the first entry of each headword is read.
        for sense in entry[0]["senses"].values():
            sense_type = sense[0][0]
            desc = sense[0][1]
            if sense_type == "razl":
                # These descriptions carry a trailing ':'; strip it only when
                # it is really there so no other character is lost.
                if desc.endswith(":"):
                    desc = desc[:-1]
            else:
                desc = sense_type + ": " + desc

            tmp_entry = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER
            }
            # The id is derived from the entry content (desc, hw, author)
            # before "sense_id" and "date" are added to the dict.
            tmp_entry["sense_id"] = "{}-{}".format(
                SSKJ_USER,
                hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
            )
            tmp_entry["date"] = tmp_dt
            db_entries.append(tmp_entry)

    print(len(db_entries))

    # insert_many() rejects an empty document list, so skip the database
    # round-trip entirely when nothing was parsed.
    if db_entries:
        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        try:
            valdb = client.valdb
            valdb.senses.insert_many(db_entries)
        finally:
            # Release the connection even if the insert fails.
            client.close()
|
||||
|
|
Loading…
Reference in New Issue
Block a user