make sskj-senses
This commit is contained in:
parent
81395890ab
commit
bf0970a90a
11
Makefile
11
Makefile
|
@ -19,6 +19,7 @@ KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
|
||||||
|
|
||||||
# This file comes with the source code. Make sure you unpack it and name it right.
|
# This file comes with the source code. Make sure you unpack it and name it right.
|
||||||
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
||||||
|
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
||||||
|
|
||||||
OUTPUT = "db"
|
OUTPUT = "db"
|
||||||
# OUTPUT = "file"
|
# OUTPUT = "file"
|
||||||
|
@ -105,6 +106,7 @@ frontend-prod:
|
||||||
## Backend
|
## Backend
|
||||||
|
|
||||||
# runs once and exits before the app starts
|
# runs once and exits before the app starts
|
||||||
|
# need to extract ./data/sskj_data.tar.gz first
|
||||||
backend-prepare-db:
|
backend-prepare-db:
|
||||||
cd ./src/backend_flask; python3 app.py \
|
cd ./src/backend_flask; python3 app.py \
|
||||||
--config-file ./conf_files/dev_conf.yaml \
|
--config-file ./conf_files/dev_conf.yaml \
|
||||||
|
@ -121,3 +123,12 @@ backend-prod:
|
||||||
cd ./src/backend_flask; python3 app.py \
|
cd ./src/backend_flask; python3 app.py \
|
||||||
--config-file ./conf_files/prod_conf.yaml \
|
--config-file ./conf_files/prod_conf.yaml \
|
||||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
||||||
|
|
||||||
|
## add sskj senses to db (generated with pkg/seqparser)
|
||||||
|
sskj-senses:
|
||||||
|
python3 ./src/pkg/seqparser/seqparser/main.py \
|
||||||
|
--sskj-json $(SSKJ_JSON) \
|
||||||
|
--operation "senses_to_db" \
|
||||||
|
--dbaddr $(DBADDR) \
|
||||||
|
--dbuser $(DB_USR_USER) \
|
||||||
|
--dbpass $(DB_USR_PASS)
|
||||||
|
|
BIN
data/sskj_data.tar.gz
Normal file
BIN
data/sskj_data.tar.gz
Normal file
Binary file not shown.
Binary file not shown.
|
@ -324,6 +324,8 @@ def api_get_functor_frames():
|
||||||
|
|
||||||
|
|
||||||
# SENSES ----------------------------.
|
# SENSES ----------------------------.
|
||||||
|
# ssj_id is legacy notation, read
|
||||||
|
# it as general sentence_id
|
||||||
|
|
||||||
@app.route("/api/senses/get")
|
@app.route("/api/senses/get")
|
||||||
def api_senses_get():
|
def api_senses_get():
|
||||||
|
@ -409,6 +411,8 @@ def api_senses_update():
|
||||||
ns["date"] = tmp_dt
|
ns["date"] = tmp_dt
|
||||||
id_map[frontend_sense_id] = new_sense_id
|
id_map[frontend_sense_id] = new_sense_id
|
||||||
|
|
||||||
|
print(ns)
|
||||||
|
|
||||||
# insert into db
|
# insert into db
|
||||||
valdb[SENSES_COLL].insert(ns)
|
valdb[SENSES_COLL].insert(ns)
|
||||||
|
|
||||||
|
@ -441,8 +445,8 @@ def _is_banned(hw):
|
||||||
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
||||||
banned = False
|
banned = False
|
||||||
|
|
||||||
if banned:
|
if hw[-1] == "_":
|
||||||
log.debug("Banned headword: {}".format(hw))
|
log.debug("hw: {}, banned: {}".format(hw, banned))
|
||||||
return banned
|
return banned
|
||||||
|
|
||||||
def prepare_app_index():
|
def prepare_app_index():
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
|
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
|
||||||
SSKJ_JSON = "./sskj.json"
|
SSKJ_JSON = "./sskj_senses.json"
|
||||||
WORDLIST = "./wordlist.json"
|
WORDLIST = "./wordlist.json"
|
||||||
|
|
||||||
gen_json_files:
|
gen_json_files:
|
||||||
|
|
|
@ -1,15 +1,68 @@
|
||||||
from Seqparser import Seqparser
|
from Seqparser import Seqparser
|
||||||
import argparse
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
from pymongo import MongoClient
|
||||||
|
|
||||||
|
SSKJ_USER = "sskj2"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
aparser = argparse.ArgumentParser()
|
aparser = argparse.ArgumentParser()
|
||||||
aparser.add_argument("--sskj-html", type=str)
|
aparser.add_argument("--sskj-html", type=str)
|
||||||
aparser.add_argument("--sskj-json", type=str)
|
aparser.add_argument("--sskj-json", type=str)
|
||||||
aparser.add_argument("--wordlist", type=str)
|
aparser.add_argument("--wordlist", type=str)
|
||||||
|
aparser.add_argument("--operation", type=str)
|
||||||
|
aparser.add_argument("--dbaddr", type=str)
|
||||||
|
aparser.add_argument("--dbuser", type=str)
|
||||||
|
aparser.add_argument("--dbpass", type=str)
|
||||||
args = aparser.parse_args()
|
args = aparser.parse_args()
|
||||||
|
|
||||||
|
if args.operation == "gen_sskj_json":
|
||||||
sqp = Seqparser()
|
sqp = Seqparser()
|
||||||
# sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
|
sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
|
if args.operation == "gen_wordlist":
|
||||||
|
sqp = Seqparser()
|
||||||
|
# argparse stores the --sskj-json option as args.sskj_json; no --sskj-senses option is defined, so args.sskj_senses would raise AttributeError.
sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
if args.operation == "senses_to_db":
|
||||||
|
db_entries = []
|
||||||
|
tmp_dt = datetime.datetime.utcnow()
|
||||||
|
with Path(args.sskj_json).open("r") as fp:
|
||||||
|
jdata = json.load(fp)
|
||||||
|
# print(jdata[list(jdata.keys())[201]])
|
||||||
|
for hw, entry in jdata.items():
|
||||||
|
for key, sense in entry[0]["senses"].items():
|
||||||
|
desc = sense[0][1]
|
||||||
|
if sense[0][0] == "razl":
|
||||||
|
desc = desc[:-1] # for some reason, descriptions contain a ':'
|
||||||
|
else:
|
||||||
|
desc = sense[0][0] + ": " + desc
|
||||||
|
tmp_entry = {
|
||||||
|
"desc": desc,
|
||||||
|
"hw": hw,
|
||||||
|
"author": SSKJ_USER
|
||||||
|
}
|
||||||
|
tmp_entry["sense_id"] = "{}-{}".format(
|
||||||
|
SSKJ_USER,
|
||||||
|
hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
|
||||||
|
)
|
||||||
|
tmp_entry["date"] = tmp_dt
|
||||||
|
db_entries.append(tmp_entry)
|
||||||
|
print(len(db_entries))
|
||||||
|
|
||||||
|
# db login
|
||||||
|
client = MongoClient(
|
||||||
|
"mongodb://{}".format(args.dbaddr),
|
||||||
|
username=args.dbuser,
|
||||||
|
password=args.dbpass,
|
||||||
|
authSource="valdb",
|
||||||
|
authMechanism='SCRAM-SHA-1'
|
||||||
|
)
|
||||||
|
valdb = client.valdb
|
||||||
|
valdb.senses.insert_many(db_entries)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user