make sskj-senses

This commit is contained in:
2019-04-28 01:03:44 +02:00
parent 81395890ab
commit bf0970a90a
7 changed files with 79 additions and 11 deletions

View File

@@ -324,6 +324,8 @@ def api_get_functor_frames():
# SENSES ----------------------------.
# ssj_id is legacy notation, read
# it as general sentence_id
@app.route("/api/senses/get")
def api_senses_get():
@@ -409,6 +411,8 @@ def api_senses_update():
ns["date"] = tmp_dt
id_map[frontend_sense_id] = new_sense_id
print(ns)
# insert into db
valdb[SENSES_COLL].insert(ns)
@@ -441,8 +445,8 @@ def _is_banned(hw):
elif (hw + " se") in sskj_wordlist["wordlist"]:
banned = False
if banned:
log.debug("Banned headword: {}".format(hw))
if hw[-1] == "_":
log.debug("hw: {}, banned: {}".format(hw, banned))
return banned
def prepare_app_index():

View File

@@ -1,5 +1,5 @@
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
SSKJ_JSON = "./sskj.json"
SSKJ_JSON = "./sskj_senses.json"
WORDLIST = "./wordlist.json"
gen_json_files:

View File

@@ -1,15 +1,68 @@
from Seqparser import Seqparser
import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient
# Author name stamped on every sense imported from SSKJ; also the prefix of
# the generated sense ids.
SSKJ_USER = "sskj2"

# Values accepted by --operation.
OPERATIONS = ("gen_sskj_json", "gen_wordlist", "senses_to_db")


def build_sense_entries(jdata, timestamp):
    """Flatten the parsed SSKJ JSON into a list of sense documents.

    For every headword, the senses of its first entry
    (``entry[0]["senses"]``) are visited and the first item of each sense is
    read as a ``(label, description)`` pair.

    Args:
        jdata: Mapping headword -> list of entries, as loaded from the
            SSKJ JSON file.
        timestamp: Value stored under ``"date"`` in every document.

    Returns:
        A list of dicts with the keys ``desc``, ``hw``, ``author``,
        ``sense_id`` and ``date``.
    """
    db_entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            label = sense[0][0]
            desc = sense[0][1]
            if label == "razl":
                # These descriptions carry a trailing ':'; strip it, but
                # never chop a real character when it is absent.
                if desc.endswith(":"):
                    desc = desc[:-1]
            else:
                desc = label + ": " + desc

            # Key order matters: sense_id is derived from str() of this
            # dict, so reordering the keys would change every generated id.
            tmp_entry = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER
            }
            digest = hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()
            tmp_entry["sense_id"] = "{}-{}".format(SSKJ_USER, digest[:10])
            tmp_entry["date"] = timestamp
            db_entries.append(tmp_entry)
    return db_entries


def main(argv=None):
    """Run the operation selected with --operation.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    # Reject unknown/missing operations instead of silently doing nothing.
    aparser.add_argument(
        "--operation", type=str, choices=OPERATIONS, required=True
    )
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args(argv)

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        return

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The parser defines --sskj-json only; there is no `sskj_senses`
        # attribute on args.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        return

    # args.operation == "senses_to_db"
    tmp_dt = datetime.datetime.utcnow()
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
        jdata = json.load(fp)

    db_entries = build_sense_entries(jdata, tmp_dt)
    print(len(db_entries))
    if not db_entries:
        # insert_many() rejects an empty document list.
        return

    # db login
    client = MongoClient(
        "mongodb://{}".format(args.dbaddr),
        username=args.dbuser,
        password=args.dbpass,
        authSource="valdb",
        authMechanism='SCRAM-SHA-1'
    )
    try:
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)
    finally:
        client.close()


if __name__ == "__main__":
    main()

File diff suppressed because one or more lines are too long