parent
81395890ab
commit
bf0970a90a
Binary file not shown.
Binary file not shown.
@ -1,15 +1,68 @@
|
||||
from Seqparser import Seqparser
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import json
|
||||
import datetime
|
||||
import hashlib
|
||||
from pymongo import MongoClient
|
||||
|
||||
# Author name recorded on every sense imported from SSKJ; also used as the
# prefix of the generated sense ids.
SSKJ_USER = "sskj2"


def build_sense_entries(jdata, timestamp):
    """Flatten parsed SSKJ data into documents for the ``senses`` collection.

    Args:
        jdata: Mapping of headword -> list of entries, as loaded from the
            SSKJ JSON file. Only ``entry[0]["senses"]`` is read for each
            headword; each sense is indexed as ``sense[0][0]`` (its kind
            tag) and ``sense[0][1]`` (its description text).
        timestamp: Value stored unchanged under ``"date"`` in every document.

    Returns:
        A list of dicts with the keys ``desc``, ``hw``, ``author``,
        ``sense_id`` and ``date``, in iteration order of ``jdata``.
    """
    db_entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            kind = sense[0][0]
            desc = sense[0][1]
            if kind == "razl":
                # "razl" descriptions carry a trailing ':' in the source
                # data; drop that last character.
                desc = desc[:-1]
            else:
                # Any other kind is kept as a prefix of the description.
                desc = kind + ": " + desc

            tmp_entry = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER,
            }
            # The id is derived from the dict *before* sense_id and date are
            # added, so it depends only on desc, hw and author.
            digest = hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()
            tmp_entry["sense_id"] = "{}-{}".format(SSKJ_USER, digest[:10])
            tmp_entry["date"] = timestamp
            db_entries.append(tmp_entry)
    return db_entries


if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The parser defines --sskj-json (dest ``sskj_json``); there is no
        # ``sskj_senses`` attribute on ``args``.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    if args.operation == "senses_to_db":
        # One timestamp shared by every document of this import run.
        tmp_dt = datetime.datetime.utcnow()
        with Path(args.sskj_json).open("r") as fp:
            jdata = json.load(fp)
        db_entries = build_sense_entries(jdata, tmp_dt)
        print(len(db_entries))

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        if db_entries:
            # insert_many() rejects an empty document list, so skip the call
            # when the input file yielded no senses.
            valdb.senses.insert_many(db_entries)
|
||||
|
Loading…
Reference in new issue