"""Command-line driver around ``Seqparser`` for SSKJ data.

The ``--operation`` flag selects one of three actions:

* ``gen_sskj_json`` -- calls ``Seqparser.html_to_verb_adj_json`` with
  ``--sskj-html`` and ``--sskj-json``.
* ``gen_wordlist``  -- calls ``Seqparser.generate_sskj_wordlist`` with
  ``--sskj-senses`` and ``--wordlist``.
* ``senses_to_db``  -- reads the JSON file given by ``--sskj-json``, builds
  one sense document per sense and inserts them into the ``senses``
  collection of the ``valdb`` MongoDB database.
"""
import argparse
import datetime
import hashlib
import json
import sys
from pathlib import Path

from pymongo import MongoClient

from Seqparser import Seqparser

SSKJ_USER = "sskj2"


def _build_sense_entries(jdata, timestamp):
    """Turn the parsed SSKJ JSON mapping into a list of sense documents.

    Args:
        jdata: Mapping of headword -> list of entries. Only the first entry
            of each headword is read; its ``"senses"`` mapping is iterated.
            NOTE(review): each sense is indexed as ``sense[0][0]`` (label)
            and ``sense[0][1]`` (text) -- confirm against the JSON produced
            by ``Seqparser.html_to_verb_adj_json``.
        timestamp: Value stored under ``"date"`` in every document.

    Returns:
        A list of dicts with keys ``desc``, ``hw``, ``author``,
        ``sense_id`` and ``date``.
    """
    entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            label = sense[0][0]
            text = sense[0][1]
            if label == "razl":
                # For some reason these descriptions end with a ':'; drop it.
                desc = text[:-1]
            else:
                desc = label + ": " + text

            # The id is derived from str() of exactly these three keys in
            # this order; changing the dict would change every sense_id.
            doc = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER
            }
            digest = hashlib.sha256(str(doc).encode("utf-8")).hexdigest()
            doc["sense_id"] = "{}-{}".format(SSKJ_USER, digest[:10])
            doc["date"] = timestamp
            entries.append(doc)
    return entries


def _senses_to_db(args):
    """Load senses from ``args.sskj_json`` and insert them into MongoDB."""
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    timestamp = datetime.datetime.now(datetime.timezone.utc)

    # Explicit encoding: do not depend on the platform's locale default.
    with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
        jdata = json.load(fp)

    db_entries = _build_sense_entries(jdata, timestamp)
    print(len(db_entries))

    if not db_entries:
        # insert_many() rejects an empty document list, so there is nothing
        # to do (and no reason to connect) when no senses were found.
        return

    # db login
    client = MongoClient(
        "mongodb://{}".format(args.dbaddr),
        username=args.dbuser,
        password=args.dbpass,
        authSource="valdb",
        authMechanism='SCRAM-SHA-1'
    )
    try:
        client.valdb.senses.insert_many(db_entries)
    finally:
        client.close()


def main():
    """Parse command-line arguments and run the selected operation."""
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    # Previously missing: gen_wordlist reads args.sskj_senses, which raised
    # AttributeError because the option was never declared.
    aparser.add_argument("--sskj-senses", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        Seqparser().html_to_verb_adj_json(args.sskj_html, args.sskj_json)
    elif args.operation == "gen_wordlist":
        Seqparser().generate_sskj_wordlist(args.sskj_senses, args.wordlist)
    elif args.operation == "senses_to_db":
        _senses_to_db(args)
    else:
        # Fail loudly instead of silently doing nothing on a typo.
        aparser.error(
            "unknown --operation {!r}; expected one of: "
            "gen_sskj_json, gen_wordlist, senses_to_db".format(args.operation)
        )


if __name__ == "__main__":
    sys.exit(main())