You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
cjvt-valency/src/pkg/seqparser/seqparser/main.py

69 lines
2.3 KiB

from Seqparser import Seqparser
import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient
# Author name stored with every imported sense; also the prefix of each sense_id.
SSKJ_USER = "sskj2"


def build_sense_entries(jdata, timestamp):
    """Convert parsed SSKJ JSON into sense documents for the database.

    For every headword only the first element of its entry list is read,
    and for every sense only its first ``[label, description]`` pair.

    Args:
        jdata: Mapping of headword -> list whose first item is a dict with a
            "senses" mapping (sense key -> list of [label, description]
            pairs).
        timestamp: Value stored unchanged under "date" in every document.

    Returns:
        A list of dicts with the keys "desc", "hw", "author", "sense_id" and
        "date" (in that insertion order).

    Raises:
        KeyError, IndexError: If an entry does not have the layout above.
    """
    db_entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            label = sense[0][0]
            desc = sense[0][1]
            if label == "razl":
                # "razl" descriptions carry a trailing ':' in the source
                # data; drop that last character.
                desc = desc[:-1]
            else:
                desc = label + ": " + desc

            tmp_entry = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER,
            }
            # The id is derived from the repr of exactly these three fields,
            # so it must be computed before "sense_id" and "date" are added.
            digest = hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()
            tmp_entry["sense_id"] = "{}-{}".format(SSKJ_USER, digest[:10])
            tmp_entry["date"] = timestamp
            db_entries.append(tmp_entry)
    return db_entries


def main():
    """Run the operation selected with ``--operation``.

    Supported operations:
        gen_sskj_json: calls Seqparser.html_to_verb_adj_json with
            --sskj-html and --sskj-json.
        gen_wordlist: calls Seqparser.generate_sskj_wordlist with
            --sskj-json and --wordlist.
        senses_to_db: reads --sskj-json and inserts the resulting sense
            documents into the ``senses`` collection of the ``valdb``
            database at --dbaddr.

    Any other value of --operation does nothing.
    """
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
    elif args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The original read args.sskj_senses, which no option defines and
        # therefore always raised AttributeError.
        # NOTE(review): --sskj-json is assumed to be the intended input;
        # confirm against generate_sskj_wordlist.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
    elif args.operation == "senses_to_db":
        # One aware UTC timestamp shared by every document of this import.
        tmp_dt = datetime.datetime.now(datetime.timezone.utc)
        # Explicit encoding so the result does not depend on the locale.
        with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
            jdata = json.load(fp)

        db_entries = build_sense_entries(jdata, tmp_dt)
        print(len(db_entries))
        if not db_entries:
            # insert_many() rejects an empty document list; nothing to do.
            return

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        try:
            valdb = client.valdb
            valdb.senses.insert_many(db_entries)
        finally:
            client.close()


if __name__ == "__main__":
    main()