forked from kristjan/cjvt-valency
parent
81395890ab
commit
bf0970a90a
Binary file not shown.
Binary file not shown.
@ -1,15 +1,68 @@
|
|||||||
"""Command-line entry point for SSKJ sense processing.

The ``--operation`` argument selects one of three actions:

* ``gen_sskj_json``  - call ``Seqparser.html_to_verb_adj_json`` with the
  ``--sskj-html`` and ``--sskj-json`` paths.
* ``gen_wordlist``   - call ``Seqparser.generate_sskj_wordlist`` with the
  ``--sskj-json`` and ``--wordlist`` paths.
* ``senses_to_db``   - read the ``--sskj-json`` file and insert one document
  per sense into the ``senses`` collection of the ``valdb`` MongoDB database.

Any other (or missing) operation does nothing.
"""
import argparse
import datetime
import hashlib
import json
import sys
from pathlib import Path

from pymongo import MongoClient

from Seqparser import Seqparser

# Value stored in the "author" field of every inserted sense and used as the
# prefix of its "sense_id".
SSKJ_USER = "sskj2"


def _build_sense_entries(jdata, timestamp):
    """Return the list of sense documents built from the loaded JSON.

    Args:
        jdata: Mapping of headword -> list of entries; only the first entry
            of each headword is read, and its ``"senses"`` mapping is
            iterated. Each sense is indexed as ``sense[0][0]`` (a label) and
            ``sense[0][1]`` (the description text).
        timestamp: Value stored in the ``"date"`` field of every document.

    Returns:
        A list of dicts with the keys ``desc``, ``hw``, ``author``,
        ``sense_id`` and ``date``.
    """
    entries = []
    for hw, entry in jdata.items():
        for sense in entry[0]["senses"].values():
            label = sense[0][0]
            desc = sense[0][1]
            if label == "razl":
                # Drop the last character: these descriptions end with ':'.
                desc = desc[:-1]
            else:
                desc = label + ": " + desc

            # NOTE: key order matters - the id below hashes str() of this
            # dict, so reordering the keys would change every sense_id.
            sense_doc = {
                "desc": desc,
                "hw": hw,
                "author": SSKJ_USER,
            }
            digest = hashlib.sha256(str(sense_doc).encode("utf-8")).hexdigest()
            sense_doc["sense_id"] = "{}-{}".format(SSKJ_USER, digest[:10])
            sense_doc["date"] = timestamp
            entries.append(sense_doc)
    return entries


def main():
    """Parse the command line and run the requested operation."""
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        return

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The parser defines no --sskj-senses option, so the wordlist is
        # generated from the --sskj-json file.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        return

    if args.operation == "senses_to_db":
        # One shared timestamp for the whole batch.
        batch_time = datetime.datetime.now(datetime.timezone.utc)

        # Read as UTF-8 explicitly rather than relying on the locale default.
        with Path(args.sskj_json).open("r", encoding="utf-8") as fp:
            jdata = json.load(fp)

        db_entries = _build_sense_entries(jdata, batch_time)
        print(len(db_entries))

        if not db_entries:
            # insert_many() rejects an empty document list; nothing to do.
            return

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)


if __name__ == "__main__":
    sys.exit(main())
|
||||||
|
Loading…
Reference in new issue