Compare commits

..

1 Commits
master ... dev

Author SHA1 Message Date
voje c563df31ba fixed reduce_0 functors sort ACT, PAT, other...
5 years ago

3
.gitignore vendored

@ -1,7 +1,4 @@
data/samples/
data/wordlist.json
data/sskj_senses.json
data/appindex.json
*egg-info/
*.pyc
src/frontend_vue/node_modules/

@ -1,16 +0,0 @@
# Image for the Flask backend, layered on top of the cjvt-python-env base image.
FROM cjvt-python-env
# Create the directory layout the application is copied into.
RUN mkdir -p /project/src/backend_flask
RUN mkdir -p /project/src/pkg
RUN mkdir -p /project/data
# Copy the backend sources and the local packages.
COPY src/backend_flask /project/src/backend_flask
COPY src/pkg /project/src/pkg
# Copy the word/functor index JSON into the data directory created above.
COPY data/appindex.json /project/data
# The entrypoint script goes to the filesystem root, the production config to /project.
COPY src/backend_flask/entrypoint.sh /.
COPY src/backend_flask/conf_files/prod_conf.yaml /project
# Run the entrypoint script through bash.
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]

@ -3,41 +3,21 @@
MAKE_ROOT = $(shell pwd)
### Input data
# I received ssj500k in one .xml file,
# kres is composed of many .xml files
# I generated srl tags for kres in separate .json files
# (for each kres.xml file there is a kres.json file with srl tags)
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
# This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
# for pre-generation the index of all headwords and functors
APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
OUTPUT = "db"
# OUTPUT = "file"
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017" # don't use localhost
N_CORES = 5
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30
# Some backend parameters can be found in conf file (see make backend)
# credentials from .gitignored file
# create it from env.default
include env.local
N_CORES = 3
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30
# Backend parameters found in conf file (see make backend)
export
.PHONY: python-env fill-database
@ -68,7 +48,6 @@ python-env:
python-env-install:
pip3 install -e src/pkg/cjvt-corpusparser/.
pip3 install -e src/pkg/valency/.
pip3 install -e src/pkg/seqparser/.
# from inside python-env container:
data/samples:
@ -105,46 +84,22 @@ frontend-dev:
frontend-prod:
cd src/frontend_vue/; $(MAKE) prod
build-frontend-prod:
cd src/frontend_vue/; $(MAKE) build-prod
## Backend
# runs once and exits before the app starts
# need to extract ./data/sskj_data.tar.gz first
backend-prepare-db:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--sskj-wordlist $(SSKJ_WORDLIST) \
--appindex-json $(APPINDEX_PATH) \
--prepare-db
backend-dev:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--appindex-json $(APPINDEX_PATH)
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
backend-prod-old:
backend-prod:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/prod_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--appindex-json $(APPINDEX_PATH)
build-backend-flask:
cd ./src/backend_flask; $(MAKE) build
## add sskj senses to db (generated with pkg/seqparser)
sskj-senses:
python3 ./src/pkg/seqparser/seqparser/main.py \
--sskj-json $(SSKJ_JSON) \
--operation "senses_to_db" \
--dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) \
--dbpass $(DB_USR_PASS)
deploy-prod-stack:
- docker network create val-backend
docker stack deploy -c production.yaml val
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)

@ -52,6 +52,8 @@ $ make fill-database-ssj
$ make fill-database-kres
# You can detach from the running process using Ctrl-p + Ctrl-q
# this is a long operation
# if running on a remote server, use nohup:
$ nohup make fill-database > fill-database.log 2>&1 &
@ -72,10 +74,6 @@ $ make python-env-install
# needs to be run once to modify a new database
$ make backend-prepare-db
# if you have the file prepared (sskj_senses.json), you can
# fill the database with some senses
$ make sskj-senses
# with debugger
$ make backend-dev
@ -83,99 +81,19 @@ $ make backend-dev
$ make backend-prod
```
API endpoints:
* GET word list (pre-cached)
* GET reduced frames (pre-cached)
* POST senses
* User auth logic
### Vue frontend (1 container)
Relies on Flask backend.
Before running `make`, you might need to set the correct api address.
Check `./src/frontend_vue/config/config_prod.json`.
bash
```
# $ make frontend-dev # development
# development
# ./config_dev.json
$ make frontend-dev # development
# production
# ./config_prod.json
$ make frontend-prod
```
App available on: `http://0.0.0.0:8080`.
## Production deployment
Prerequisite: machine with free ports 80 and 8084.
### Database
Either build the database from scratch (lengthy process) using the above instructions or just migrate the database from the faculty server (recommended).
Build container my-mongo:
```bash
# run once and destroy containers
$ make database-service
```
### Backend
Set database connection details in `/src/backend_flask/db_config.py`.
Change 'valuser' and 'valuserpass' to the database user.
```bash
mongodb://valuser:valuserpass@my_mongo/valdb
```
In the above line, replace `valuser` with the username and `valuserpass` with the password that was used to create the database tables (the values were set in the root Makefile).
You can also set the number of gunicorn workers in `/src/backend_flask/entrypoint.sh`.
In the line that starts `gunicorn`, set the `-w` (`--workers`) option; note that `-t` is gunicorn's worker timeout in seconds, not the number of workers.
Rule of thumb is 2x number of available CPU cores.
Build the backend container:
```bash
# From git root
$ make build-backend-flask
```
### Frontend
Set the server address (where the backend will be running) in `src/frontend_vue/config/config_prod.json`.
Build the `/dist` folder that contains the static app (we will be using Nginx to serve it).
```bash
# From git root
$ make build-frontend-prod
```
All set, now run the stack.
Stack configuration in `production.yaml`.
```bash
# From git root
$ make deploy-prod-stack
```
## Uploading a mongo dump
There's a 15GB mongo dump containing the fully processed kres and ssj data.
We can use that file to deploy our application.
With this database, we will need a minimum of 8GB ram to serve the app.
If the server is struggling, frontend will throw "Network errors".
Check `0.0.0.0:8081` and remove (or backup) the current example database `valdb`.
Run the stack with mongo port mapped:
(uncomment the lines in `production.yaml`)
```yml
ports:
- 27017:27017
```
Run a separate my-mongo container with the mounted data:
```bash
$ docker run -it --net host -v <local_dump_path>:/dumps my-mongo /bin/bash
```
Inside the container (edit the username and password):
```bash
$ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0.0:27017
```
After uploading, restart the stack with `27017` commented out.

@ -0,0 +1 @@
/home/kristjan/kres_data/payload/kres_json/

@ -0,0 +1 @@
/home/kristjan/kres_mount/kres_parsed/tei/

Binary file not shown.

File diff suppressed because one or more lines are too long

@ -2,7 +2,7 @@ version: '3.1'
services:
my_mongo:
my-mongo:
image: my-mongo
restart: always
ports:
@ -13,7 +13,7 @@ services:
volumes:
- ${HOME}/mongo_container/data/:/data/db
mongo_express:
mongo-express:
image: mongo-express
restart: always
ports:
@ -23,4 +23,4 @@ services:
ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
ME_CONFIG_MONGODB_SERVER: my_mongo
ME_CONFIG_MONGODB_SERVER: my-mongo

@ -1,26 +1,26 @@
FROM ubuntu:18.04
FROM ubuntu:16.04
RUN apt-get update --fix-missing
RUN apt-get install -y \
vim \
python3 \
python3-pip \
sshfs \
curl
RUN pip3 install --upgrade pip
sshfs
RUN pip3 install \
lxml \
pandas \
sklearn \
argparse \
pyyaml \
pathlib \
flask \
flask_cors \
pymongo \
flask-pymongo \
gunicorn
flask
RUN apt-get install -y \
curl
ENV PYTHONIOENCODING UTF-8
RUN pip3 install \
pyyaml \
flask_cors

@ -1,4 +1,4 @@
IMAGE_NAME="cjvt-python-env" # don't change, used in backend_flask/Makefile
IMAGE_NAME="cjvt-python-env"
CNNAME="python-env"
all: build run

@ -1,5 +0,0 @@
#!/bin/bash
# Placeholder entrypoint used for testing: prints a message and exits successfully.
echo "testing entrypoint."
# Command substitution around `exit 1`: only the subshell exits, the expansion is
# empty, and this script keeps running (no `set -e` is in effect here).
$(exit 1)
# Always report success to the caller.
exit 0

@ -1,6 +1,27 @@
### Credentials
MONGOEXPRESS_USER = mxuser
MONGOEXPRESS_PASS = mxuserpassword
DB_ADM_USER = valadmin
DB_ADM_PASS = valadminpass
DB_USR_USER = valuser
DB_USR_PASS = valuserpass
### Input data
# I received ssj500k in one .xml file,
# kres is composed of many .xml files
# I generated srl tags for kres in separate .json files
# (for each kres.xml file there is a kres.json file with srl tags)
# Use the files from /data/samples.tar.gz for a quick app build with a subset of data.
SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
# SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"

@ -0,0 +1,25 @@
# Environment specific Makefile parameters
# Copy this file and name it makefile_args
# makefile_args gets .gitignored
#
# NOTE: keep comments on their own lines. In a Make assignment, whitespace
# between the value and a trailing '#' comment becomes part of the value.

# Input corpora: pick either the bundled samples (commented out) or the
# links to the full data sets.
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"

# Where parsed data goes: "db" or "file"
OUTPUT = "db"
# OUTPUT = "file"

# if you're running this in docker, make sure to mount the volume
OUTDIR = "/tmp/three"
# don't use localhost
DBADDR = "0.0.0.0:27017"

# credentials from .gitignored file
# create it from env.default
include env.local

N_CORES = 5
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30

# Backend parameters found in conf file (see make backend)

@ -1,26 +0,0 @@
# frontend: serves the static files from /srv/dist on port 80
server {
    listen 80;
    server_name _;

    location / {
        root /srv/dist;
        index index.html index.htm;
    }

    # permanently redirect /home to the site root
    location /home {
        return 301 /;
    }
}

# backend: proxies port 8084 to the backend_flask service
server {
    listen 8084;
    server_name _;

    location / {
        # The conventional header name is X-Forwarded-For; the previous
        # "X-Forward-For" spelling is not what upstream applications look for.
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $http_host;
        proxy_pass http://backend_flask:8084;
    }
}

@ -1,43 +0,0 @@
# Stack definition with four services: MongoDB, mongo-express, the Flask
# backend and an nginx proxy.
# NOTE(review): database and mongo-express credentials below are hard-coded in
# this file — consider supplying them from the environment instead.
version: '3.1'
services:
# MongoDB; the host port mapping is commented out, port 27017 is only exposed.
my_mongo:
image: my-mongo
restart: always
# ports:
# - 27017:27017
expose:
- 27017
environment:
MONGO_INITDB_ROOT_USERNAME: valuser
MONGO_INITDB_ROOT_PASSWORD: valuserpass
volumes:
- ${HOME}/mongo_container/data/:/data/db
# mongo-express UI, published on port 8081 and pointed at the my_mongo service.
mongo_express:
image: mongo-express
restart: always
ports:
- 8081:8081
environment:
ME_CONFIG_BASICAUTH_USERNAME: test
ME_CONFIG_BASICAUTH_PASSWORD: test
ME_CONFIG_MONGODB_ADMINUSERNAME: valadmin
ME_CONFIG_MONGODB_ADMINPASSWORD: rolercoaster
ME_CONFIG_MONGODB_SERVER: my_mongo
# Flask backend; port 8084 is exposed but not published on the host.
backend_flask:
image: backend-flask
expose:
- 8084
# nginx; publishes ports 80 and 8084 and mounts its config plus the frontend dist folder.
proxy:
image: nginx
ports:
- 80:80
- 8084:8084
volumes:
- ./nginx.conf:/etc/nginx/conf.d/default.conf
- ./src/frontend_vue/dist:/srv/dist

@ -1,16 +0,0 @@
# Build and run the backend-flask Docker image.
IMG="backend-flask"
CNT="backend_flask"

# None of these targets produce a file of the same name.
.PHONY: clean run build build-cjvt-python-env

# Remove a previous container; '-' ignores the error when none exists.
clean:
	- docker rm -f $(CNT)

# Start a detached container on the host network and follow its logs.
run: clean build
	docker run -d --net host --name $(CNT) $(IMG)
	docker logs -f $(CNT)

# Build from two directories up, using Dockerfile-backend-flask found there.
# '&&' stops the recipe if the cd fails instead of building in the wrong place.
build: build-cjvt-python-env
	cd ../.. && docker build . -f Dockerfile-backend-flask -t $(IMG)

# Build the python-env image via its own Makefile.
build-cjvt-python-env:
	cd ../../dockerfiles/python-env && $(MAKE) build

@ -26,21 +26,23 @@ from email.mime.text import MIMEText
from copy import deepcopy as DC
from pathlib import Path
from pymongo import MongoClient
from flask_pymongo import PyMongo
import pymongo
import argparse
# some db collections
USERS_COLL = "users"
TOKENS_COLL = "usertokens"
SENSES_COLL = "senses"
SENSEMAP_COLL = "sensemap"
# pre-generated data (gui leftside word index)
CORPORA = ["ssj", "kres"]
app_index = None
log = logging.getLogger(__name__)
valdb = None
app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
app.config["CORPORA"] = ["ssj", "kres"]
app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000
# when running vuejs via webpack
# CORS(app)
@ -55,7 +57,7 @@ CORS(app)
@app.route("/api/dev")
def api_dev():
print("DEV")
cur = mongo.db.kres.find({"headwords": "nagovarjati"})
cur = valdb.kres.find({"headwords": "nagovarjati"})
frames = []
for ent in cur:
frames += frames_from_db_entry(ent)
@ -68,12 +70,12 @@ def api_dev():
@app.route("/api/words/<corpus>")
def api_words(corpus):
return json.dumps({
"sorted_words": app.config["app_index"][corpus]["words"], # todo - make corpus as arg
"sorted_words": app_index[corpus]["words"], # todo - make corpus as arg
})
@app.route("/api/functors/<corpus>")
def api_functors(corpus):
return json.dumps(app.config["app_index"][corpus]["functors"])
return json.dumps(app_index[corpus]["functors"])
# INDEX SELECTION -------------------^
@ -94,7 +96,7 @@ def api_register():
):
return "ERR"
email_hash = hashlib.sha256(email.encode("utf-8")).hexdigest()
existing = list(mongo.db.users.find({
existing = list(valdb[USERS_COLL].find({
"$or": [{"username": username}, {"email": email_hash}]
}))
if len(existing) > 0:
@ -105,7 +107,7 @@ def api_register():
password.encode("utf-8")).hexdigest(),
"email": email_hash
}
mongo.db.users.insert(entry)
valdb[USERS_COLL].insert(entry)
return "OK"
@ -117,7 +119,7 @@ def api_login():
password = data["password"]
hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()
db_user = list(mongo.db.users.find({
db_user = list(valdb[USERS_COLL].find({
"username": username,
"hpass": hpass
}))
@ -131,7 +133,7 @@ def api_login():
"date": datetime.datetime.utcnow(),
"token": token
}
mongo.db.usertokens.update(
valdb[TOKENS_COLL].update(
{"username": token_entry["username"]},
token_entry,
upsert=True
@ -174,7 +176,7 @@ def api_new_pass():
username = data["username"]
email = data["email"]
hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
db_res = list(mongo.db.users.find({
db_res = list(valdb[USERS_COLL].find({
"username": username,
"email": hemail
}))
@ -186,7 +188,7 @@ def api_new_pass():
string.ascii_letters + string.digits) for i in range(10)])
# update locally
hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
mongo.db.users.update(
valdb[USERS_COLL].update(
{
"username": username,
"email": hemail
@ -204,12 +206,12 @@ def token_to_username(token):
key = {
"token": token
}
res = list(mongo.db.usertokens.find(key))
res = list(valdb[TOKENS_COLL].find(key))
if len(res) != 1:
return None
username = res[0]["username"]
# update deletion interval
mongo.db.usertokens.update(
valdb[TOKENS_COLL].update(
key, {"$set": {"date": datetime.datetime.utcnow()}})
return username
@ -244,19 +246,18 @@ def api_get_frames():
RF = reduce_functions[rf_name]["f"]
corpus = request.args.get("cor")
if corpus not in app.config["CORPORA"]:
if corpus not in CORPORA:
return json.dumps({"error": "cor={kres,ssj}"})
cur = mongo.db[corpus].find({"headwords": hw})
cur = valdb[corpus].find({"headwords": hw})
frames = []
for ent in cur[:app.config["QUERY_LIMIT"]]:
for ent in cur:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# filter by relevant hw
frames = [x for x in frames if x.hw == hw]
ret_frames = RF(frames, mongo.db.sensemap)
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
json_ret = {"frames": []}
for frame in ret_frames:
@ -297,20 +298,19 @@ def api_get_functor_frames():
RF = reduce_functions[rf_name]["f"]
corpus = request.args.get("cor")
if corpus not in app.config["CORPORA"]:
if corpus not in CORPORA:
return json.dumps({"error": "cor={kres,ssj}"})
cur = mongo.db[corpus].find({"functors": functor})
cur = valdb[corpus].find({"functors": functor})
frames = []
for ent in cur[:app.config["QUERY_LIMIT"]]:
for ent in cur:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# filter by relevant functor
frames = [x for x in frames if functor in x.get_functors()]
# raw_frames = vallex.functors_index[functor] # TODO
ret_frames = RF(frames, mongo.db.sensemap)
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
ret_frames = _aggregate_by_hw(ret_frames)
json_ret = {"frames": []}
@ -322,17 +322,15 @@ def api_get_functor_frames():
# SENSES ----------------------------.
# ssj_id is legacy notation, read
# it as general sentence_id
@app.route("/api/senses/get")
def api_senses_get():
# returns senses and mapping for hw
hw = request.args.get("hw")
senses = list(mongo.db.senses.find({
senses = list(valdb[SENSES_COLL].find({
"hw": hw
}))
sense_map_query = list(mongo.db.sensemap.find({
sense_map_query = list(valdb[SENSEMAP_COLL].find({
"hw": hw
}))
# aggregation by max date possible on DB side
@ -409,10 +407,8 @@ def api_senses_update():
ns["date"] = tmp_dt
id_map[frontend_sense_id] = new_sense_id
print(ns)
# insert into db
mongo.db.senses.insert(ns)
valdb[SENSES_COLL].insert(ns)
# replace tmp_id with mongo's _id
for ssj_id, el in sense_map.items():
@ -427,34 +423,22 @@ def api_senses_update():
"date": datetime.datetime.utcnow()
}
# vallex.db["v2_sense_map"].update(key, data, upsert=True)
mongo.db.sensemap.insert(data)
valdb[SENSEMAP_COLL].insert(data)
return "OK"
# SENSES ----------------------------^
# APP PREFLIGHT ---------------------.
def _is_banned(hw):
    """Tell whether headword `hw` must be left out of the word index.

    A headword is banned when it is listed in app.config["BANNED_HEADWORDS"],
    or when neither `hw` nor `hw + " se"` appears in
    sskj_wordlist["wordlist"].
    """
    if hw in app.config["BANNED_HEADWORDS"]:
        return True
    known_words = sskj_wordlist["wordlist"]
    # Allowed only if the word itself or its " se" form is in the word list.
    return hw not in known_words and (hw + " se") not in known_words
def prepare_app_index(appindex_json, sskj_wordlist):
def prepare_app_index():
log.info("[*] preparing app_index")
# create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
for corpus in app.config["CORPORA"]:
tmp_app_index = {c: {} for c in CORPORA}
for corpus in CORPORA:
res_hws = {}
res_fns = {}
nentries = mongo.db[corpus].count()
idx = 0
for e in mongo.db[corpus].find({}):
for e in valdb[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
@ -469,10 +453,6 @@ def prepare_app_index(appindex_json, sskj_wordlist):
res_fns[fn] += 1
else:
res_fns[fn] = 1
idx += 1
if idx % 10000 == 0:
log.debug("indexing {}: {}/{}".format(
corpus, idx, nentries))
alphabetical = {}
for k, e in res_hws.items():
@ -482,47 +462,19 @@ def prepare_app_index(appindex_json, sskj_wordlist):
else:
alphabetical[fst] = [(k, e)]
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
for k, e in alphabetical.items():
alphabetical[k] = sorted(e, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
with Path(appindex_json).open("w") as fp:
json.dump(tmp_app_index, fp)
valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)
# APP PREFLIGHT ---------------------^
def init_wsgi(app):
    """Configure `app` from /project/prod_conf.yaml.

    Reads the first YAML document of the config file, turns debug off,
    sends INFO-level logging to the configured logfile and loads the
    app index JSON (path in config["appindex"]) into
    app.config["app_index"].
    """
    print("Initiating wsgi")
    with Path("/project/prod_conf.yaml").open("r") as conf_fp:
        config = list(yaml.safe_load_all(conf_fp))[0]

    app.debug = False
    logging.basicConfig(filename=config["logfile"], level=logging.INFO)

    # app index from the JSON file named in the config
    with Path(config["appindex"]).open("r") as index_fp:
        app.config["app_index"] = json.load(index_fp)

    log.info("[*] Starting app.py with config:\n{}".format(config))
# if we don't pass arguments, assume production environment (gunicorn)
if "gunicorn" in sys.argv[0]:
init_wsgi(app)
if __name__ == "__main__":
print("Starting app.py main()")
aparser = argparse.ArgumentParser(description="Arguments for app.py")
@ -531,10 +483,9 @@ if __name__ == "__main__":
aparser.add_argument("--dbuser", type=str)
aparser.add_argument("--dbpass", type=str)
aparser.add_argument("--dbaddr", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
aparser.add_argument("--appindex-json", type=str)
args = aparser.parse_args()
config = None
with Path(args.config_file).open("r") as fp:
config = list(yaml.safe_load_all(fp))[0]
@ -545,31 +496,25 @@ if __name__ == "__main__":
else:
logging.basicConfig(filename=logfile, level=logging.INFO)
"""
# db login
client = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="mongo.db",
authSource="valdb",
authMechanism='SCRAM-SHA-1'
)
valdb = client.mongo.db
"""
valdb = client.valdb
if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index(args.appindex_json, sskj_wordlist)
prepare_app_index()
sys.exit()
# app index from db
with Path(args.appindex_json).open("r") as fp:
app.config["app_index"] = json.load(fp)
app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"]
# log.info("[*] Starting app.py with config:\n%s".format(config))
log.info("[*] Starting app.py with config:\n{}".format(config))
app.run(host=str(config["host"]), port=int(config["port"]))

@ -4,4 +4,3 @@ port: 8084
host: localhost
logfile: "/var/log/valency_backend.log"
---

@ -1,5 +1,6 @@
---
debug: True
port: 8084
host: 0.0.0.0
logfile: "/var/log/valency_backend.log"
appindex: /project/data/appindex.json
---

@ -1,2 +0,0 @@
# MongoDB connection settings (presumably the "db_config" object loaded via
# app.config.from_object — TODO confirm).
# NOTE(review): the URI embeds a username and password in source control;
# consider loading them from the environment instead.
MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
MONGO_AUTH_SOURCE = 'admin'

@ -1,8 +0,0 @@
#!/bin/bash
# Entrypoint: install the project packages in editable mode, then serve the
# Flask app with gunicorn on port 8084.

# Stop at the first failing step instead of starting a half-installed app.
set -euo pipefail

pip3 install -e /project/src/pkg/cjvt-corpusparser/.
pip3 install -e /project/src/pkg/valency/.
pip3 install -e /project/src/pkg/seqparser/.

cd /project/src/backend_flask

# -w is the number of worker processes. The previous "-t 4" set gunicorn's
# worker timeout to 4 seconds, not the worker count.
# exec replaces the shell so gunicorn receives signals directly.
exec gunicorn -w 4 -b 0.0.0.0:8084 app:app

@ -0,0 +1,73 @@
# Deprecated: headword creation moved to be part of corpusparser,
# index creation moved to app.py as a preprocessing (with exit) step
CORPORA = ["kres", "ssj"]
if __name__ == "__main__":
valdb = None
def helper_tid_to_token(tid, tokens):
    """Return the first token in `tokens` whose "tid" equals `tid`.

    Args:
        tid: token id to look for.
        tokens: iterable of token dicts, each with a "tid" key.

    Returns:
        The matching token dict, or None when no token matches.
    """
    return next((tok for tok in tokens if tok["tid"] == tid), None)
# update entries (add headwords and fuctors for indexing)
for corpus in CORPORA:
for e in valdb[corpus].find({}):
if e["srl_links"] is None:
e["headwords"] = []
e["functors"] = []
else:
hw_tids = list(set([x["from"] for x in e["srl_links"]]))
hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
e["headwords"] = headwords
functors = list(set([x["afun"] for x in e["srl_links"]]))
e["functors"] = functors
valdb[corpus].save(e)
valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
# create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in CORPORA}
for corpus in CORPORA:
res_hws = {}
res_fns = {}
for e in valdb[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
if hw in res_hws:
res_hws[hw] += 1
else:
res_hws[hw] = 1
if "functors" not in e:
continue
for fn in e["functors"]:
if fn in res_fns:
res_fns[fn] += 1
else:
res_fns[fn] = 1
alphabetical = {}
for k, e in res_hws.items():
fst = k[0].lower()
if fst in alphabetical:
alphabetical[fst].append((k, e))
else:
alphabetical[fst] = [(k, e)]
for k, e in alphabetical.items():
alphabetical[k] = sorted(e, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)

@ -9,7 +9,8 @@ info:
echo "Pick either dev or prod."
clean:
- docker rm -f $(CONNAME)
- docker kill $(CONNAME)
- docker rm $(CONNAME)
build-container:
docker build . -t $(IMGNAME)
@ -18,12 +19,5 @@ dev: build-container clean
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/dev.sh
prod: build-container clean
docker run --restart always --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
node-env: clean
docker run --name $(CONNAME) -it -p 8080:8080 -v $(shell pwd):/src $(IMGNAME)
build-prod: build-container clean
docker run --rm -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh

@ -1,3 +1,3 @@
{
"api_addr": "http://193.2.76.103:8084"
"api_addr": "http://0.0.0.0:8084"
}

@ -1 +0,0 @@
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>vue_frontend</title><link href=/static/css/app.05a420a551b5bded5dfec6b370d3edca.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.5d3d2fd333c62579d227.js></script><script type=text/javascript src=/static/js/app.8538f7133303d3e391b2.js></script></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -1,2 +0,0 @@
!function(r){var n=window.webpackJsonp;window.webpackJsonp=function(e,u,c){for(var f,i,p,a=0,l=[];a<e.length;a++)i=e[a],o[i]&&l.push(o[i][0]),o[i]=0;for(f in u)Object.prototype.hasOwnProperty.call(u,f)&&(r[f]=u[f]);for(n&&n(e,u,c);l.length;)l.shift()();if(c)for(a=0;a<c.length;a++)p=t(t.s=c[a]);return p};var e={},o={2:0};function t(n){if(e[n])return e[n].exports;var o=e[n]={i:n,l:!1,exports:{}};return r[n].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=r,t.c=e,t.d=function(r,n,e){t.o(r,n)||Object.defineProperty(r,n,{configurable:!1,enumerable:!0,get:e})},t.n=function(r){var n=r&&r.__esModule?function(){return r.default}:function(){return r};return t.d(n,"a",n),n},t.o=function(r,n){return Object.prototype.hasOwnProperty.call(r,n)},t.p="/",t.oe=function(r){throw console.error(r),r}}([]);
//# sourceMappingURL=manifest.2ae2e69a05c33dfc65f8.js.map

@ -1 +0,0 @@
{"version":3,"sources":["webpack:///webpack/bootstrap d176f5affa434246605f"],"names":["parentJsonpFunction","window","chunkIds","moreModules","executeModules","moduleId","chunkId","result","i","resolves","length","installedChunks","push","Object","prototype","hasOwnProperty","call","modules","shift","__webpack_require__","s","installedModules","2","exports","module","l","m","c","d","name","getter","o","defineProperty","configurable","enumerable","get","n","__esModule","object","property","p","oe","err","console","error"],"mappings":"aACA,IAAAA,EAAAC,OAAA,aACAA,OAAA,sBAAAC,EAAAC,EAAAC,GAIA,IADA,IAAAC,EAAAC,EAAAC,EAAAC,EAAA,EAAAC,KACQD,EAAAN,EAAAQ,OAAoBF,IAC5BF,EAAAJ,EAAAM,GACAG,EAAAL,IACAG,EAAAG,KAAAD,EAAAL,GAAA,IAEAK,EAAAL,GAAA,EAEA,IAAAD,KAAAF,EACAU,OAAAC,UAAAC,eAAAC,KAAAb,EAAAE,KACAY,EAAAZ,GAAAF,EAAAE,IAIA,IADAL,KAAAE,EAAAC,EAAAC,GACAK,EAAAC,QACAD,EAAAS,OAAAT,GAEA,GAAAL,EACA,IAAAI,EAAA,EAAYA,EAAAJ,EAAAM,OAA2BF,IACvCD,EAAAY,IAAAC,EAAAhB,EAAAI,IAGA,OAAAD,GAIA,IAAAc,KAGAV,GACAW,EAAA,GAIA,SAAAH,EAAAd,GAGA,GAAAgB,EAAAhB,GACA,OAAAgB,EAAAhB,GAAAkB,QAGA,IAAAC,EAAAH,EAAAhB,IACAG,EAAAH,EACAoB,GAAA,EACAF,YAUA,OANAN,EAAAZ,GAAAW,KAAAQ,EAAAD,QAAAC,IAAAD,QAAAJ,GAGAK,EAAAC,GAAA,EAGAD,EAAAD,QAKAJ,EAAAO,EAAAT,EAGAE,EAAAQ,EAAAN,EAGAF,EAAAS,EAAA,SAAAL,EAAAM,EAAAC,GACAX,EAAAY,EAAAR,EAAAM,IACAhB,OAAAmB,eAAAT,EAAAM,GACAI,cAAA,EACAC,YAAA,EACAC,IAAAL,KAMAX,EAAAiB,EAAA,SAAAZ,GACA,IAAAM,EAAAN,KAAAa,WACA,WAA2B,OAAAb,EAAA,SAC3B,WAAiC,OAAAA,GAEjC,OADAL,EAAAS,EAAAE,EAAA,IAAAA,GACAA,GAIAX,EAAAY,EAAA,SAAAO,EAAAC,GAAsD,OAAA1B,OAAAC,UAAAC,eAAAC,KAAAsB,EAAAC,IAGtDpB,EAAAqB,EAAA,IAGArB,EAAAsB,GAAA,SAAAC,GAA8D,MAApBC,QAAAC,MAAAF,GAAoBA","file":"static/js/manifest.2ae2e69a05c33dfc65f8.js","sourcesContent":[" \t// install a JSONP callback for chunk loading\n \tvar parentJsonpFunction = window[\"webpackJsonp\"];\n \twindow[\"webpackJsonp\"] = function webpackJsonpCallback(chunkIds, moreModules, executeModules) {\n \t\t// add \"moreModules\" to the modules object,\n \t\t// then flag all \"chunkIds\" as 
loaded and fire callback\n \t\tvar moduleId, chunkId, i = 0, resolves = [], result;\n \t\tfor(;i < chunkIds.length; i++) {\n \t\t\tchunkId = chunkIds[i];\n \t\t\tif(installedChunks[chunkId]) {\n \t\t\t\tresolves.push(installedChunks[chunkId][0]);\n \t\t\t}\n \t\t\tinstalledChunks[chunkId] = 0;\n \t\t}\n \t\tfor(moduleId in moreModules) {\n \t\t\tif(Object.prototype.hasOwnProperty.call(moreModules, moduleId)) {\n \t\t\t\tmodules[moduleId] = moreModules[moduleId];\n \t\t\t}\n \t\t}\n \t\tif(parentJsonpFunction) parentJsonpFunction(chunkIds, moreModules, executeModules);\n \t\twhile(resolves.length) {\n \t\t\tresolves.shift()();\n \t\t}\n \t\tif(executeModules) {\n \t\t\tfor(i=0; i < executeModules.length; i++) {\n \t\t\t\tresult = __webpack_require__(__webpack_require__.s = executeModules[i]);\n \t\t\t}\n \t\t}\n \t\treturn result;\n \t};\n\n \t// The module cache\n \tvar installedModules = {};\n\n \t// objects to store loaded and loading chunks\n \tvar installedChunks = {\n \t\t2: 0\n \t};\n\n \t// The require function\n \tfunction __webpack_require__(moduleId) {\n\n \t\t// Check if module is in cache\n \t\tif(installedModules[moduleId]) {\n \t\t\treturn installedModules[moduleId].exports;\n \t\t}\n \t\t// Create a new module (and put it into the cache)\n \t\tvar module = installedModules[moduleId] = {\n \t\t\ti: moduleId,\n \t\t\tl: false,\n \t\t\texports: {}\n \t\t};\n\n \t\t// Execute the module function\n \t\tmodules[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n \t\t// Flag the module as loaded\n \t\tmodule.l = true;\n\n \t\t// Return the exports of the module\n \t\treturn module.exports;\n \t}\n\n\n \t// expose the modules object (__webpack_modules__)\n \t__webpack_require__.m = modules;\n\n \t// expose the module cache\n \t__webpack_require__.c = installedModules;\n\n \t// define getter function for harmony exports\n \t__webpack_require__.d = function(exports, name, getter) {\n \t\tif(!__webpack_require__.o(exports, name)) 
{\n \t\t\tObject.defineProperty(exports, name, {\n \t\t\t\tconfigurable: false,\n \t\t\t\tenumerable: true,\n \t\t\t\tget: getter\n \t\t\t});\n \t\t}\n \t};\n\n \t// getDefaultExport function for compatibility with non-harmony modules\n \t__webpack_require__.n = function(module) {\n \t\tvar getter = module && module.__esModule ?\n \t\t\tfunction getDefault() { return module['default']; } :\n \t\t\tfunction getModuleExports() { return module; };\n \t\t__webpack_require__.d(getter, 'a', getter);\n \t\treturn getter;\n \t};\n\n \t// Object.prototype.hasOwnProperty.call\n \t__webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };\n\n \t// __webpack_public_path__\n \t__webpack_require__.p = \"/\";\n\n \t// on error function for async loading\n \t__webpack_require__.oe = function(err) { console.error(err); throw err; };\n\n\n\n// WEBPACK FOOTER //\n// webpack/bootstrap d176f5affa434246605f"],"sourceRoot":""}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -4,4 +4,4 @@ cp ./config/config_prod.json ./config/config.json
npm install
npm run build
# http-server /src/dist
http-server /src/dist

File diff suppressed because it is too large Load Diff

@ -10,52 +10,50 @@
"build": "node build/build.js"
},
"dependencies": {
"ajv": "^6.10.0",
"axios": "^0.18.0",
"bootstrap-vue": "^2.0.0-rc.19",
"jquery": "^3.4.0",
"bootstrap-vue": "^2.0.0-rc.11",
"sha256": "^0.2.0",
"vue": "^2.6.10",
"vue-cookies": "^1.5.13",
"vue-router": "^3.0.6",
"vue": "^2.5.2",
"vue-cookies": "^1.5.6",
"vue-router": "^3.0.1",
"vue-spinner": "^1.0.3"
},
"devDependencies": {
"autoprefixer": "^7.1.2",
"babel-core": "^6.22.1",
"babel-helper-vue-jsx-merge-props": "^2.0.3",
"babel-loader": "^7.1.5",
"babel-loader": "^7.1.1",
"babel-plugin-syntax-jsx": "^6.18.0",
"babel-plugin-transform-runtime": "^6.22.0",
"babel-plugin-transform-vue-jsx": "^3.5.0",
"babel-preset-env": "^1.3.2",
"babel-preset-stage-2": "^6.22.0",
"chalk": "^2.4.2",
"copy-webpack-plugin": "^4.6.0",
"css-loader": "^2.1.1",
"chalk": "^2.0.1",
"copy-webpack-plugin": "^4.0.1",
"css-loader": "^0.28.0",
"extract-text-webpack-plugin": "^3.0.0",
"file-loader": "^1.1.4",
"friendly-errors-webpack-plugin": "^1.6.1",
"html-webpack-plugin": "^2.30.1",
"node-notifier": "^5.4.0",
"node-notifier": "^5.1.2",
"optimize-css-assets-webpack-plugin": "^3.2.0",
"ora": "^1.2.0",
"portfinder": "^1.0.20",
"portfinder": "^1.0.13",
"postcss-import": "^11.0.0",
"postcss-loader": "^2.1.6",
"postcss-loader": "^2.0.8",
"postcss-url": "^7.2.1",
"rimraf": "^2.6.3",
"semver": "^5.7.0",
"rimraf": "^2.6.0",
"semver": "^5.3.0",
"shelljs": "^0.7.6",
"uglifyjs-webpack-plugin": "^1.3.0",
"url-loader": "^1.1.2",
"vue-loader": "^13.7.3",
"uglifyjs-webpack-plugin": "^1.1.1",
"url-loader": "^0.5.8",
"vue-loader": "^13.3.0",
"vue-style-loader": "^3.0.1",
"vue-template-compiler": "^2.6.10",
"vue-template-compiler": "^2.5.2",
"webpack": "^3.6.0",
"webpack-bundle-analyzer": "^3.3.2",
"webpack-dev-server": "^2.11.5",
"webpack-merge": "^4.2.1"
"webpack-bundle-analyzer": "^2.9.0",
"webpack-dev-server": "^2.9.1",
"webpack-merge": "^4.1.0"
},
"engines": {
"node": ">= 6.0.0",

@ -7,25 +7,3 @@ export default {
name: 'App',
}
</script>
<style>
body {
font-family: cambria;
}
.ulred {
color: #b71511;
color: rgb(183,21,17);
}
.lmenu td {
color: #9e9e9e;
}
.redlinks a {
color: #9e9e9e;
}
.redlinks a:hover {
color: #b71511;
}
.text-secondary {
color: #9e9e9e !important;
}
</style>

@ -6,12 +6,7 @@
<div class="col-sm-7">
<div class="row">
<div class="col-sm-12">
<span v-if="frameData.sentences.length < frameData.sentence_count">
št. povedi: {{ frameData.sentence_count }} (prikazanih {{ frameData.sentences.length }})
</span>
<span v-else>
št. povedi: {{ frameData.sentences.length }}
</span>
št. povedi: {{ frameData.sentences.length }}
</div>
</div>

@ -1,17 +1,20 @@
<template>
<div>
<p
v-if="this.$root.store.api_error !== null"
class="text-warning"
>
api_error: {{ this.$root.store.api_error }}
</p>
<Nav></Nav>
<div class="my-home container-fluid">
<div class="row">
<div id="search" class="col-sm-2 border-right fill" :key=this.$root.store.indexReloader>
<div id="serach" class="col-sm-2 border-right fill" :key=this.$root.store.indexReloader>
<LWords
v-if="this.$root.store.selIndex.val === 'words'"></LWords>
<LFunctors v-else></LFunctors>
</div>
<div class="col-sm-10">
<p class="text-danger" v-if="this.$root.store.api_error != null">
{{ this.$root.store.api_error }}
</p>
<router-view></router-view>
</div>
</div>

@ -1,5 +1,5 @@
<template>
<div class="redlinks">
<div>
<table>
<tr v-for="functor in functors">
<td><a href="#" v-on:click="selectFunctor(functor)">{{ functor[0] }}</a></td>

@ -1,5 +1,5 @@
<template>
<div class="redlinks">
<div>
<select v-model="selectedLetter">
<option v-for="letter in alphabet" :value="letter">
{{ letter.toUpperCase() }} ({{ getNumWords(letter) }})

@ -1,5 +1,5 @@
<template>
<div class="redlinks">
<div>
<div class="col-sm-2">
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
</div>

@ -1,11 +1,6 @@
<template>
<!--in case of error-->
<div v-if="this.$root.store.api_error != null">
</div>
<!--load mode-->
<div v-else-if="state === 'loading'">
<div v-if="show_loader">
<pulse-loader :color="loader_color"></pulse-loader>
</div>
@ -81,11 +76,16 @@ export default {
},
state: "loading", // editing, normal
request_reload: false,
loader_color: "#b71511",
loader_color: "#007bff",
}},
created: function () {
this.reload()
},
computed: {
show_loader: function () {
return this.state === "loading" && this.$root.store.api_error !== null
}
},
watch: {
hw: function () {
this.reload()
@ -118,7 +118,6 @@ export default {
}
}
var component = this
component.state = "loading"
this.$http.get(
this.$root.store.api_addr +
"/api/functor-frames" +
@ -132,7 +131,6 @@ export default {
})
.catch(function(error) {
component.$root.store.api_error = error
component.state = "error"
})
},
getFrames: function (hw, reduce_fun=null) {
@ -151,7 +149,6 @@ export default {
}
}
var component = this
component.state = "loading"
this.$http.get(
this.$root.store.api_addr + "/api/frames" +
"?hw=" + hw + "&rf=" + reduce_fun +
@ -164,7 +161,6 @@ export default {
})
.catch(function(error) {
component.$root.store.api_error = error
component.state = "error"
})
},
buildSentences: function () {

@ -1,11 +1,8 @@
<template>
<nav>
<b-navbar id="nav-red-bg" toggleable="md" type="light" variant="light">
<b-navbar toggleable="md" type="light" variant="light">
<b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
<!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
<b-navbar-brand class=cursorpointer v-on:click="goHome">
VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV
</b-navbar-brand>
<b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand>
<b-collapse is-nav id="nav_collapse">
<b-navbar-nav>
@ -103,25 +100,7 @@ export default {
this.$router.push({
name: "Home"
})
},
goHome() {
this.$router.replace({path: "/home"})
}
}
}
</script>
<style>
#nav-red-bg {
background-color: rgb(183,21,17,0.9) !important;
}
nav a {
color: white;
}
nav a:hover {
color: white;
}
.cursorpointer {
cursor: pointer;
}
</style>
</script>

@ -1,5 +1,5 @@
<template>
<div class=redlinks>
<div>
<div class="col-sm-2">
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
</div>

@ -1,5 +1,5 @@
<template>
<div class="redlinks">
<div>
<div class="col-sm-2">
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
</div>
@ -7,15 +7,6 @@
<div class="alert alert-danger" v-if="error">
<p>{{ error }}</p>
</div>
<div class="form-group">
<input
type="email"
class="form-control"
placeholder="e-pošta"
v-model="credentials.email"
autocomplete="off"
>
</div>
<div class="form-group">
<input
type="text"
@ -25,6 +16,15 @@
autocomplete="off"
>
</div>
<div class="form-group">
<input
type="email"
class="form-control"
placeholder="e-pošta"
v-model="credentials.email"
autocomplete="off"
>
</div>
<div class="form-group">
<input
type="password"
@ -38,7 +38,7 @@
<input
type="password"
class="form-control js-login__password "
placeholder="Ponovite geslo"
placeholder="Ponovite geslo."
v-model="credentials.snd_password"
autocomplete="off"
>

@ -1,9 +0,0 @@
# Location of the SSKJ2 HTML dump. This is a machine-specific path; override it
# on the command line: make gen_json_files SSKJ_HTML=/path/to/sskj2_v1.html
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
# Output files. Relative paths are resolved inside seqparser/ because the
# recipe changes into that directory before running main.py.
SSKJ_JSON = "./sskj_senses.json"
WORDLIST = "./wordlist.json"

.PHONY: gen_json_files
# main.py only acts when --operation is given, so each step names its
# operation explicitly: first HTML -> senses JSON, then JSON -> wordlist.
# `&&` (instead of `;`) stops the recipe if the `cd` fails.
gen_json_files:
	cd seqparser && python3 main.py \
		--operation=gen_sskj_json \
		--sskj-html=$(SSKJ_HTML) \
		--sskj-json=$(SSKJ_JSON)
	cd seqparser && python3 main.py \
		--operation=gen_wordlist \
		--sskj-json=$(SSKJ_JSON) \
		--wordlist=$(WORDLIST)

@ -1,313 +0,0 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC
from pathlib import Path
# Match sense ordinals (1., 2., ...)
rord = re.compile(r"^ *[0-9]+\. *$")
# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)
def d_time(fun):
    """Decorator that prints the wall-clock duration of each call to ``fun``.

    The decorated function's return value is passed through to the caller
    (the previous version discarded it, so every decorated function
    returned None). If ``fun`` raises, the exception propagates and no
    timing line is printed.
    """
    # Local import: this file does not import functools at module level.
    from functools import wraps

    @wraps(fun)  # keep fun.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        tstart = time()
        result = fun(*args, **kwargs)
        duration = time() - tstart
        print("Function {} ran for {:.2f} s.".format(
            fun.__name__, duration))
        return result
    return wrapper
class Seqparser:
    """Parser for the sskj2 HTML dump.

    Every input line is parsed independently with BeautifulSoup; only
    entries recognised as verbs ("G") or adjectives ("P") are kept.
    Results are written either as JSON (``html_to_verb_adj_json``,
    ``generate_sskj_wordlist``) or through the older pickle pipeline
    (``html_to_raw_pickle`` -> ``raw_pickle_to_parsed_pickle``).
    """

    def __init__(self, sskj_file=None):
        # The original signature was `__init__(sskj_file)` without `self`;
        # `Seqparser()` only worked because the instance got bound to that
        # parameter. `sskj_file` is accepted but not used.
        pass

    @d_time
    def html_to_verb_adj_json(self, infile, outfile):
        """Parse `infile` line by line and dump {izt_clean: [entry, ...]} to `outfile` as JSON."""
        out_dict = defaultdict(list)
        with Path(infile).open("rb") as fp:
            for line in fp:
                data = self.parse_line(line)
                if data is None:
                    continue
                out_dict[data["izt_clean"]].append(data)
        with Path(outfile).open("w") as fp:
            json.dump(dict(out_dict), fp)

    @d_time
    def generate_sskj_wordlist(self, in_json_file, out_wordlist):
        """Write {"wordlist": [keys of in_json_file]} to `out_wordlist` as JSON."""
        with Path(in_json_file).open("r") as fp:
            jdata = json.load(fp)
        wordlist = list(jdata.keys())
        with Path(out_wordlist).open("w") as fp:
            json.dump({"wordlist": wordlist}, fp)

    # main functions
    def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
        """Parse the HTML dump and store the entries as a pickled JSON string."""
        entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
        print("entries len: " + str(len(entries)))
        with open(raw_pickle_filepath, "wb") as f:
            # The entries are JSON-encoded first, then that string is pickled;
            # load_raw_pickle() reverses both steps.
            tmpstr = json.dumps(dict(entries))
            pickle.dump(tmpstr, f)

    # debugging
    def raw_pickle_to_parsed_pickle(
        self, raw_pickle_filepath, parsed_pickle_filepath,
        se_list_filepath
    ):
        """Turn the raw pickle into the reorganized per-headword pickle.

        Also writes the list produced by gen_se_list() to `se_list_filepath`.
        """
        data = self.load_raw_pickle(raw_pickle_filepath)
        print("raw_pickle data len: " + str(len(data)))
        se_list = self.gen_se_list(data)
        print("se_list len: " + str(len(se_list)))
        with open(se_list_filepath, "wb") as f:
            pickle.dump(se_list, f)
        data1 = self.remove_se(data)
        data2 = self.reorganize(data1, se_list)
        print("data2 len: " + str(len(data2.keys())))
        with open(parsed_pickle_filepath, "wb") as f:
            pickle.dump(data2, f)

    # helper html reading functions
    def parse_file(self, path, f_parse_line):
        """Apply `f_parse_line` to every line of `path`.

        Returns a defaultdict mapping izt_clean -> list of parsed entries;
        lines for which `f_parse_line` returns None are skipped.
        """
        tstart = time()
        entries = defaultdict(list)
        with open(path, "r") as f:
            for line in f:
                data = f_parse_line(line)
                if data is not None:
                    entries[data["izt_clean"]].append(data)
        print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
        return entries

    def parse_line(self, line):
        """Parse one line of the HTML dump.

        Returns a dict with keys "izt" (headword), "izt_clean" (headword
        with accents removed, lower-cased, "_" appended for adjectives) and
        "senses" ({sense ordinal: [(type, text), ...]} where type is "razl"
        or "sopo"). Returns None when the line is not recognised as a verb
        or adjective entry.
        """
        def helper_bv_set(g_or_p):
            # Record the word class; a conflicting earlier value is only
            # reported (the line is printed), then overwritten.
            if g_or_p not in ["G", "P"]:
                print("Err g_or_p.")
                raise SystemExit(1)
            if data.get("bv") is not None:
                if data["bv"] != g_or_p:
                    print(str(line))
            data["bv"] = g_or_p

        data = {
            "izt": "",
            "izt_clean": "",
            "senses": defaultdict(list)
        }
        soup = BS(line, "html.parser")

        current_sense_id = "0"
        for span in soup.find_all("span"):

            # sense id: a span whose text is just an ordinal ("1.", "2.", ...)
            if span.string is not None:
                rmatch = rord.match(span.string)
                if rmatch is not None:
                    current_sense_id = rmatch.group().strip()

            title = span.attrs.get("title")
            if title is None:
                continue
            title = title.lower()

            # only verbs and adjectives
            if "glagol" in title:
                helper_bv_set("G")
                data["bv_full"] = title
            elif "pridevn" in title:
                helper_bv_set("P")
                data["bv_full"] = title

            # headword
            if title == "iztočnica":
                # .string is None when the span contains nested markup;
                # fall back to the concatenated text instead of crashing.
                headword = span.string
                if headword is None:
                    headword = span.get_text()
                data["izt"] = headword
                data["izt_clean"] = headword.translate(transtab).lower()

            # sense description
            if title == "razlaga" and span.string is not None:
                data["senses"][current_sense_id].append(
                    ("razl", span.string))
                if "pridevnik od" in span.string:
                    helper_bv_set("P")

            # synonym: text of the first link inside the span, if any
            if title == "sopomenka":
                anchors = span.find_all("a")
                if anchors and anchors[0].string is not None:
                    data["senses"][current_sense_id].append(
                        ("sopo", anchors[0].string))

        # save verbs and adjectives only
        if data.get("bv") not in ("G", "P"):
            return None

        # sanity check: adjectives are not expected to contain " se"
        if data["bv"] == "P" and " se" in data["izt_clean"]:
            print(data)
            raise SystemExit(1)

        # append _ to adjective keywords
        if data["bv"] == "P":
            data["izt_clean"] = data["izt_clean"] + "_"

        # cleanup: the word-class markers are internal to parsing
        del data["bv"]
        if "bv_full" in data:
            del data["bv_full"]

        return data

    # helper functions
    def load_raw_pickle(self, raw_pickle_filepath):
        """Load the dict written by html_to_raw_pickle()."""
        # NOTE(review): pickle.load must only be used on files this tool
        # produced itself; never point it at untrusted input.
        with open(raw_pickle_filepath, "rb") as f:
            tmpstr = pickle.load(f)
        return json.loads(tmpstr)

    def helper_loop(self, data, fnc):
        """Call `fnc` on every element of every list in `data`."""
        for k, lst in data.items():
            for el in lst:
                fnc(el)

    def gen_se_list(self, data):
        """Return the sorted base headwords that occur only with " se".

        Collects every izt_clean containing " se", takes the part before
        " se", and drops each base form that also exists as an entry of
        its own.
        """
        def fnc1(el):
            ic = el["izt_clean"]
            if " se" in ic:
                se_list.append(ic)

        def fnc2(el):
            ic = el["izt_clean"]
            if ic in se_pruned:
                se_pruned.remove(ic)

        # hw entries that only exist with " se"
        se_list = []
        self.helper_loop(data, fnc1)
        se_pruned = set([hw.split(" se")[0] for hw in se_list])
        self.helper_loop(data, fnc2)
        return sorted(list(se_pruned))

    def remove_se(self, data):
        """Return a copy of `data` re-keyed with " se" cut off each izt_clean."""
        def fnc1(el):
            nel = DC(el)
            ic = nel["izt_clean"]
            if " se" in ic:
                nic = ic.split(" se")[0]
                nel["izt_clean"] = nic
            data_new[nel["izt_clean"]].append(nel)

        data_new = defaultdict(list)
        self.helper_loop(data, fnc1)
        return dict(data_new)

    def reorganize(self, data, se_list):
        """Build one object per headword: {"hw", "has_se", "senses": [...]}.

        Each sense carries "homonym_id", "sense_id" ("<ordinal>-<subsense>"),
        "sense_type" and "sense_desc".
        """
        # some hw entries have several headwords,
        # some senses have subsenses
        # index everything, make 1 object per hw

        def helper_prune(sense_str):
            # remove space padding
            sense_str = sense_str.strip()
            # empty or one-character strings are returned as they are
            # (an empty string used to raise IndexError below)
            if len(sense_str) <= 1:
                return sense_str
            # remove banned characters from string ending
            banned = ": ; . , - ! ?".split(" ")
            if sense_str[-1] in banned:
                return sense_str[:-1]
            return sense_str

        # set membership instead of scanning the list once per headword
        se_lookup = set(se_list)

        data_new = {}
        for k, lst in data.items():
            new_el = {
                "hw": k,
                "has_se": k in se_lookup,
                "senses": []
            }

            # a single entry gets homonym_id 0, several entries get 1, 2, ...
            if len(lst) == 1:
                homonym_id = -1
            else:
                homonym_id = 0

            # loop homonyms
            for el in lst:
                homonym_id += 1
                # loop top lvl sense ids
                for sense_id, sens_lst in el["senses"].items():
                    # loop subsenses
                    for i, sens in enumerate(sens_lst):
                        nsid = sense_id.split(".")[0]
                        if len(sens_lst) == 1:
                            nsid += "-0"
                        else:
                            nsid += ("-" + str(i + 1))
                        new_sense = {
                            "homonym_id": homonym_id,
                            # sense_id: sense_id-subsense_id
                            "sense_id": nsid,
                            "sense_type": sens[0],
                            "sense_desc": helper_prune(sens[1]),
                        }
                        new_el["senses"].append(new_sense)

            hw = new_el["hw"]
            if hw in data_new:
                print("Shouldn't be here.")
                print(new_el)
                raise SystemExit(1)
            data_new[hw] = DC(new_el)

        # check: report senses without a description
        for hw, el in data_new.items():
            for sens in el["senses"]:
                if sens["sense_desc"] is None:
                    print(sens)

        return data_new
def plst(lst):
    """Print every element of `lst` on its own line."""
    for item in iter(lst):
        print(item)
if __name__ == "__main__":
    # Manual driver for the pickle pipeline; the two stages are switched
    # on and off by the flags below.
    run_html_to_raw = True
    run_raw_to_parsed = False

    data_root = "../../../data"
    html_path = data_root + "/sskj/sskj2_v1.html"
    raw_path = data_root + "/tmp_pickles/raw_sskj.pickle"
    parsed_path = data_root + "/no_del_pickles/sskj_senses.pickle"
    se_path = data_root + "/no_del_pickles/se_list.pickle"

    parser = Seqparser()

    if run_html_to_raw:
        print("html_to_raw_pickle({}, {})".format(
            html_path, raw_path))
        print("Big file, this might take a while (2 min).")
        started = time()
        parser.html_to_raw_pickle(html_path, raw_path)
        print("Finished in {:.2f}.".format(time() - started))

    if run_raw_to_parsed:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_path, parsed_path, se_path))
        started = time()
        parser.raw_pickle_to_parsed_pickle(
            raw_path, parsed_path, se_path)
        print("Finished in {:.2f}.".format(time() - started))

    print("Done.")

@ -1,68 +0,0 @@
from Seqparser import Seqparser
import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient
SSKJ_USER = "sskj2"
if __name__ == "__main__":
    # Command-line entry point. --operation selects one of:
    #   gen_sskj_json : --sskj-html -> --sskj-json
    #   gen_wordlist  : --sskj-json -> --wordlist
    #   senses_to_db  : --sskj-json -> `senses` collection of the `valdb` database
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    elif args.operation == "gen_wordlist":
        sqp = Seqparser()
        # The senses JSON is passed as --sskj-json; argparse defines no
        # `sskj_senses` attribute, so the old lookup raised AttributeError.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    elif args.operation == "senses_to_db":
        db_entries = []
        tmp_dt = datetime.datetime.utcnow()
        with Path(args.sskj_json).open("r") as fp:
            jdata = json.load(fp)
            # print(jdata[list(jdata.keys())[201]])
            for hw, entry in jdata.items():
                # Only the first entry per headword and the first item of
                # each sense are used.
                for key, sense in entry[0]["senses"].items():
                    desc = sense[0][1]
                    if sense[0][0] == "razl":
                        desc = desc[:-1]  # for some reason, descriptions contain a ':'
                    else:
                        desc = sense[0][0] + ": " + desc
                    tmp_entry = {
                        "desc": desc,
                        "hw": hw,
                        "author": SSKJ_USER
                    }
                    # The id is derived from the entry content (desc, hw,
                    # author) before sense_id and date are added.
                    tmp_entry["sense_id"] = "{}-{}".format(
                        SSKJ_USER,
                        hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
                    )
                    tmp_entry["date"] = tmp_dt
                    db_entries.append(tmp_entry)
        print(len(db_entries))

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)

    else:
        # A missing or misspelled --operation used to exit silently with
        # status 0 without doing anything; report it instead.
        aparser.error(
            "--operation must be one of: gen_sskj_json, gen_wordlist, senses_to_db"
        )

@ -1,11 +0,0 @@
from setuptools import setup
# Packaging metadata for the seqparser package, collected in one mapping
# and handed to setuptools in a single call.
_PACKAGE_INFO = {
    "name": 'seqparser',
    "version": '0.0.1',
    "description": 'Parser for sskj2 html dump.',
    "author": 'Kristjan Voje',
    "author_email": 'kristjan.voje@gmail.com',
    "license": 'MIT',
    "packages": ['seqparser'],
}

setup(**_PACKAGE_INFO)

@ -37,8 +37,7 @@ def frames_from_db_entry(dbent):
return frames
class Frame():
def __init__(self, tids, deep_links=None, slots=None,
hw_lemma=None, sentences=None, sentence_count=None):
def __init__(self, tids, deep_links=None, slots=None, hw_lemma=None, sentences=None):
self.hw = hw_lemma
self.tids = tids # list of tokens with the same hw_lemma
# Each tid = "S123.t123";
@ -51,8 +50,6 @@ class Frame():
self.sense_info = {}
self.sentences = sentences
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
self.sentence_count = sentence_count # paging, optimization
def get_functors(self):
return [slot.functor for slot in self.slots]
@ -65,8 +62,7 @@ class Frame():
"slots": [slot.to_json() for slot in self.slots],
"sentences": self.sentences,
"aggr_sent": self.aggr_sent,
"sense_info": self.sense_info,
"sentence_count": self.sentence_count
"sense_info": self.sense_info
}
return ret

@ -0,0 +1,96 @@
import logging
log = logging.getLogger(__name__)
class Frame():
    """A valency frame: a headword, its token ids and its functor slots.

    Parameters:
        tids: list of token ids that share the same headword lemma.
            Each tid = "S123.t123"; you can get the sentence with
            vallex.get_sentence(S123).
        deep_links: optional iterable of dicts with keys "functor" and
            "to"; used to build the slots when `slots` is not given.
        slots: optional ready-made list of Slot objects.
        hw: optional headword.
    """

    def __init__(self, tids, deep_links=None, slots=None, hw=None):
        self.hw = hw
        self.tids = tids
        if slots is None:
            self.slots = self.init_slots(deep_links)
        else:
            self.slots = slots
        self.sense_info = {}
        self.sentences = None  # Used for passing to view in app.py, get_frames
        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }

    def to_json(self):
        """Return the frame as a plain dict; slots are converted with Slot.to_json()."""
        ret = {
            "hw": self.hw,
            "tids": self.tids,
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
        }
        return ret

    def init_slots(self, deep):
        """Build one Slot per deep link; returns [] when `deep` is None or empty."""
        # `deep` is None when the frame is created with neither slots nor
        # deep_links; iterating None used to raise TypeError.
        return [
            Slot(functor=link["functor"], tids=[link["to"]])
            for link in (deep or [])
        ]

    def sort_slots(self):
        """Sort slots in place: ACT and PAT first, the rest after, each group alphabetically."""
        # The boolean puts ACT/PAT (False) ahead of every other functor
        # (True); sorted() is stable, so equal functors keep their order.
        self.slots = sorted(
            self.slots,
            key=lambda x: (x.functor not in ("ACT", "PAT"), x.functor)
        )

    def to_string(self):
        """Return a multi-line, human-readable dump of the frame."""
        parts = [
            "Frame:\n",
            "sense_info: {}\n".format(str(self.sense_info)),
            "tids: [",
        ]
        parts.extend(str(t) + ", " for t in self.tids)
        parts.append("]\n")
        if self.slots is not None:
            parts.append("slots:\n")
            parts.extend(sl.to_string() + "\n" for sl in self.slots)
        return "".join(parts)
class Slot():
    """One slot of a frame.

    Each slot is identified by its functor (ACT, PAT, ...).
    It consists of different tokens.
    """

    def __init__(self, functor, tids=None, count=None):
        self.functor = functor
        self.tids = tids or []  # combining multiple sentences vertically
        # NOTE(review): `count or 1` also turns an explicit count=0 into 1;
        # confirm that is intended.
        self.count = count or 1

    def to_string(self):
        """Return a multi-line, human-readable dump of the slot."""
        ret = "---- Slot:\n"
        ret += "functor: {}\n".format(self.functor)
        ret += "tids: ["
        for t in self.tids:
            ret += (str(t) + ", ")
        # Close the tids list exactly once; a duplicated "]\n" used to
        # leave a stray, unbalanced bracket in the output.
        ret += "]\n"
        ret += "----\n"
        return ret

    def to_json(self):
        """Return the slot as a plain dict with keys functor, tids and count."""
        ret = {
            "functor": self.functor,
            "tids": self.tids,
            "count": self.count
        }
        return ret

@ -9,7 +9,6 @@ import logging
log = logging.getLogger(__name__)
SENSE_UNDEFINED = "nedefinirano"
SENTENCE_LIMIT = 10
## TODO: use frame.py
## TODO: build a list of [Frame] with lists of [Slot]
@ -34,8 +33,10 @@ def reduce_0(frames, valdb_sensemap=None):
separated_frames = []
for frame in frames:
for tid in frame.tids:
tmp_frame = DC(frame)
tmp_frame = frame
tmp_frame.tids = [tid]
tmp_frame.sort_slots()
separated_frames.append(tmp_frame)
sorting_strings.append("".join(
[slot.functor for slot in tmp_frame.slots]
@ -71,10 +72,7 @@ def reduce_1(frames, valdb_sensemap=None):
for functor in fs[0]:
slots[functor] = Slot(functor=functor)
# Reduce slots from all frames. (Merge ACT from all frames, ...)
sentence_count = len(fs[1])
for frame in fs[1]:
if len(tids) >= SENTENCE_LIMIT:
break
tids += frame.tids
sentences += frame.sentences
for sl in frame.slots:
@ -82,13 +80,8 @@ def reduce_1(frames, valdb_sensemap=None):
slots_list = []
for k, e in slots.items():
slots_list.append(e)
rf = Frame(
hw_lemma=fs[1][0].hw,
tids=tids,
slots=slots_list,
sentences=sentences,
sentence_count=sentence_count
)
# TODO does appending hw_lemma of first frame work for functor frames too?
rf = Frame(hw_lemma=fs[1][0].hw, tids=tids, slots=slots_list, sentences=sentences)
rf.sort_slots()
ret_frames.append(rf)
return sorted_by_len_tids(ret_frames)
@ -191,11 +184,7 @@ def frames_from_sense_ids(raw_frames, id_map):
tids = []
reduced_slots = []
sentences = []
sentence_count = len(frames)
for frame in frames:
if len(tids) >= SENTENCE_LIMIT:
break
tids += frame.tids
sentences += frame.sentences
for slot in frame.slots:
@ -217,8 +206,7 @@ def frames_from_sense_ids(raw_frames, id_map):
hw_lemma="derp",
tids=tids,
slots=reduced_slots,
sentences=sentences,
sentence_count=sentence_count,
sentences=sentences
)
id_map_entry = (
id_map.get(tids[0]) or

Loading…
Cancel
Save