Compare commits
31 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ec083a8d63 | |||
| 69c3521e4b | |||
| 75b015dcda | |||
| c18aaff11f | |||
| 34b776be11 | |||
| 26bca0b083 | |||
| 2551a9c6a8 | |||
| 5cdc963c2d | |||
| ce1fb46b4e | |||
| 220529b777 | |||
| ae5f2869bc | |||
| 931b3531b3 | |||
| 3d91251905 | |||
| c803057164 | |||
| b4db4e5255 | |||
| ef02583d72 | |||
| 155c0b2c3d | |||
| c96b199932 | |||
| 02c0e74798 | |||
| 2ff339e24c | |||
| 707034153c | |||
| ba72802f1f | |||
| 14a6e2423b | |||
| c5fc78dca1 | |||
| 439446b1d0 | |||
| eca236bc7e | |||
| 11d3dfc0e6 | |||
| 1aff111cb9 | |||
| 910955abb8 | |||
| bf0970a90a | |||
| 81395890ab |
14
.gitignore
vendored
14
.gitignore
vendored
@@ -1,9 +1,23 @@
|
||||
data/samples/
|
||||
data/wordlist.json
|
||||
data/sskj_senses.json
|
||||
data/appindex.json
|
||||
*egg-info/
|
||||
*.pyc
|
||||
src/frontend_vue/node_modules/
|
||||
src/frontend_vue/dist/
|
||||
dockerfiles/database/create.js
|
||||
dockerfiles/database/create_mongo.js
|
||||
dockerfiles/database/create_postgres.js
|
||||
dockerfiles/database/mongo_db.gz
|
||||
dockerfiles/database/postgres_db.tar
|
||||
dockerfiles/database/postgres_db_OLD.tar
|
||||
*__pycache__/
|
||||
env.local
|
||||
logs/*
|
||||
.idea/
|
||||
venv*
|
||||
data/
|
||||
data
|
||||
deploy_instructions/
|
||||
run.sh
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +1,6 @@
|
||||
[submodule "src/pkg/cjvt-corpusparser"]
|
||||
path = src/pkg/cjvt-corpusparser
|
||||
url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
|
||||
[submodule "src/pkg/luscenje_struktur"]
|
||||
path = src/pkg/luscenje_struktur
|
||||
url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
|
||||
|
||||
16
Dockerfile-backend-flask
Normal file
16
Dockerfile-backend-flask
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM cjvt-python-env
|
||||
|
||||
RUN mkdir -p /project/src/backend_flask
|
||||
RUN mkdir -p /project/src/pkg
|
||||
RUN mkdir -p /project/data
|
||||
|
||||
COPY src/backend_flask /project/src/backend_flask
|
||||
COPY src/pkg /project/src/pkg
|
||||
|
||||
COPY data/appindex.json /project/data
|
||||
|
||||
COPY src/backend_flask/entrypoint.sh /.
|
||||
|
||||
COPY src/backend_flask/conf_files/prod_conf.yaml /project
|
||||
|
||||
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
|
||||
65
Makefile
65
Makefile
@@ -11,20 +11,30 @@ MAKE_ROOT = $(shell pwd)
|
||||
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
|
||||
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
|
||||
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
|
||||
KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
||||
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
||||
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
|
||||
GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
|
||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
|
||||
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
||||
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
|
||||
GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
|
||||
# This file comes with the source code. Make sure you unpack it and name it right.
|
||||
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
||||
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
||||
|
||||
# for pre-generation the index of all headwords and functors
|
||||
APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
|
||||
|
||||
OUTPUT = "db"
|
||||
# OUTPUT = "file"
|
||||
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
|
||||
OUTDIR = "/project/data" # if you're running this in docker, make sure to mount the volume
|
||||
DBADDR = "0.0.0.0:27017" # don't use localhost
|
||||
|
||||
# credentials from .gitignored file
|
||||
# create it from env.default
|
||||
include env.local
|
||||
|
||||
N_CORES = 5
|
||||
N_CORES = 4
|
||||
# insert kres files into database in chunks, for fewer connections
|
||||
KRES_CHUNK_SIZE = 30
|
||||
|
||||
@@ -47,6 +57,12 @@ database-service:
|
||||
database-users:
|
||||
cd dockerfiles/database; $(MAKE) create_users
|
||||
|
||||
database-restore:
|
||||
cd dockerfiles/database; $(MAKE) restore_db
|
||||
|
||||
database-restore-postgres:
|
||||
cd dockerfiles/database; $(MAKE) restore_postgres_db
|
||||
|
||||
# also useful, if we want to restart the db
|
||||
database-clean:
|
||||
cd dockerfiles/database; $(MAKE) clean_stack
|
||||
@@ -59,6 +75,8 @@ python-env:
|
||||
python-env-install:
|
||||
pip3 install -e src/pkg/cjvt-corpusparser/.
|
||||
pip3 install -e src/pkg/valency/.
|
||||
pip3 install -e src/pkg/seqparser/.
|
||||
pip3 install -e src/pkg/luscenje_struktur/.
|
||||
|
||||
# from inside python-env container:
|
||||
data/samples:
|
||||
@@ -83,7 +101,14 @@ fill-database-kres: data/samples
|
||||
--chunk-size $(KRES_CHUNK_SIZE) \
|
||||
--cores $(N_CORES)
|
||||
|
||||
|
||||
fill-database-gigafida: data/samples
|
||||
python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
|
||||
--corpus="gigafida" \
|
||||
--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
|
||||
--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
|
||||
--chunk-size $(KRES_CHUNK_SIZE) \
|
||||
--cores $(N_CORES)
|
||||
|
||||
## Frontend
|
||||
|
||||
@@ -95,22 +120,46 @@ frontend-dev:
|
||||
frontend-prod:
|
||||
cd src/frontend_vue/; $(MAKE) prod
|
||||
|
||||
build-frontend-prod:
|
||||
cd src/frontend_vue/; $(MAKE) build-prod
|
||||
|
||||
|
||||
## Backend
|
||||
|
||||
# runs once and exits before the app starts
|
||||
# need to extract ./data/sskj_data.tar.gz first
|
||||
backend-prepare-db:
|
||||
cd ./src/backend_flask; python3 app.py \
|
||||
--config-file ./conf_files/dev_conf.yaml \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
|
||||
--sskj-wordlist $(SSKJ_WORDLIST) \
|
||||
--appindex-json $(APPINDEX_PATH) \
|
||||
--prepare-db
|
||||
|
||||
backend-dev:
|
||||
cd ./src/backend_flask; python3 app.py \
|
||||
--config-file ./conf_files/dev_conf.yaml \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
|
||||
--appindex-json $(APPINDEX_PATH)
|
||||
|
||||
backend-prod:
|
||||
backend-prod-old:
|
||||
cd ./src/backend_flask; python3 app.py \
|
||||
--config-file ./conf_files/prod_conf.yaml \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
|
||||
--appindex-json $(APPINDEX_PATH)
|
||||
|
||||
build-backend-flask:
|
||||
cd ./src/backend_flask; $(MAKE) build
|
||||
|
||||
## add sskj senses to db (generated with pkg/seqparser)
|
||||
sskj-senses:
|
||||
python3 ./src/pkg/seqparser/seqparser/main.py \
|
||||
--sskj-json $(SSKJ_JSON) \
|
||||
--operation "senses_to_db" \
|
||||
--dbaddr $(DBADDR) \
|
||||
--dbuser $(DB_USR_USER) \
|
||||
--dbpass $(DB_USR_PASS)
|
||||
|
||||
deploy-prod-stack:
|
||||
- docker network create val-backend
|
||||
docker stack deploy -c production.yaml val
|
||||
|
||||
127
README.md
127
README.md
@@ -52,8 +52,6 @@ $ make fill-database-ssj
|
||||
$ make fill-database-kres
|
||||
# You can detach from the running process using Ctrl-p + Ctrl-q
|
||||
|
||||
|
||||
|
||||
# this is a long operation
|
||||
# if running on a remote server, use nohup:
|
||||
$ nohup $(make fill-database > fill-database.log) &
|
||||
@@ -74,6 +72,10 @@ $ make python-env-install
|
||||
# needs to be ran once to modify a new database
|
||||
$ make backend-prepare-db
|
||||
|
||||
# if you have the file prepared (sskj_senses.json), you can
|
||||
# fill the database with some senses
|
||||
$ make sskj-senses
|
||||
|
||||
# with debugger
|
||||
$ make backend-dev
|
||||
|
||||
@@ -100,3 +102,124 @@ $ make frontend-prod
|
||||
```
|
||||
|
||||
App available on: `http://0.0.0.0:8080`.
|
||||
|
||||
|
||||
## Production deployment
|
||||
Prerequisite: machine with free ports 80 and 8084.
|
||||
|
||||
|
||||
### Database
|
||||
Either build the database from scratch (lenghty process) using above instructions or just migrate the database from the faculty server (recommended).
|
||||
|
||||
Build container my-mongo:
|
||||
```bash
|
||||
# run once and destroy containers
|
||||
$ make database-service
|
||||
```
|
||||
|
||||
### Backend
|
||||
Set database connection details in `/src/backend_flask/db_config.py`.
|
||||
Change 'valuser' and 'valuserpass' to the database user.
|
||||
```bash
|
||||
mongodb://valuser:valuserpass@my_mongo/valdb
|
||||
```
|
||||
In the above line, replace `valuser` with the username and `valuserpass` with the password that was used to create the database tables (the values were set in the root Makefile).
|
||||
|
||||
You can also set the number of workers in `/src/backend_flask/entrypoint.sh`.
|
||||
In line with `gunicorn -t 4 -b 127.0.0.1:8084 app:app`, edit the `-t` parameter.
|
||||
Rule of thumb is 2x number of available CPU cores.
|
||||
|
||||
Build the backend container:
|
||||
```bash
|
||||
# From git root
|
||||
$ make build-backend-flask
|
||||
```
|
||||
|
||||
### Frontend
|
||||
Set the server address (where backend will be runnig) in `src/frontend_vue/config/config_prod.json`.
|
||||
Build the `/dist` folder that contains the static app (we will be using Nginx to serve it).
|
||||
```bash
|
||||
# From git root
|
||||
$ make build-frontend-prod
|
||||
```
|
||||
|
||||
All set, now run the stack.
|
||||
Stack configuration in `production.yaml`.
|
||||
|
||||
```bash
|
||||
# From git root
|
||||
$ make deploy-prod-stack
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Uploading a mongo dump
|
||||
There's a 15GB mongo dump containing the fully processed kres and ssj data.
|
||||
We can use that file to deploy our aplication.
|
||||
With this database, we will need a minimum of 8GB ram to serve the app.
|
||||
If the server is struggling, frontend will throw "Network errors".
|
||||
|
||||
Check `0.0.0.0:8081` and remove (or backup) the current example database `valdb`.
|
||||
|
||||
Run the stack with mongo port mapped:
|
||||
(uncomment the lines in `production.yaml`)
|
||||
```yml
|
||||
ports:
|
||||
- 27017:27017
|
||||
```
|
||||
|
||||
Run a separate my-mongo container with the mounted data:
|
||||
```bash
|
||||
$ mongo run -it --net host -v <local_dump_path>/dumps my-mongo /bin/bash
|
||||
```
|
||||
|
||||
Inside the container (edit the uesrname, password):
|
||||
```bash
|
||||
$ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0.0:27017
|
||||
```
|
||||
|
||||
After uploading, restart the stack with `27017` commented out.
|
||||
|
||||
## Script running
|
||||
|
||||
### Environment setup
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
|
||||
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
|
||||
```
|
||||
|
||||
### Running on already setup environment
|
||||
```bash
|
||||
make database-service
|
||||
```
|
||||
|
||||
### Setting up environment for running on ramdisk
|
||||
|
||||
```bash
|
||||
# create ramdisk
|
||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
|
||||
|
||||
# change volumes to /mnt/tmp:/data/db
|
||||
vim dockerfiles/database/valency-stack.yml
|
||||
|
||||
# change Makefile -runStack to mkdir -p /mnt/tmp
|
||||
vim dockerfiles/database/Makefile
|
||||
|
||||
# run service
|
||||
make database-service
|
||||
|
||||
# run ONLY ONCE to create users and restore database
|
||||
make database-users
|
||||
make database-restore
|
||||
|
||||
# double check if it worked
|
||||
docker exec -it ef0a /bin/bash
|
||||
|
||||
# following steps in docker bash:
|
||||
# check if it worked by
|
||||
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
|
||||
db.getRoles()
|
||||
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
/home/kristjan/workdir/final_json/
|
||||
@@ -1 +0,0 @@
|
||||
/home/kristjan/kres_data/payload/kres_json/
|
||||
@@ -1 +0,0 @@
|
||||
/home/kristjan/kres_mount/kres_parsed/tei/
|
||||
@@ -1 +0,0 @@
|
||||
/home/kristjan/kres_mount/kres_parsed/tei/
|
||||
Binary file not shown.
@@ -1 +0,0 @@
|
||||
/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml
|
||||
@@ -1,5 +1,5 @@
|
||||
FROM mongo:latest
|
||||
FROM mongo:4.2.9
|
||||
|
||||
WORKDIR /
|
||||
COPY init_inside_container.sh /.
|
||||
COPY create.js /.
|
||||
COPY init_inside_mongo_container.sh /.
|
||||
COPY create_mongo.js /.
|
||||
|
||||
@@ -2,33 +2,62 @@
|
||||
# collection names: lower case, plural
|
||||
# user names?
|
||||
|
||||
# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"
|
||||
|
||||
STACKNAME = dbstack
|
||||
|
||||
.PHONY: start_db FORCE
|
||||
|
||||
all: build_run create_users
|
||||
|
||||
build_run: build_mongo run_stack
|
||||
build_run: build_mongo run_docker_compose
|
||||
|
||||
create.js: FORCE
|
||||
postgres_create_roles:
|
||||
echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
|
||||
echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
|
||||
echo "create database superdb_small;" >> create_postgres.js
|
||||
echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
|
||||
echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
|
||||
echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
|
||||
echo "EOSQL" >> create_postgres.js
|
||||
chmod +x create_postgres.js
|
||||
|
||||
FORCE:
|
||||
echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js
|
||||
echo 'use valdb' >> create.js
|
||||
echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js
|
||||
mongo_create_roles:
|
||||
echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
|
||||
echo 'use valdb' >> create_mongo.js
|
||||
echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
|
||||
echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js
|
||||
|
||||
build_mongo: create.js
|
||||
build_mongo: mongo_create_roles
|
||||
docker build . -t my-mongo --no-cache
|
||||
|
||||
clean_stack:
|
||||
docker stack rm $(STACKNAME)
|
||||
# build_postgres: postgres_create_roles
|
||||
# docker build . -t my-mongo --no-cache
|
||||
|
||||
run_stack:
|
||||
mkdir -p ${HOME}/mongo_container/data/
|
||||
docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
|
||||
run_docker_compose:
|
||||
mkdir -p ${HOME}/valency_data/mongo_container/data/
|
||||
#docker kill $(shell ./get_mongo_container_name.sh)
|
||||
#docker kill $(shell ./get_postgres_container_name.sh)
|
||||
#docker-compose stop
|
||||
docker-compose -f valency-stack.yml up -d --force-recreate
|
||||
# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
|
||||
|
||||
create_users: create.js
|
||||
docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
|
||||
create_users: create_mongo_users create_postgres_users
|
||||
|
||||
|
||||
create_mongo_users: mongo_create_roles
|
||||
docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
|
||||
# rm create.js
|
||||
|
||||
create_postgres_users: postgres_create_roles
|
||||
docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh
|
||||
|
||||
restore_db: restore_mongo_db restore_postgres_db
|
||||
|
||||
restore_mongo_db:
|
||||
ifeq (,$(wildcard ./mongo_db.gz))
|
||||
$(error "mongo_db.gz does not exists. Make sure to have dump of mongo db in 'dockerfiles/database/mongo_db.gz'")
|
||||
else
|
||||
docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
|
||||
endif
|
||||
|
||||
restore_postgres_db:
|
||||
ifeq (,$(wildcard ./postgres_db.tar))
|
||||
$(error "postgres_db.tar does not exists. Make sure to have dump of postgres db in 'dockerfiles/database/postgres_db.tar'")
|
||||
else
|
||||
docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
|
||||
endif
|
||||
|
||||
2
dockerfiles/database/get_postgres_container_name.sh
Executable file
2
dockerfiles/database/get_postgres_container_name.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
#!/bin/bash
|
||||
docker ps | grep postgres | awk '{print $1}'
|
||||
@@ -1,3 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
mongo admin < /create.js
|
||||
3
dockerfiles/database/init_inside_mongo_container.sh
Executable file
3
dockerfiles/database/init_inside_mongo_container.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
mongo admin < /create_mongo.js
|
||||
3
dockerfiles/database/init_inside_postgres_container.sh
Executable file
3
dockerfiles/database/init_inside_postgres_container.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
/scripts/create_postgres.js
|
||||
@@ -1,26 +0,0 @@
|
||||
version: '3.1'
|
||||
|
||||
services:
|
||||
|
||||
my-mongo:
|
||||
image: my-mongo
|
||||
restart: always
|
||||
ports:
|
||||
- 27017:27017
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
|
||||
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
|
||||
volumes:
|
||||
- ${HOME}/mongo_container/data/:/data/db
|
||||
|
||||
mongo-express:
|
||||
image: mongo-express
|
||||
restart: always
|
||||
ports:
|
||||
- 8087:8081
|
||||
environment:
|
||||
ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
|
||||
ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
|
||||
ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
|
||||
ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
|
||||
ME_CONFIG_MONGODB_SERVER: my-mongo
|
||||
27
dockerfiles/database/valency-stack.yml
Normal file
27
dockerfiles/database/valency-stack.yml
Normal file
@@ -0,0 +1,27 @@
|
||||
version: '3.1'
|
||||
|
||||
services:
|
||||
|
||||
my_mongo:
|
||||
image: my-mongo
|
||||
restart: always
|
||||
ports:
|
||||
- 127.0.0.1:27017:27017
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
|
||||
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
|
||||
volumes:
|
||||
- ${HOME}/valency_data/mongo_container/data/:/data/db
|
||||
- ./:/scripts
|
||||
|
||||
my_postgres:
|
||||
image: postgres
|
||||
restart: always
|
||||
ports:
|
||||
- 127.0.0.1:5432:5432
|
||||
environment:
|
||||
POSTGRES_USER: ${DB_ADM_USER}
|
||||
POSTGRES_PASSWORD: ${DB_ADM_PASS}
|
||||
volumes:
|
||||
- ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
|
||||
- ./:/scripts
|
||||
@@ -1,26 +1,37 @@
|
||||
FROM ubuntu:16.04
|
||||
FROM ubuntu:18.04
|
||||
|
||||
RUN apt-get update --fix-missing
|
||||
RUN apt-get install -y \
|
||||
vim \
|
||||
python3 \
|
||||
python3-pip \
|
||||
sshfs
|
||||
sshfs \
|
||||
curl \
|
||||
locales
|
||||
|
||||
RUN pip3 install --upgrade pip
|
||||
|
||||
RUN pip3 install \
|
||||
lxml \
|
||||
pandas \
|
||||
sklearn \
|
||||
argparse \
|
||||
pyyaml \
|
||||
pathlib \
|
||||
flask \
|
||||
flask_cors \
|
||||
pymongo \
|
||||
flask
|
||||
flask-pymongo \
|
||||
gunicorn \
|
||||
SQLAlchemy \
|
||||
tqdm \
|
||||
psycopg2-binary
|
||||
|
||||
RUN apt-get install -y \
|
||||
curl
|
||||
# Set the locale
|
||||
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
|
||||
locale-gen
|
||||
ENV LANG en_US.UTF-8
|
||||
ENV LANGUAGE en_US:en
|
||||
ENV LC_ALL en_US.UTF-8
|
||||
|
||||
ENV PYTHONIOENCODING UTF-8
|
||||
|
||||
RUN pip3 install \
|
||||
pyyaml \
|
||||
flask_cors
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
IMAGE_NAME="cjvt-python-env"
|
||||
IMAGE_NAME="cjvt-python-env" # don't change, used in backend_flask/Makefile
|
||||
CNNAME="python-env"
|
||||
|
||||
all: build run
|
||||
|
||||
5
dockerfiles/python-env/entrypoint.sh
Executable file
5
dockerfiles/python-env/entrypoint.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "testing entrypoint."
|
||||
$(exit 1)
|
||||
exit 0
|
||||
29
nginx.conf
Normal file
29
nginx.conf
Normal file
@@ -0,0 +1,29 @@
|
||||
# frontend
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
location / {
|
||||
root /srv/dist;
|
||||
index index.html index.htm;
|
||||
}
|
||||
|
||||
location /home {
|
||||
return 301 /;
|
||||
}
|
||||
}
|
||||
|
||||
# backend
|
||||
server {
|
||||
listen 8084;
|
||||
server_name _;
|
||||
|
||||
location / {
|
||||
proxy_set_header X-Forward-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header Host $http_host;
|
||||
proxy_pass http://backend_flask:8084;
|
||||
}
|
||||
}
|
||||
|
||||
https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
|
||||
https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*
|
||||
43
production.yaml
Normal file
43
production.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
version: '3.1'
|
||||
|
||||
services:
|
||||
|
||||
my_mongo:
|
||||
image: my-mongo
|
||||
restart: always
|
||||
# ports:
|
||||
# - 27017:27017
|
||||
expose:
|
||||
- 27017
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: valuser
|
||||
MONGO_INITDB_ROOT_PASSWORD: valuserpass
|
||||
volumes:
|
||||
- ${HOME}/mongo_container/data/:/data/db
|
||||
|
||||
mongo_express:
|
||||
image: mongo-express
|
||||
restart: always
|
||||
ports:
|
||||
- 8081:8081
|
||||
environment:
|
||||
ME_CONFIG_BASICAUTH_USERNAME: test
|
||||
ME_CONFIG_BASICAUTH_PASSWORD: test
|
||||
ME_CONFIG_MONGODB_ADMINUSERNAME: valadmin
|
||||
ME_CONFIG_MONGODB_ADMINPASSWORD: rolercoaster
|
||||
ME_CONFIG_MONGODB_SERVER: my_mongo
|
||||
|
||||
backend_flask:
|
||||
image: backend-flask
|
||||
expose:
|
||||
- 8084
|
||||
|
||||
proxy:
|
||||
image: nginx
|
||||
ports:
|
||||
- 80:80
|
||||
- 8084:8084
|
||||
volumes:
|
||||
- ./nginx.conf:/etc/nginx/conf.d/default.conf
|
||||
- ./src/frontend_vue/dist:/srv/dist
|
||||
|
||||
37
requirements.txt
Normal file
37
requirements.txt
Normal file
@@ -0,0 +1,37 @@
|
||||
asn1crypto==0.24.0
|
||||
beautifulsoup4==4.8.0
|
||||
bs4==0.0.1
|
||||
cffi==1.12.3
|
||||
Click==7.0
|
||||
cryptography==2.1.4
|
||||
Flask==1.1.1
|
||||
Flask-Cors==3.0.8
|
||||
Flask-PyMongo==2.3.0
|
||||
gunicorn==19.9.0
|
||||
idna==2.6
|
||||
itsdangerous==1.1.0
|
||||
Jinja2==2.10.1
|
||||
joblib==0.13.2
|
||||
keyring==10.6.0
|
||||
keyrings.alt==3.0
|
||||
lxml==4.4.0
|
||||
MarkupSafe==1.1.1
|
||||
numpy==1.17.0
|
||||
pandas==0.25.0
|
||||
pathlib==1.0.1
|
||||
psycopg2==2.8.4
|
||||
pycparser==2.19
|
||||
pycrypto==2.6.1
|
||||
pymongo==3.8.0
|
||||
python-dateutil==2.8.0
|
||||
pytz==2019.2
|
||||
pyxdg==0.25
|
||||
PyYAML==5.1.2
|
||||
scikit-learn==0.21.3
|
||||
scipy==1.3.0
|
||||
SecretStorage==2.3.1
|
||||
six==1.11.0
|
||||
sklearn==0.0
|
||||
soupsieve==1.9.3
|
||||
SQLAlchemy==1.3.12
|
||||
Werkzeug==0.15.5
|
||||
1708
scripts/create_xml.py
Normal file
1708
scripts/create_xml.py
Normal file
File diff suppressed because it is too large
Load Diff
189
scripts/extract_keywords.py
Normal file
189
scripts/extract_keywords.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import copy
|
||||
import csv
|
||||
from xml.etree import ElementTree
|
||||
import re
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import pickle
|
||||
import time
|
||||
import gc
|
||||
import subprocess
|
||||
import concurrent.futures
|
||||
import tempfile
|
||||
|
||||
|
||||
def read_gigafida(path):
|
||||
words = {}
|
||||
with open(path) as tsvfile:
|
||||
reader = csv.reader(tsvfile, delimiter='\t')
|
||||
for row in reader:
|
||||
words[row[0]] = int(row[2])
|
||||
return words
|
||||
|
||||
|
||||
def read_sloleks(path):
|
||||
words = set()
|
||||
with open(path) as tsvfile:
|
||||
reader = csv.reader(tsvfile, delimiter='\t')
|
||||
for row in reader:
|
||||
words.add(row[1])
|
||||
return words
|
||||
|
||||
|
||||
def read_zele(path):
|
||||
with open(path) as f:
|
||||
content = f.readlines()
|
||||
# fix content
|
||||
content[0] = content[0][1:]
|
||||
# a = content[2]
|
||||
# a = content[2].split()
|
||||
# a = content[2].split()[0].split('<IZT>')[1]
|
||||
# a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
|
||||
content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
|
||||
# content = [x.split() for x in content]
|
||||
return set(content)
|
||||
|
||||
|
||||
def read_wordlist(path):
|
||||
with open(path) as f:
|
||||
content = [line[:-1] for line in f.readlines()]
|
||||
print(content[-1])
|
||||
return set(content)
|
||||
|
||||
|
||||
def filter_gigafida(gigafida_raw, min_limit, max_limit):
|
||||
return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}
|
||||
|
||||
|
||||
def set_list_intersection(gigafida_filtered, sloleks):
|
||||
intersection = {}
|
||||
for word, num in gigafida_filtered.items():
|
||||
if word in sloleks:
|
||||
intersection[word] = num
|
||||
return intersection
|
||||
|
||||
|
||||
def list_list_union(list1, list2):
|
||||
union = copy.copy(list1)
|
||||
for w, n in list2.items():
|
||||
if w not in list1:
|
||||
union[w] = list2[w]
|
||||
return union
|
||||
|
||||
|
||||
def list_list_subtraction(list1, list2):
|
||||
subtraction = {}
|
||||
for w, n in list2.items():
|
||||
# if w == 'dejati':
|
||||
# print('here')
|
||||
if w not in list1:
|
||||
subtraction[w] = n
|
||||
return subtraction
|
||||
|
||||
|
||||
def set_set_subtraction(set1, set2):
|
||||
subtraction = {}
|
||||
for w in set2:
|
||||
if w not in set1:
|
||||
subtraction[w] = -1
|
||||
return subtraction
|
||||
|
||||
|
||||
def create_document(list1, path):
|
||||
with open(path, "w") as text_file:
|
||||
for w, n in list1.items():
|
||||
text_file.write("%s\t%d\n" % (w, n))
|
||||
|
||||
|
||||
def create_document_set(list1, path):
|
||||
with open(path, "w") as text_file:
|
||||
for w in sorted(list(list1)):
|
||||
text_file.write("%s\n" % w)
|
||||
|
||||
|
||||
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
|
||||
gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
|
||||
sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
|
||||
gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
|
||||
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
|
||||
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
|
||||
sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
|
||||
return sloleks_zele_subtraction
|
||||
|
||||
|
||||
def main(args):
|
||||
gigafida_raw = read_gigafida(args.gigafida_verb_list)
|
||||
sloleks = read_sloleks(args.sloleks)
|
||||
zele = read_zele(args.zele)
|
||||
if args.wordlist is not None:
|
||||
sloleks_wordlist = set()
|
||||
# sloleks_wordlist = set()
|
||||
for el in sloleks:
|
||||
if el in gigafida_raw:
|
||||
sloleks_wordlist.add(el)
|
||||
filtered_wordlist = read_wordlist(args.wordlist)
|
||||
|
||||
# sloleks_wordlist = set()
|
||||
for el in sloleks:
|
||||
if el in gigafida_raw:
|
||||
filtered_wordlist.add(el)
|
||||
|
||||
create_document_set(filtered_wordlist, 'wordlist.tsv')
|
||||
# gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
|
||||
gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
|
||||
sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
|
||||
|
||||
nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
|
||||
res = [el[0] for el in nouns_sloleks_gf_intersect]
|
||||
|
||||
gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
|
||||
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
|
||||
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
|
||||
sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
|
||||
create_document(gigafida_filtered3, 'gigafida_3+.tsv')
|
||||
# create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
|
||||
create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
|
||||
create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
|
||||
|
||||
# gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
|
||||
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
|
||||
gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
|
||||
create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
|
||||
|
||||
# gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
|
||||
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
|
||||
gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
|
||||
create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
|
||||
# pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract keywords from multiple lists.')
|
||||
parser.add_argument('gigafida_verb_list',
|
||||
help='Path to gigafida list of verbs in tsv format.')
|
||||
parser.add_argument('sloleks',
|
||||
help='Path to Sloleks in tsv format.')
|
||||
parser.add_argument('--zele',
|
||||
help='Path to zele valency dictionary.')
|
||||
parser.add_argument('--wordlist', default=None,
|
||||
help='Path to filtered wordlist.')
|
||||
parser.add_argument('--handchecked_words', default=None,
|
||||
help='Path to handchecked words.')
|
||||
# parser.add_argument('--min_limit',
|
||||
# help='Limit min number of ocurrences',
|
||||
# type=int, default=0)
|
||||
# parser.add_argument('--max_limit',
|
||||
# help='Limit max number of ocurrences',
|
||||
# type=int, default=sys.maxsize)
|
||||
parser.add_argument('--verbose', help='Enable verbose output to stderr',
|
||||
choices=["warning", "info", "debug"], default="info",
|
||||
const="info", nargs='?')
|
||||
|
||||
args = parser.parse_args()
|
||||
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
||||
|
||||
start = time.time()
|
||||
main(args)
|
||||
logging.info("TIME: {}".format(time.time() - start))
|
||||
117
scripts/form_csv.py
Normal file
117
scripts/form_csv.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
|
||||
from lxml import etree, objectify, html
|
||||
|
||||
|
||||
def write_general_statistics(path, out_list):
|
||||
if len(out_list) == 0:
|
||||
return
|
||||
with open(path, 'w') as csvfile:
|
||||
writer = csv.writer(csvfile, delimiter='\t',
|
||||
quotechar='"')
|
||||
writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
|
||||
for line in out_list:
|
||||
writer.writerow(line)
|
||||
|
||||
|
||||
def write_statistics(path, out_list):
|
||||
if len(out_list) == 0:
|
||||
return
|
||||
with open(path, 'w') as csvfile:
|
||||
writer = csv.writer(csvfile, delimiter='\t',
|
||||
quotechar='"')
|
||||
writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
|
||||
for line in out_list:
|
||||
writer.writerow(line)
|
||||
|
||||
|
||||
def main(args):
|
||||
for file in sorted(os.listdir(args.input)):
|
||||
path = os.path.join(args.input, file)
|
||||
tree = etree.parse(path)
|
||||
gf_output = []
|
||||
ssj_output = []
|
||||
head = next(tree.iter('head'))
|
||||
headword = head.find('headword').find('lemma').text
|
||||
#for div in root.iterfind('.//div'):
|
||||
for elem in tree.iter('statisticsContainer'):
|
||||
# for element in tree.iterfind('statisticsContainer'):
|
||||
# for element in tree.find('statisticsContainer'):
|
||||
semRole = elem.find('semanticRole').text
|
||||
gf_pattern = None
|
||||
gf_sentence = None
|
||||
ssj_pattern = None
|
||||
ssj_sentence = None
|
||||
measure = elem.find('measureList')
|
||||
for el in measure:
|
||||
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
|
||||
gf_pattern = el.text
|
||||
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
|
||||
gf_sentence = el.text
|
||||
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
|
||||
ssj_pattern = el.text
|
||||
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
|
||||
ssj_sentence = el.text
|
||||
if gf_pattern is not None and gf_sentence is not None:
|
||||
gf_output.append([semRole, gf_pattern, gf_sentence])
|
||||
if ssj_pattern is not None and ssj_sentence is not None:
|
||||
ssj_output.append([semRole, ssj_pattern, ssj_sentence])
|
||||
|
||||
print(file)
|
||||
|
||||
analyze_output = []
|
||||
for elem in tree.iter('valencyPattern'):
|
||||
valency_pattern_id = elem.attrib['id']
|
||||
|
||||
# get frequency
|
||||
measure = ''
|
||||
for measure_el in elem.find('measureList').findall('measure'):
|
||||
if measure_el.attrib['source'] == 'Gigafida 2.0':
|
||||
measure = measure_el.text
|
||||
|
||||
# get semantic roles
|
||||
semantic_roles_list = []
|
||||
for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
|
||||
semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
|
||||
semantic_roles = '_'.join(semantic_roles_list)
|
||||
|
||||
# pattern representation
|
||||
pattern_representation = elem.find('patternRepresentation').text
|
||||
|
||||
# corpus example
|
||||
if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
|
||||
corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
|
||||
|
||||
else:
|
||||
continue
|
||||
|
||||
# ugly postprocessing to remove xmlns:xsi=... duh..
|
||||
root = etree.fromstring(corpus_example_text)
|
||||
|
||||
# Remove namespace prefixes
|
||||
for elem in root.getiterator():
|
||||
elem.tag = etree.QName(elem).localname
|
||||
# Remove unused namespace declarations
|
||||
etree.cleanup_namespaces(root)
|
||||
|
||||
corpus_example = etree.tostring(root, encoding='unicode')
|
||||
|
||||
print(f"Valency pattern {valency_pattern_id}")
|
||||
|
||||
|
||||
analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
|
||||
|
||||
write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
|
||||
write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
|
||||
write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
|
||||
|
||||
if __name__ == '__main__':
|
||||
arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
|
||||
arg_parser.add_argument('--input', type=str, help='Input directory')
|
||||
arg_parser.add_argument('--output', type=str, help='Output directory')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
main(args)
|
||||
1
scripts/valency
Symbolic link
1
scripts/valency
Symbolic link
@@ -0,0 +1 @@
|
||||
../src/pkg/valency/valency
|
||||
8
scripts/xsd_checker.py
Normal file
8
scripts/xsd_checker.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from lxml import etree as lxml
|
||||
|
||||
with open('../data/inventory.xsd') as f:
|
||||
xmlschema_doc = lxml.parse(f)
|
||||
xmlschema = lxml.XMLSchema(xmlschema_doc)
|
||||
with open('../data/xmls/output.xml') as op:
|
||||
doc = lxml.parse(op)
|
||||
print(xmlschema.validate(doc))
|
||||
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
16
src/backend_flask/Makefile
Normal file
16
src/backend_flask/Makefile
Normal file
@@ -0,0 +1,16 @@
|
||||
IMG="backend-flask"
|
||||
CNT="backend_flask"
|
||||
|
||||
clean:
|
||||
- docker rm -f $(CNT)
|
||||
|
||||
run: clean build
|
||||
docker run -d --net host --name $(CNT) $(IMG)
|
||||
docker logs -f $(CNT)
|
||||
|
||||
build: build-cjvt-python-env
|
||||
# docker build . -f ../../Dockerfile-backend-flask -t $(IMG)
|
||||
cd ../..; docker build . -f Dockerfile-backend-flask -t $(IMG)
|
||||
|
||||
build-cjvt-python-env:
|
||||
cd ../../dockerfiles/python-env; $(MAKE) build
|
||||
@@ -26,23 +26,22 @@ from email.mime.text import MIMEText
|
||||
from copy import deepcopy as DC
|
||||
from pathlib import Path
|
||||
from pymongo import MongoClient
|
||||
from flask_pymongo import PyMongo
|
||||
import pymongo
|
||||
import argparse
|
||||
|
||||
# some db collections
|
||||
USERS_COLL = "users"
|
||||
TOKENS_COLL = "usertokens"
|
||||
SENSES_COLL = "senses"
|
||||
SENSEMAP_COLL = "sensemap"
|
||||
|
||||
# pre-generated data (gui leftside word index)
|
||||
CORPORA = ["ssj", "kres"]
|
||||
app_index = None
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
valdb = None
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config.from_object("db_config")
|
||||
mongo = PyMongo(app)
|
||||
|
||||
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
|
||||
app.config["CORPORA"] = ["gigafida"]
|
||||
app.config["BANNED_HEADWORDS"] = ["biti"]
|
||||
app.config["QUERY_LIMIT"] = 1000
|
||||
|
||||
|
||||
# when running vuejs via webpack
|
||||
# CORS(app)
|
||||
@@ -57,7 +56,7 @@ CORS(app)
|
||||
@app.route("/api/dev")
|
||||
def api_dev():
|
||||
print("DEV")
|
||||
cur = valdb.kres.find({"headwords": "nagovarjati"})
|
||||
cur = mongo.db.kres.find({"headwords": "nagovarjati"})
|
||||
frames = []
|
||||
for ent in cur:
|
||||
frames += frames_from_db_entry(ent)
|
||||
@@ -70,12 +69,12 @@ def api_dev():
|
||||
@app.route("/api/words/<corpus>")
|
||||
def api_words(corpus):
|
||||
return json.dumps({
|
||||
"sorted_words": app_index[corpus]["words"], # todo - make corpus as arg
|
||||
"sorted_words": app.config["app_index"][corpus]["words"], # todo - make corpus as arg
|
||||
})
|
||||
|
||||
@app.route("/api/functors/<corpus>")
|
||||
def api_functors(corpus):
|
||||
return json.dumps(app_index[corpus]["functors"])
|
||||
return json.dumps(app.config["app_index"][corpus]["functors"])
|
||||
|
||||
# INDEX SELECTION -------------------^
|
||||
|
||||
@@ -96,7 +95,7 @@ def api_register():
|
||||
):
|
||||
return "ERR"
|
||||
email_hash = hashlib.sha256(email.encode("utf-8")).hexdigest()
|
||||
existing = list(valdb[USERS_COLL].find({
|
||||
existing = list(mongo.db.users.find({
|
||||
"$or": [{"username": username}, {"email": email_hash}]
|
||||
}))
|
||||
if len(existing) > 0:
|
||||
@@ -107,7 +106,7 @@ def api_register():
|
||||
password.encode("utf-8")).hexdigest(),
|
||||
"email": email_hash
|
||||
}
|
||||
valdb[USERS_COLL].insert(entry)
|
||||
mongo.db.users.insert(entry)
|
||||
return "OK"
|
||||
|
||||
|
||||
@@ -119,7 +118,7 @@ def api_login():
|
||||
password = data["password"]
|
||||
hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()
|
||||
|
||||
db_user = list(valdb[USERS_COLL].find({
|
||||
db_user = list(mongo.db.users.find({
|
||||
"username": username,
|
||||
"hpass": hpass
|
||||
}))
|
||||
@@ -133,7 +132,7 @@ def api_login():
|
||||
"date": datetime.datetime.utcnow(),
|
||||
"token": token
|
||||
}
|
||||
valdb[TOKENS_COLL].update(
|
||||
mongo.db.usertokens.update(
|
||||
{"username": token_entry["username"]},
|
||||
token_entry,
|
||||
upsert=True
|
||||
@@ -176,7 +175,7 @@ def api_new_pass():
|
||||
username = data["username"]
|
||||
email = data["email"]
|
||||
hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
|
||||
db_res = list(valdb[USERS_COLL].find({
|
||||
db_res = list(mongo.db.users.find({
|
||||
"username": username,
|
||||
"email": hemail
|
||||
}))
|
||||
@@ -188,7 +187,7 @@ def api_new_pass():
|
||||
string.ascii_letters + string.digits) for i in range(10)])
|
||||
# update locally
|
||||
hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
|
||||
valdb[USERS_COLL].update(
|
||||
mongo.db.users.update(
|
||||
{
|
||||
"username": username,
|
||||
"email": hemail
|
||||
@@ -206,12 +205,12 @@ def token_to_username(token):
|
||||
key = {
|
||||
"token": token
|
||||
}
|
||||
res = list(valdb[TOKENS_COLL].find(key))
|
||||
res = list(mongo.db.usertokens.find(key))
|
||||
if len(res) != 1:
|
||||
return None
|
||||
username = res[0]["username"]
|
||||
# update deletion interval
|
||||
valdb[TOKENS_COLL].update(
|
||||
mongo.db.usertokens.update(
|
||||
key, {"$set": {"date": datetime.datetime.utcnow()}})
|
||||
return username
|
||||
|
||||
@@ -246,22 +245,26 @@ def api_get_frames():
|
||||
RF = reduce_functions[rf_name]["f"]
|
||||
|
||||
corpus = request.args.get("cor")
|
||||
if corpus not in CORPORA:
|
||||
if corpus not in app.config["CORPORA"]:
|
||||
return json.dumps({"error": "cor={kres,ssj}"})
|
||||
|
||||
cur = valdb[corpus].find({"headwords": hw})
|
||||
log.info("Test1")
|
||||
cur = mongo.db[corpus].find({"headwords": hw})
|
||||
log.info("Test2")
|
||||
frames = []
|
||||
for ent in cur:
|
||||
for ent in cur[:app.config["QUERY_LIMIT"]]:
|
||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
||||
|
||||
cur.close()
|
||||
log.info("Test3")
|
||||
# filter by relevant hw
|
||||
frames = [x for x in frames if x.hw == hw]
|
||||
|
||||
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
|
||||
|
||||
ret_frames = RF(frames, mongo.db.sensemap)
|
||||
log.info("Test3")
|
||||
json_ret = {"frames": []}
|
||||
for frame in ret_frames:
|
||||
json_ret["frames"].append(frame.to_json())
|
||||
log.info("Test4")
|
||||
return json.dumps(json_ret)
|
||||
# return prepare_frames(ret_frames)
|
||||
|
||||
@@ -298,19 +301,20 @@ def api_get_functor_frames():
|
||||
RF = reduce_functions[rf_name]["f"]
|
||||
|
||||
corpus = request.args.get("cor")
|
||||
if corpus not in CORPORA:
|
||||
if corpus not in app.config["CORPORA"]:
|
||||
return json.dumps({"error": "cor={kres,ssj}"})
|
||||
|
||||
cur = valdb[corpus].find({"functors": functor})
|
||||
cur = mongo.db[corpus].find({"functors": functor})
|
||||
frames = []
|
||||
for ent in cur:
|
||||
for ent in cur[:app.config["QUERY_LIMIT"]]:
|
||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
||||
cur.close()
|
||||
|
||||
# filter by relevant functor
|
||||
frames = [x for x in frames if functor in x.get_functors()]
|
||||
|
||||
# raw_frames = vallex.functors_index[functor] # TODO
|
||||
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
|
||||
ret_frames = RF(frames, mongo.db.sensemap)
|
||||
ret_frames = _aggregate_by_hw(ret_frames)
|
||||
|
||||
json_ret = {"frames": []}
|
||||
@@ -322,15 +326,17 @@ def api_get_functor_frames():
|
||||
|
||||
|
||||
# SENSES ----------------------------.
|
||||
# ssj_id is legacy notation, read
|
||||
# it as general sentence_id
|
||||
|
||||
@app.route("/api/senses/get")
|
||||
def api_senses_get():
|
||||
# returns senses and mapping for hw
|
||||
hw = request.args.get("hw")
|
||||
senses = list(valdb[SENSES_COLL].find({
|
||||
senses = list(mongo.db.senses.find({
|
||||
"hw": hw
|
||||
}))
|
||||
sense_map_query = list(valdb[SENSEMAP_COLL].find({
|
||||
sense_map_query = list(mongo.db.sensemap.find({
|
||||
"hw": hw
|
||||
}))
|
||||
# aggregation by max date possible on DB side
|
||||
@@ -407,8 +413,10 @@ def api_senses_update():
|
||||
ns["date"] = tmp_dt
|
||||
id_map[frontend_sense_id] = new_sense_id
|
||||
|
||||
print(ns)
|
||||
|
||||
# insert into db
|
||||
valdb[SENSES_COLL].insert(ns)
|
||||
mongo.db.senses.insert(ns)
|
||||
|
||||
# replace tmp_id with mongo's _id
|
||||
for ssj_id, el in sense_map.items():
|
||||
@@ -423,22 +431,42 @@ def api_senses_update():
|
||||
"date": datetime.datetime.utcnow()
|
||||
}
|
||||
# vallex.db["v2_sense_map"].update(key, data, upsert=True)
|
||||
valdb[SENSEMAP_COLL].insert(data)
|
||||
mongo.db.sensemap.insert(data)
|
||||
return "OK"
|
||||
|
||||
# SENSES ----------------------------^
|
||||
|
||||
|
||||
# APP PREFLIGHT ---------------------.
|
||||
def _is_banned(hw):
|
||||
banned = True
|
||||
if hw in app.config["BANNED_HEADWORDS"]:
|
||||
banned = True
|
||||
elif hw in sskj_wordlist["wordlist"]:
|
||||
banned = False
|
||||
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
||||
banned = False
|
||||
return banned
|
||||
|
||||
def prepare_app_index():
|
||||
def prepare_app_index(appindex_json):
|
||||
log.info("[*] preparing app_index")
|
||||
# create app_index (used in frontend, left side word index)
|
||||
tmp_app_index = {c: {} for c in CORPORA}
|
||||
for corpus in CORPORA:
|
||||
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
|
||||
for corpus in app.config["CORPORA"]:
|
||||
res_hws = {}
|
||||
res_fns = {}
|
||||
for e in valdb[corpus].find({}):
|
||||
|
||||
# print('CORPUS...!!...')
|
||||
# print(corpus)
|
||||
# a = mongo.db[corpus]
|
||||
# print('TEST_OK')
|
||||
# print(a)
|
||||
# print(mongo.db)
|
||||
# a = mongo.db.list_collection_names()
|
||||
# print('TEST_OK2')
|
||||
nentries = mongo.db[corpus].count()
|
||||
idx = 0
|
||||
for e in mongo.db[corpus].find({}):
|
||||
if "headwords" not in e:
|
||||
continue
|
||||
for hw in e["headwords"]:
|
||||
@@ -453,6 +481,10 @@ def prepare_app_index():
|
||||
res_fns[fn] += 1
|
||||
else:
|
||||
res_fns[fn] = 1
|
||||
idx += 1
|
||||
if idx % 10000 == 0:
|
||||
log.debug("indexing {}: {}/{}".format(
|
||||
corpus, idx, nentries))
|
||||
|
||||
alphabetical = {}
|
||||
for k, e in res_hws.items():
|
||||
@@ -462,19 +494,48 @@ def prepare_app_index():
|
||||
else:
|
||||
alphabetical[fst] = [(k, e)]
|
||||
|
||||
for k, e in alphabetical.items():
|
||||
alphabetical[k] = sorted(e, key=lambda x: x[0])
|
||||
for letter, words in alphabetical.items():
|
||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
||||
# filtered_words = [x for x in words]
|
||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
||||
|
||||
tmp_app_index[corpus]["words"] = alphabetical
|
||||
|
||||
|
||||
functors = [(k, e) for (k, e) in res_fns.items()]
|
||||
functors = sorted(functors, key=lambda x: x[0])
|
||||
tmp_app_index[corpus]["functors"] = functors
|
||||
|
||||
valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)
|
||||
with Path(appindex_json).open("w") as fp:
|
||||
json.dump(tmp_app_index, fp)
|
||||
|
||||
# APP PREFLIGHT ---------------------^
|
||||
|
||||
|
||||
def init_wsgi(app):
|
||||
print("Initiating wsgi")
|
||||
config = None
|
||||
with Path("/project/prod_conf.yaml").open("r") as fp:
|
||||
config = list(yaml.safe_load_all(fp))[0]
|
||||
|
||||
app.debug = False
|
||||
logfile = config["logfile"]
|
||||
logging.basicConfig(filename=logfile, level=logging.INFO)
|
||||
|
||||
# app index from db
|
||||
with Path(config["appindex"]).open("r") as fp:
|
||||
# a dirty hack but ok
|
||||
app.config["app_index"] = json.load(fp)
|
||||
|
||||
# log.info("[*] Starting app.py with config:\n%s".format(config))
|
||||
log.info("[*] Starting app.py with config:\n{}".format(config))
|
||||
|
||||
|
||||
# if we don't pass arguments, assume production environment (gunicorn)
|
||||
if "gunicorn" in sys.argv[0]:
|
||||
init_wsgi(app)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Starting app.py main()")
|
||||
aparser = argparse.ArgumentParser(description="Arguments for app.py")
|
||||
@@ -483,9 +544,10 @@ if __name__ == "__main__":
|
||||
aparser.add_argument("--dbuser", type=str)
|
||||
aparser.add_argument("--dbpass", type=str)
|
||||
aparser.add_argument("--dbaddr", type=str)
|
||||
aparser.add_argument("--sskj-wordlist", type=str)
|
||||
aparser.add_argument("--appindex-json", type=str)
|
||||
args = aparser.parse_args()
|
||||
|
||||
config = None
|
||||
with Path(args.config_file).open("r") as fp:
|
||||
config = list(yaml.safe_load_all(fp))[0]
|
||||
|
||||
@@ -496,25 +558,35 @@ if __name__ == "__main__":
|
||||
else:
|
||||
logging.basicConfig(filename=logfile, level=logging.INFO)
|
||||
|
||||
"""
|
||||
# db login
|
||||
client = MongoClient(
|
||||
"mongodb://{}".format(args.dbaddr),
|
||||
username=args.dbuser,
|
||||
password=args.dbpass,
|
||||
authSource="valdb",
|
||||
authSource="mongo.db",
|
||||
authMechanism='SCRAM-SHA-1'
|
||||
)
|
||||
valdb = client.valdb
|
||||
valdb = client.mongo.db
|
||||
"""
|
||||
|
||||
if args.prepare_db:
|
||||
prepare_app_index()
|
||||
with Path(args.sskj_wordlist).open("r") as fp:
|
||||
sskj_wordlist = json.load(fp)
|
||||
prepare_app_index(args.appindex_json)
|
||||
sys.exit()
|
||||
|
||||
# app index from db
|
||||
app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"]
|
||||
with Path(args.appindex_json).open("r") as fp:
|
||||
app.config["app_index"] = json.load(fp)
|
||||
# a = app.config["app_index"]
|
||||
# b = app.config["app_index"]["kres"]
|
||||
# c = app.config["app_index"]["kres"]["words"]
|
||||
# print('HERE')
|
||||
|
||||
# log.info("[*] Starting app.py with config:\n%s".format(config))
|
||||
log.info("[*] Starting app.py with config:\n{}".format(config))
|
||||
|
||||
app.run(host=str(config["host"]), port=int(config["port"]))
|
||||
|
||||
|
||||
|
||||
106
src/backend_flask/build_app_index.py
Normal file
106
src/backend_flask/build_app_index.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from flask import Flask
|
||||
from flask_pymongo import PyMongo
|
||||
from pathlib import Path
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config.from_object("db_config")
|
||||
mongo = PyMongo(app)
|
||||
|
||||
app.config["BANNED_HEADWORDS"] = ["biti"]
|
||||
|
||||
def _is_banned(hw):
|
||||
banned = True
|
||||
if hw in app.config["BANNED_HEADWORDS"]:
|
||||
banned = True
|
||||
elif hw in sskj_wordlist["wordlist"]:
|
||||
banned = False
|
||||
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
||||
banned = False
|
||||
return banned
|
||||
|
||||
|
||||
def prepare_app_index(appindex_json, corporas, previous_json=None):
|
||||
if previous_json:
|
||||
with Path(previous_json).open("r") as fp:
|
||||
tmp_app_index = json.load(fp)
|
||||
else:
|
||||
tmp_app_index = {}
|
||||
# create app_index (used in frontend, left side word index)
|
||||
for c in corporas:
|
||||
tmp_app_index[c] = {}
|
||||
|
||||
for corpus in corporas:
|
||||
res_hws = {}
|
||||
res_fns = {}
|
||||
|
||||
# print('CORPUS...!!...')
|
||||
# print(corpus)
|
||||
# a = mongo.db[corpus]
|
||||
# print('TEST_OK')
|
||||
# print(a)
|
||||
# print(mongo.db)
|
||||
# a = mongo.db.list_collection_names()
|
||||
# print('TEST_OK2')
|
||||
nentries = mongo.db[corpus].count()
|
||||
idx = 0
|
||||
for e in mongo.db[corpus].find({}):
|
||||
if "headwords" not in e:
|
||||
continue
|
||||
for hw in e["headwords"]:
|
||||
if hw in res_hws:
|
||||
res_hws[hw] += 1
|
||||
else:
|
||||
res_hws[hw] = 1
|
||||
if "functors" not in e:
|
||||
continue
|
||||
for fn in e["functors"]:
|
||||
if fn in res_fns:
|
||||
res_fns[fn] += 1
|
||||
else:
|
||||
res_fns[fn] = 1
|
||||
idx += 1
|
||||
if idx % 10000 == 0:
|
||||
print("indexing {}: {}/{}".format(
|
||||
corpus, idx, nentries))
|
||||
|
||||
alphabetical = {}
|
||||
for k, e in res_hws.items():
|
||||
fst = k[0].lower()
|
||||
if fst in alphabetical:
|
||||
alphabetical[fst].append((k, e))
|
||||
else:
|
||||
alphabetical[fst] = [(k, e)]
|
||||
|
||||
for letter, words in alphabetical.items():
|
||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
||||
# filtered_words = [x for x in words]
|
||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
||||
|
||||
tmp_app_index[corpus]["words"] = alphabetical
|
||||
|
||||
|
||||
functors = [(k, e) for (k, e) in res_fns.items()]
|
||||
functors = sorted(functors, key=lambda x: x[0])
|
||||
tmp_app_index[corpus]["functors"] = functors
|
||||
|
||||
with Path(appindex_json).open("w") as fp:
|
||||
json.dump(tmp_app_index, fp)
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Starting app.py main()")
|
||||
aparser = argparse.ArgumentParser(description="Arguments for app.py")
|
||||
aparser.add_argument("--previous-json", type=str, default=None)
|
||||
aparser.add_argument("--appindex-json", type=str)
|
||||
aparser.add_argument("--sskj-wordlist", type=str)
|
||||
args = aparser.parse_args()
|
||||
|
||||
corporas = ['gigafida']
|
||||
|
||||
with Path(args.sskj_wordlist).open("r") as fp:
|
||||
sskj_wordlist = json.load(fp)
|
||||
|
||||
prepare_app_index(args.appindex_json, corporas, args.previous_json)
|
||||
@@ -4,3 +4,4 @@ port: 8084
|
||||
host: localhost
|
||||
logfile: "/var/log/valency_backend.log"
|
||||
---
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
---
|
||||
debug: True
|
||||
port: 8084
|
||||
host: 0.0.0.0
|
||||
logfile: "/var/log/valency_backend.log"
|
||||
---
|
||||
appindex: /project/data/appindex.json
|
||||
|
||||
2
src/backend_flask/db_config.py
Normal file
2
src/backend_flask/db_config.py
Normal file
@@ -0,0 +1,2 @@
|
||||
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
|
||||
MONGO_AUTH_SOURCE = 'admin'
|
||||
8
src/backend_flask/entrypoint.sh
Executable file
8
src/backend_flask/entrypoint.sh
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
pip3 install -e /project/src/pkg/cjvt-corpusparser/.
|
||||
pip3 install -e /project/src/pkg/valency/.
|
||||
pip3 install -e /project/src/pkg/seqparser/.
|
||||
|
||||
cd /project/src/backend_flask
|
||||
gunicorn -t 4 -b 0.0.0.0:8084 app:app
|
||||
18
src/backend_flask/get_sentence_ids.py
Normal file
18
src/backend_flask/get_sentence_ids.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
|
||||
output_file = "../../all_sentences.json"
|
||||
|
||||
results = {}
|
||||
filenames = os.listdir(input_dir)
|
||||
len(filenames)
|
||||
for i, filename in enumerate(filenames):
|
||||
if filename.endswith(".json"):
|
||||
with open(os.path.join(input_dir, filename)) as json_file:
|
||||
data = json.load(json_file)
|
||||
results[filename.split('-')[0]] = list(data.keys())
|
||||
print('Progress: %.2f %%' % (i/len(filenames)))
|
||||
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(results, f)
|
||||
@@ -1,73 +0,0 @@
# Deprecated: headword creation moved to be part of corpusparser,
# index creation moved to app.py as a preprocessing (with exit) step

CORPORA = ["kres", "ssj"]

if __name__ == "__main__":

    valdb = None

    def helper_tid_to_token(tid, tokens):
        for t in tokens:
            if t["tid"] == tid:
                return t
        return None

    # update entries (add headwords and functors for indexing)
    for corpus in CORPORA:
        for e in valdb[corpus].find({}):
            if e["srl_links"] is None:
                e["headwords"] = []
                e["functors"] = []
            else:
                hw_tids = list(set([x["from"] for x in e["srl_links"]]))
                hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
                headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
                e["headwords"] = headwords

                functors = list(set([x["afun"] for x in e["srl_links"]]))
                e["functors"] = functors

            valdb[corpus].save(e)

        valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
        valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])

    # create app_index (used in frontend, left side word index)
    tmp_app_index = {c: {} for c in CORPORA}
    for corpus in CORPORA:
        res_hws = {}
        res_fns = {}
        for e in valdb[corpus].find({}):
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                if hw in res_hws:
                    res_hws[hw] += 1
                else:
                    res_hws[hw] = 1
            if "functors" not in e:
                continue
            for fn in e["functors"]:
                if fn in res_fns:
                    res_fns[fn] += 1
                else:
                    res_fns[fn] = 1

        alphabetical = {}
        for k, e in res_hws.items():
            fst = k[0].lower()
            if fst in alphabetical:
                alphabetical[fst].append((k, e))
            else:
                alphabetical[fst] = [(k, e)]

        for k, e in alphabetical.items():
            alphabetical[k] = sorted(e, key=lambda x: x[0])
        tmp_app_index[corpus]["words"] = alphabetical

        functors = [(k, e) for (k, e) in res_fns.items()]
        functors = sorted(functors, key=lambda x: x[0])
        tmp_app_index[corpus]["functors"] = functors

    valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)
@@ -18,7 +18,12 @@ dev: build-container clean
	docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/dev.sh

prod: build-container clean
	docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
	docker run --restart always --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh

node-env: clean
	docker run --name $(CONNAME) -it -p 8080:8080 -v $(shell pwd):/src $(IMGNAME)

build-prod: build-container clean
	docker run --rm -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh

1
src/frontend_vue/dist_bkp/index.html
Normal file
@@ -0,0 +1 @@
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>vue_frontend</title><link href=/static/css/app.05a420a551b5bded5dfec6b370d3edca.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.5d3d2fd333c62579d227.js></script><script type=text/javascript src=/static/js/app.8538f7133303d3e391b2.js></script></body></html>
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,2 @@
!function(r){var n=window.webpackJsonp;window.webpackJsonp=function(e,u,c){for(var f,i,p,a=0,l=[];a<e.length;a++)i=e[a],o[i]&&l.push(o[i][0]),o[i]=0;for(f in u)Object.prototype.hasOwnProperty.call(u,f)&&(r[f]=u[f]);for(n&&n(e,u,c);l.length;)l.shift()();if(c)for(a=0;a<c.length;a++)p=t(t.s=c[a]);return p};var e={},o={2:0};function t(n){if(e[n])return e[n].exports;var o=e[n]={i:n,l:!1,exports:{}};return r[n].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=r,t.c=e,t.d=function(r,n,e){t.o(r,n)||Object.defineProperty(r,n,{configurable:!1,enumerable:!0,get:e})},t.n=function(r){var n=r&&r.__esModule?function(){return r.default}:function(){return r};return t.d(n,"a",n),n},t.o=function(r,n){return Object.prototype.hasOwnProperty.call(r,n)},t.p="/",t.oe=function(r){throw console.error(r),r}}([]);
//# sourceMappingURL=manifest.2ae2e69a05c33dfc65f8.js.map
@@ -0,0 +1 @@
{"version":3,"sources":["webpack:///webpack/bootstrap d176f5affa434246605f"],"names":["parentJsonpFunction","window","chunkIds","moreModules","executeModules","moduleId","chunkId","result","i","resolves","length","installedChunks","push","Object","prototype","hasOwnProperty","call","modules","shift","__webpack_require__","s","installedModules","2","exports","module","l","m","c","d","name","getter","o","defineProperty","configurable","enumerable","get","n","__esModule","object","property","p","oe","err","console","error"],"mappings":"aACA,IAAAA,EAAAC,OAAA,aACAA,OAAA,sBAAAC,EAAAC,EAAAC,GAIA,IADA,IAAAC,EAAAC,EAAAC,EAAAC,EAAA,EAAAC,KACQD,EAAAN,EAAAQ,OAAoBF,IAC5BF,EAAAJ,EAAAM,GACAG,EAAAL,IACAG,EAAAG,KAAAD,EAAAL,GAAA,IAEAK,EAAAL,GAAA,EAEA,IAAAD,KAAAF,EACAU,OAAAC,UAAAC,eAAAC,KAAAb,EAAAE,KACAY,EAAAZ,GAAAF,EAAAE,IAIA,IADAL,KAAAE,EAAAC,EAAAC,GACAK,EAAAC,QACAD,EAAAS,OAAAT,GAEA,GAAAL,EACA,IAAAI,EAAA,EAAYA,EAAAJ,EAAAM,OAA2BF,IACvCD,EAAAY,IAAAC,EAAAhB,EAAAI,IAGA,OAAAD,GAIA,IAAAc,KAGAV,GACAW,EAAA,GAIA,SAAAH,EAAAd,GAGA,GAAAgB,EAAAhB,GACA,OAAAgB,EAAAhB,GAAAkB,QAGA,IAAAC,EAAAH,EAAAhB,IACAG,EAAAH,EACAoB,GAAA,EACAF,YAUA,OANAN,EAAAZ,GAAAW,KAAAQ,EAAAD,QAAAC,IAAAD,QAAAJ,GAGAK,EAAAC,GAAA,EAGAD,EAAAD,QAKAJ,EAAAO,EAAAT,EAGAE,EAAAQ,EAAAN,EAGAF,EAAAS,EAAA,SAAAL,EAAAM,EAAAC,GACAX,EAAAY,EAAAR,EAAAM,IACAhB,OAAAmB,eAAAT,EAAAM,GACAI,cAAA,EACAC,YAAA,EACAC,IAAAL,KAMAX,EAAAiB,EAAA,SAAAZ,GACA,IAAAM,EAAAN,KAAAa,WACA,WAA2B,OAAAb,EAAA,SAC3B,WAAiC,OAAAA,GAEjC,OADAL,EAAAS,EAAAE,EAAA,IAAAA,GACAA,GAIAX,EAAAY,EAAA,SAAAO,EAAAC,GAAsD,OAAA1B,OAAAC,UAAAC,eAAAC,KAAAsB,EAAAC,IAGtDpB,EAAAqB,EAAA,IAGArB,EAAAsB,GAAA,SAAAC,GAA8D,MAApBC,QAAAC,MAAAF,GAAoBA","file":"static/js/manifest.2ae2e69a05c33dfc65f8.js","sourcesContent":[" \t// install a JSONP callback for chunk loading\n \tvar parentJsonpFunction = window[\"webpackJsonp\"];\n \twindow[\"webpackJsonp\"] = function webpackJsonpCallback(chunkIds, moreModules, executeModules) {\n \t\t// add \"moreModules\" to the modules object,\n \t\t// then flag all \"chunkIds\" as loaded and fire callback\n \t\tvar moduleId, chunkId, i = 0, resolves = [], result;\n \t\tfor(;i < chunkIds.length; i++) {\n \t\t\tchunkId = chunkIds[i];\n \t\t\tif(installedChunks[chunkId]) {\n \t\t\t\tresolves.push(installedChunks[chunkId][0]);\n \t\t\t}\n \t\t\tinstalledChunks[chunkId] = 0;\n \t\t}\n \t\tfor(moduleId in moreModules) {\n \t\t\tif(Object.prototype.hasOwnProperty.call(moreModules, moduleId)) {\n \t\t\t\tmodules[moduleId] = moreModules[moduleId];\n \t\t\t}\n \t\t}\n \t\tif(parentJsonpFunction) parentJsonpFunction(chunkIds, moreModules, executeModules);\n \t\twhile(resolves.length) {\n \t\t\tresolves.shift()();\n \t\t}\n \t\tif(executeModules) {\n \t\t\tfor(i=0; i < executeModules.length; i++) {\n \t\t\t\tresult = __webpack_require__(__webpack_require__.s = executeModules[i]);\n \t\t\t}\n \t\t}\n \t\treturn result;\n \t};\n\n \t// The module cache\n \tvar installedModules = {};\n\n \t// objects to store loaded and loading chunks\n \tvar installedChunks = {\n \t\t2: 0\n \t};\n\n \t// The require function\n \tfunction __webpack_require__(moduleId) {\n\n \t\t// Check if module is in cache\n \t\tif(installedModules[moduleId]) {\n \t\t\treturn installedModules[moduleId].exports;\n \t\t}\n \t\t// Create a new module (and put it into the cache)\n \t\tvar module = installedModules[moduleId] = {\n \t\t\ti: moduleId,\n \t\t\tl: false,\n \t\t\texports: {}\n \t\t};\n\n \t\t// Execute the module function\n \t\tmodules[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n \t\t// Flag the module as 
loaded\n \t\tmodule.l = true;\n\n \t\t// Return the exports of the module\n \t\treturn module.exports;\n \t}\n\n\n \t// expose the modules object (__webpack_modules__)\n \t__webpack_require__.m = modules;\n\n \t// expose the module cache\n \t__webpack_require__.c = installedModules;\n\n \t// define getter function for harmony exports\n \t__webpack_require__.d = function(exports, name, getter) {\n \t\tif(!__webpack_require__.o(exports, name)) {\n \t\t\tObject.defineProperty(exports, name, {\n \t\t\t\tconfigurable: false,\n \t\t\t\tenumerable: true,\n \t\t\t\tget: getter\n \t\t\t});\n \t\t}\n \t};\n\n \t// getDefaultExport function for compatibility with non-harmony modules\n \t__webpack_require__.n = function(module) {\n \t\tvar getter = module && module.__esModule ?\n \t\t\tfunction getDefault() { return module['default']; } :\n \t\t\tfunction getModuleExports() { return module; };\n \t\t__webpack_require__.d(getter, 'a', getter);\n \t\treturn getter;\n \t};\n\n \t// Object.prototype.hasOwnProperty.call\n \t__webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };\n\n \t// __webpack_public_path__\n \t__webpack_require__.p = \"/\";\n\n \t// on error function for async loading\n \t__webpack_require__.oe = function(err) { console.error(err); throw err; };\n\n\n\n// WEBPACK FOOTER //\n// webpack/bootstrap d176f5affa434246605f"],"sourceRoot":""}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -4,4 +4,4 @@ cp ./config/config_prod.json ./config/config.json

npm install
npm run build
http-server /src/dist
# http-server /src/dist
4318
src/frontend_vue/package-lock.json
generated
File diff suppressed because it is too large
@@ -38,7 +38,7 @@
    "friendly-errors-webpack-plugin": "^1.6.1",
    "html-webpack-plugin": "^2.30.1",
    "node-notifier": "^5.4.0",
    "optimize-css-assets-webpack-plugin": "^5.0.1",
    "optimize-css-assets-webpack-plugin": "^3.2.0",
    "ora": "^1.2.0",
    "portfinder": "^1.0.20",
    "postcss-import": "^11.0.0",
@@ -6,7 +6,12 @@
    <div class="col-sm-7">
      <div class="row">
        <div class="col-sm-12">
          št. povedi: {{ frameData.sentences.length }}
          <span v-if="frameData.sentences.length < frameData.sentence_count">
            št. povedi: {{ frameData.sentence_count }} (prikazanih {{ frameData.sentences.length }})
          </span>
          <span v-else>
            št. povedi: {{ frameData.sentences.length }}
          </span>
        </div>
      </div>

@@ -9,7 +9,7 @@
      <LFunctors v-else></LFunctors>
    </div>
    <div class="col-sm-10">
      <p v-if="this.$root.store.api_error != null">
      <p class="text-danger" v-if="this.$root.store.api_error != null">
        {{ this.$root.store.api_error }}
      </p>
      <router-view></router-view>
@@ -1,6 +1,11 @@
<template>

  <!--in case of error-->
  <div v-if="this.$root.store.api_error != null">
  </div>

  <!--load mode-->
  <div v-if="state === 'loading'">
  <div v-else-if="state === 'loading'">
    <pulse-loader :color="loader_color"></pulse-loader>
  </div>

@@ -1,9 +1,11 @@
<template>
  <nav>
    <b-navbar toggleable="md" type="light" variant="light">
    <b-navbar id="nav-red-bg" toggleable="md" type="light" variant="light">
      <b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
      <!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
      <b-navbar-brand>VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV</b-navbar-brand>
      <b-navbar-brand class=cursorpointer v-on:click="goHome">
        VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV
      </b-navbar-brand>
      <b-collapse is-nav id="nav_collapse">

        <b-navbar-nav>
@@ -60,7 +62,7 @@ export default {
  name: "Nav",
  props: ["appState"],
  data() {return {
    optCorpora: ["kres", "ssj"],
    optCorpora: ["kres", "ssj", "gigafida"],
    optIndexes: [
      {key: "besede", val: "words"},
      {key: "udeleženske vloge", val: "functors"},
@@ -101,13 +103,16 @@ export default {
      this.$router.push({
        name: "Home"
      })
    },
    goHome() {
      this.$router.replace({path: "/home"})
    }
  }
}
</script>

<style>
.bg-light {
#nav-red-bg {
  background-color: rgb(183,21,17,0.9) !important;
}
nav a {
@@ -116,4 +121,7 @@ nav a {
nav a:hover {
  color: white;
}
.cursorpointer {
  cursor: pointer;
}
</style>
0
src/pkg/__init__.py
Normal file
Submodule src/pkg/cjvt-corpusparser updated: 01adf47b9b...92b3ac4ea3
1
src/pkg/luscenje_struktur
Submodule
Submodule src/pkg/luscenje_struktur added at 8c87d07b8a
9
src/pkg/seqparser/Makefile
Normal file
@@ -0,0 +1,9 @@
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
SSKJ_JSON = "./sskj_senses.json"
WORDLIST = "./wordlist.json"

gen_json_files:
	cd seqparser; python3 main.py \
		--sskj-html=$(SSKJ_HTML) \
		--sskj-json=$(SSKJ_JSON) \
		--wordlist=$(WORDLIST)
1
src/pkg/seqparser/requirements.txt
Normal file
@@ -0,0 +1 @@
bs4
313
src/pkg/seqparser/seqparser/Seqparser.py
Normal file
@@ -0,0 +1,313 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC
from pathlib import Path

# Match sense ordinals (1., 2., ...)
rord = re.compile(r"^ *[0-9]+\. *$")

# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)

def d_time(fun):
    def wrapper(*args, **kwargs):
        tstart = time()
        fun(*args, **kwargs)
        duration = time() - tstart
        print("Function {} ran for {:.2f} s.".format(
            fun.__name__, duration))
    return wrapper

class Seqparser:
    def __init__(self, sskj_file=None):
        pass

    @d_time
    def html_to_verb_adj_json(self, infile, outfile):
        out_dict = defaultdict(list)
        with Path(infile).open("rb") as fp:
            for line in fp:
                data = self.parse_line(line)
                if data is None: continue
                out_dict[data["izt_clean"]].append(data)
        with Path(outfile).open("w") as fp:
            json.dump(dict(out_dict), fp)

    @d_time
    def generate_sskj_wordlist(self, in_json_file, out_wordlist):
        wordlist = None
        with Path(in_json_file).open("r") as fp:
            jdata = json.load(fp)
            wordlist = list(jdata.keys())
        with Path(out_wordlist).open("w") as fp:
            json.dump({"wordlist": wordlist}, fp)

    # main functions
    def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
        entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
        print("entries len: " + str(len(entries)))
        with open(raw_pickle_filepath, "wb") as f:
            tmpstr = json.dumps(dict(entries))
            pickle.dump(tmpstr, f)
        # debugging

    def raw_pickle_to_parsed_pickle(
        self, raw_pickle_filepath, parsed_pickle_filepath,
        se_list_filepath
    ):
        data = self.load_raw_pickle(raw_pickle_filepath)
        print("raw_pickle data len: " + str(len(data)))
        se_list = self.gen_se_list(data)
        print("se_list len: " + str(len(se_list)))
        with open(se_list_filepath, "wb") as f:
            pickle.dump(se_list, f)
        data1 = self.remove_se(data)
        data2 = self.reorganize(data1, se_list)
        print("data2 len: " + str(len(data2.keys())))
        with open(parsed_pickle_filepath, "wb") as f:
            pickle.dump(data2, f)

    # helper html reading functions
    def parse_file(self, path, f_parse_line):
        tstart = time()
        entries = defaultdict(list)
        with open(path, "r") as f:
            for line in f:
                data = f_parse_line(line)
                if data is not None:
                    entries[data["izt_clean"]].append(data)
        print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
        return entries

    def parse_line(self, line):
        def helper_bv_set(g_or_p):
            if g_or_p not in ["G", "P"]:
                print("Err g_or_p.")
                exit(1)
            if data.get("bv") is not None:
                if data["bv"] != g_or_p:
                    print(str(line))
                    # exit(1)
            data["bv"] = g_or_p
        data = {
            "izt": "",
            "izt_clean": "",
            "senses": defaultdict(list)
        }
        soup = BS(line, "html.parser")

        current_sense_id = "0"
        for span in soup.find_all("span"):

            # sense id
            if span.string is not None:
                rmatch = rord.match(span.string)
                if rmatch is not None:
                    current_sense_id = rmatch.group().strip()

            title = span.attrs.get("title")
            if title is not None:
                title = title.lower()

                # only verbs and adjectives
                if "glagol" in title:
                    helper_bv_set("G")
                    data["bv_full"] = title
                elif "pridevn" in title:
                    helper_bv_set("P")
                    data["bv_full"] = title

                # žšč
                if title == "iztočnica":
                    data["izt"] = span.string
                    data["izt_clean"] = span.string.translate(transtab).lower()

                # sense description
                if title == "razlaga" and span.string is not None:
                    data["senses"][current_sense_id].append(
                        ("razl", span.string))
                    if "pridevnik od" in span.string:
                        helper_bv_set("P")

                if title == "sopomenka":
                    subspan = span.find_all("a")[0]
                    if subspan.string is not None:
                        data["senses"][current_sense_id].append(
                            ("sopo", subspan.string))

        # save verbs and adjectives
        if (
            ("bv" not in data) or
            (data["bv"] != "P" and data["bv"] != "G")
        ):
            return None

        # sanity check
        if data["bv"] == "P" and " se" in data["izt_clean"]:
            print(data)
            exit(1)

        # append _ to adjective keywords
        if data["bv"] == "P":
            data["izt_clean"] = data["izt_clean"] + "_"

        # cleanup
        if "bv" not in data:
            print("Should not be here (no bv).")
            exit(1)
        del(data["bv"])
        if "bv_full" in data:
            del(data["bv_full"])

        return data

    # helper functions
    def load_raw_pickle(self, raw_pickle_filepath):
        with open(raw_pickle_filepath, "rb") as f:
            tmpstr = pickle.load(f)
            return json.loads(tmpstr)

    def helper_loop(self, data, fnc):
        for k, lst in data.items():
            for el in lst:
                fnc(el)

    def gen_se_list(self, data):

        def fnc1(el):
            ic = el["izt_clean"]
            if " se" in ic:
                se_list.append(ic)

        def fnc2(el):
            ic = el["izt_clean"]
            if ic in se_pruned:
                se_pruned.remove(ic)

        # hw entries that only exist with " se"
        se_list = []
        self.helper_loop(data, fnc1)
        se_pruned = set([hw.split(" se")[0] for hw in se_list])
        self.helper_loop(data, fnc2)
        return sorted(list(se_pruned))

    def remove_se(self, data):

        def fnc1(el):
            nel = DC(el)
            ic = nel["izt_clean"]
            if " se" in ic:
                nic = ic.split(" se")[0]
                nel["izt_clean"] = nic
            data_new[nel["izt_clean"]].append(nel)

        data_new = defaultdict(list)
        self.helper_loop(data, fnc1)
        return dict(data_new)

    def reorganize(self, data, se_list):
        # some hw entries have several headwords,
        # some senses have subsenses
        # index everything, make 1 object per hw

        def helper_prune(sense_str):
            # remove space padding
            sense_str = sense_str.strip()

            if len(sense_str) == 1:
                return sense_str

            # remove banned characters from string ending
            banned = ": ; . , - ! ?".split(" ")
            if sense_str[-1] in banned:
                return sense_str[:-1]

            return sense_str

        data_new = {}
        for k, lst in data.items():
            new_el = {
                "hw": k,
                "has_se": k in se_list,
                "senses": []
            }

            # if there is a single hw entry, hw_id is 0
            if len(lst) == 1:
                homonym_id = -1
            else:
                homonym_id = 0

            # loop homonyms
            for el in lst:
                homonym_id += 1
                # loop top lvl sense ids
                for sense_id, sens_lst in el["senses"].items():
                    # loop subsenses
                    for i, sens in enumerate(sens_lst):
                        nsid = sense_id.split(".")[0]
                        if len(sens_lst) == 1:
                            nsid += "-0"
                        else:
                            nsid += ("-" + str(i + 1))
                        new_sense = {
                            "homonym_id": homonym_id,
                            # sense_id: sense_id-subsense_id
                            "sense_id": nsid,
                            "sense_type": sens[0],
                            "sense_desc": helper_prune(sens[1]),
                        }
                        new_el["senses"].append(new_sense)
            hw = new_el["hw"]
            if hw in data_new:
                print("Shouldn't be here.")
                print(new_el)
                exit(1)
            data_new[hw] = DC(new_el)
        # return data_new

        # check
        for hw, el in data_new.items():
            for sens in el["senses"]:
                if sens["sense_desc"] is None:
                    print(sens)

        return data_new


def plst(lst):
    for el in lst:
        print(el)


if __name__ == "__main__":
    datapath = "../../../data"
    html_filepath = datapath + "/sskj/sskj2_v1.html"
    raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
    parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
    se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"

    p = Seqparser()

    if True:
        print("html_to_raw_pickle({}, {})".format(
            html_filepath, raw_pickle_filepath))
        print("Big file, this might take a while (2 min).")
        tstart = time()
        p.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))

    if False:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
        tstart = time()
        p.raw_pickle_to_parsed_pickle(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))
    print("Done.")
0
src/pkg/seqparser/seqparser/__init__.py
Normal file
68
src/pkg/seqparser/seqparser/main.py
Normal file
@@ -0,0 +1,68 @@
from Seqparser import Seqparser
import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient

SSKJ_USER = "sskj2"

if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    if args.operation == "senses_to_db":
        db_entries = []
        tmp_dt = datetime.datetime.utcnow()
        with Path(args.sskj_json).open("r") as fp:
            jdata = json.load(fp)
            # print(jdata[list(jdata.keys())[201]])
            for hw, entry in jdata.items():
                for key, sense in entry[0]["senses"].items():
                    desc = sense[0][1]
                    if sense[0][0] == "razl":
                        desc = desc[:-1]  # for some reason, descriptions contain a ':'
                    else:
                        desc = sense[0][0] + ": " + desc
                    tmp_entry = {
                        "desc": desc,
                        "hw": hw,
                        "author": SSKJ_USER
                    }
                    tmp_entry["sense_id"] = "{}-{}".format(
                        SSKJ_USER,
                        hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
                    )
                    tmp_entry["date"] = tmp_dt
                    db_entries.append(tmp_entry)
        print(len(db_entries))

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)
11
src/pkg/seqparser/setup.py
Normal file
@@ -0,0 +1,11 @@
from setuptools import setup

setup(
    name='seqparser',
    version='0.0.1',
    description='Parser for sskj2 html dump.',
    author='Kristjan Voje',
    author_email='kristjan.voje@gmail.com',
    license='MIT',
    packages=['seqparser'],
)
@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma

log = logging.getLogger(__name__)

def frames_from_db_entry_headword(dbent, headword):
    def _full_tid(tid):
        return ".".join([dbent["sid"], str(tid)])

    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}

    frames = []
    if "srl_links" not in dbent:
        return []
    srldict = {}
    for srl in dbent["srl_links"]:
        key = str(srl["from"])
        if enriched_lemma(token_dict[key]) != headword:
            continue
        if key not in srldict:
            srldict[key] = [srl]
        else:
            srldict[key] += [srl]
    for hwtid, srlarr in srldict.items():
        frames += [Frame(
            hw_lemma=enriched_lemma(token_dict[hwtid]),
            tids=[_full_tid(hwtid)],
            slots=[
                Slot(
                    functor=srl["afun"],
                    tids=[_full_tid(srl["to"])]
                ) for srl in srlarr
            ],
            # sentences=[(dbent["sid"], dbent["tokens"])],
            sentences=[
                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
            ]
        )]
    return frames

def frames_from_db_entry(dbent):
    def _full_tid(tid):
        return ".".join([dbent["sid"], str(tid)])
@@ -37,7 +72,8 @@ def frames_from_db_entry(dbent):
    return frames

class Frame():
    def __init__(self, tids, deep_links=None, slots=None, hw_lemma=None, sentences=None):
    def __init__(self, tids, deep_links=None, slots=None,
                 hw_lemma=None, sentences=None, sentence_count=None):
        self.hw = hw_lemma
        self.tids = tids  # list of tokens with the same hw_lemma
        # Each tid = "S123.t123";
@@ -50,6 +86,8 @@ class Frame():
        self.sense_info = {}
        self.sentences = sentences
        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }
        self.sentence_count = sentence_count  # paging, optimization


    def get_functors(self):
        return [slot.functor for slot in self.slots]
@@ -62,7 +100,8 @@ class Frame():
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
            "sense_info": self.sense_info,
            "sentence_count": self.sentence_count
        }
        return ret

@@ -1,96 +0,0 @@
import logging

log = logging.getLogger(__name__)


class Frame():
    def __init__(self, tids, deep_links=None, slots=None, hw=None):
        self.hw = hw
        self.tids = tids  # list of tokens with the same hw_lemma
        # Each tid = "S123.t123";
        # you can get sentence with vallex.get_sentence(S123)
        self.slots = []
        if slots is None:
            self.slots = self.init_slots(deep_links)
        else:
            self.slots = slots
        self.sense_info = {}
        self.sentences = None  # Used for passing to view in app.py, get_frames
        self.aggr_sent = None  # Dictionary { hw: self.sentences idx }

    def to_json(self):
        ret = {
            "hw": self.hw,
            "tids": self.tids,
            "slots": [slot.to_json() for slot in self.slots],
            "sentences": self.sentences,
            "aggr_sent": self.aggr_sent,
            "sense_info": self.sense_info
        }
        return ret

    def init_slots(self, deep):
        slots = []
        for link in deep:
            slots.append(Slot(
                functor=link["functor"],
                tids=[link["to"]]
            ))
        return slots

    def sort_slots(self):
        # ACT, PAT, alphabetically
        srt1 = [
            x for x in self.slots
            if (x.functor == "ACT" or
                x.functor == "PAT")
        ]
        srt1 = sorted(srt1, key=lambda x: x.functor)
        srt2 = [
            x for x in self.slots
            if (x.functor != "ACT" and
                x.functor != "PAT")
        ]
        srt2 = sorted(srt2, key=lambda x: x.functor)
        self.slots = (srt1 + srt2)

    def to_string(self):
        ret = "Frame:\n"
        ret += "sense_info: {}\n".format(str(self.sense_info))
        ret += "tids: ["
        for t in self.tids:
            ret += (str(t) + ", ")
        ret += "]\n"
        if self.slots is not None:
            ret += "slots:\n"
            for sl in self.slots:
                ret += (sl.to_string() + "\n")
        return ret


class Slot():
    # Each slot is identified by its functor (ACT, PAT, ...)
    # It consists of different tokens.
    def __init__(self, functor, tids=None, count=None):
        self.functor = functor
        self.tids = tids or []  # combining multiple sentences vertically
        self.count = count or 1

    def to_string(self):
        ret = "---- Slot:\n"
        ret += "functor: {}\n".format(self.functor)
        ret += "tids: ["
        for t in self.tids:
            ret += (str(t) + ", ")
        ret += "]\n"
        ret += "]\n"
        ret += "----\n"
        return ret

    def to_json(self):
        ret = {
            "functor": self.functor,
            "tids": self.tids,
            "count": self.count
        }
        return ret
@@ -9,6 +9,7 @@ import logging
log = logging.getLogger(__name__)

SENSE_UNDEFINED = "nedefinirano"
SENTENCE_LIMIT = 10

## TIDI: use frame.py
## TODO: build a list of [Frame] with lists of [Slot]
@@ -70,7 +71,10 @@ def reduce_1(frames, valdb_sensemap=None):
        for functor in fs[0]:
            slots[functor] = Slot(functor=functor)
        # Reduce slots from all frames. (Merge ACT from all frames, ...)
        sentence_count = len(fs[1])
        for frame in fs[1]:
            if len(tids) >= SENTENCE_LIMIT:
                break
            tids += frame.tids
            sentences += frame.sentences
            for sl in frame.slots:
@@ -78,8 +82,13 @@ def reduce_1(frames, valdb_sensemap=None):
        slots_list = []
        for k, e in slots.items():
            slots_list.append(e)
        # TODO does appending hw_lemma of first frame work for functor frames too?
        rf = Frame(hw_lemma=fs[1][0].hw, tids=tids, slots=slots_list, sentences=sentences)
        rf = Frame(
            hw_lemma=fs[1][0].hw,
            tids=tids,
            slots=slots_list,
            sentences=sentences,
            sentence_count=sentence_count
        )
        rf.sort_slots()
        ret_frames.append(rf)
    return sorted_by_len_tids(ret_frames)
@@ -182,7 +191,11 @@ def frames_from_sense_ids(raw_frames, id_map):
        tids = []
        reduced_slots = []
        sentences = []

        sentence_count = len(frames)
        for frame in frames:
            if len(tids) >= SENTENCE_LIMIT:
                break
            tids += frame.tids
            sentences += frame.sentences
            for slot in frame.slots:
@@ -204,7 +217,8 @@ def frames_from_sense_ids(raw_frames, id_map):
            hw_lemma="derp",
            tids=tids,
            slots=reduced_slots,
            sentences=sentences
            sentences=sentences,
            sentence_count=sentence_count,
        )
        id_map_entry = (
            id_map.get(tids[0]) or