29 Commits

Author SHA1 Message Date
ec083a8d63 Added modifications to validate xsd 2020-12-09 07:47:55 +01:00
69c3521e4b Added gigafida fix for multiple senses 2020-12-08 15:33:20 +01:00
75b015dcda A couple of fixes on write_xml in create_xml.py + Created form_csv.py script 2020-12-08 08:01:17 +01:00
c18aaff11f Merge branch 'scripts' of https://gitea.cjvt.si/lkrsnik/cjvt-valency into scripts 2020-11-23 11:25:01 +01:00
34b776be11 Small README fixes 2020-11-23 11:24:50 +01:00
26bca0b083 Makefile changes and added options 2020-11-23 11:20:47 +01:00
2551a9c6a8 Fixing loop in jos srl 2020-09-30 11:40:55 +02:00
5cdc963c2d Added warnings to skip sentences that do not match 2020-09-30 09:51:41 +02:00
ce1fb46b4e Adding processing improvements. 2020-09-23 13:02:31 +02:00
220529b777 Parameterized mongo_db + Added internal state saving for p1 2020-09-22 19:31:31 +02:00
ae5f2869bc Deleted unnecessary argument 2020-09-18 12:37:52 +02:00
931b3531b3 Added some progress bars + erased beginning skipping. 2020-09-18 10:21:05 +02:00
3d91251905 First commit on scripts branch 2020-09-15 14:08:16 +02:00
c803057164 Added gigafida to file creation 2019-09-06 10:27:03 +02:00
b4db4e5255 docker stack deployment 2019-05-06 23:13:10 +02:00
ef02583d72 stack config 2019-05-06 22:10:33 +02:00
155c0b2c3d docker stack commands 2019-05-05 15:50:48 +02:00
c96b199932 css fix, preparing prod 2019-05-04 18:12:05 +02:00
02c0e74798 npm package.json with a working build 2019-05-04 16:49:47 +02:00
2ff339e24c prepared app for production (removed global variables, added flask-pymongo as db driver) 2019-05-04 01:28:46 +02:00
707034153c closing db cursor 2019-05-01 18:38:35 +02:00
ba72802f1f added QUERY_LIMIT to app.py 2019-05-01 10:30:16 +02:00
14a6e2423b logging when indexing 2019-04-28 23:24:40 +02:00
c5fc78dca1 appindex to file instead of DB 2019-04-28 22:44:54 +02:00
439446b1d0 Merge branch 'master' of gitea.cjvt.si:kristjan/cjvt-valency 2019-04-28 21:59:22 +02:00
eca236bc7e README fix 2019-04-28 21:59:20 +02:00
11d3dfc0e6 some changes 2019-04-28 21:46:17 +02:00
1aff111cb9 frontend showing m out of n sentences 2019-04-28 20:02:54 +02:00
910955abb8 backend: added sentence_count to limit displayed sentences in frontend 2019-04-28 19:02:39 +02:00
62 changed files with 4961 additions and 2510 deletions

13
.gitignore vendored
View File

@@ -1,10 +1,23 @@
data/samples/
data/wordlist.json
data/sskj_senses.json
data/appindex.json
*egg-info/
*.pyc
src/frontend_vue/node_modules/
src/frontend_vue/dist/
dockerfiles/database/create.js
dockerfiles/database/create_mongo.js
dockerfiles/database/create_postgres.js
dockerfiles/database/mongo_db.gz
dockerfiles/database/postgres_db.tar
dockerfiles/database/postgres_db_OLD.tar
*__pycache__/
env.local
logs/*
.idea/
venv*
data/
data
deploy_instructions/
run.sh

3
.gitmodules vendored
View File

@@ -1,3 +1,6 @@
[submodule "src/pkg/cjvt-corpusparser"]
path = src/pkg/cjvt-corpusparser
url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
[submodule "src/pkg/luscenje_struktur"]
path = src/pkg/luscenje_struktur
url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git

16
Dockerfile-backend-flask Normal file
View File

@@ -0,0 +1,16 @@
FROM cjvt-python-env
RUN mkdir -p /project/src/backend_flask
RUN mkdir -p /project/src/pkg
RUN mkdir -p /project/data
COPY src/backend_flask /project/src/backend_flask
COPY src/pkg /project/src/pkg
COPY data/appindex.json /project/data
COPY src/backend_flask/entrypoint.sh /.
COPY src/backend_flask/conf_files/prod_conf.yaml /project
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]

View File

@@ -13,24 +13,28 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
# This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
# for pre-generation the index of all headwords and functors
APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
OUTPUT = "db"
# OUTPUT = "file"
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
OUTDIR = "/project/data" # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017" # don't use localhost
# credentials from .gitignored file
# create it from env.default
include env.local
N_CORES = 3
N_CORES = 4
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30
@@ -53,6 +57,12 @@ database-service:
database-users:
cd dockerfiles/database; $(MAKE) create_users
database-restore:
cd dockerfiles/database; $(MAKE) restore_db
database-restore-postgres:
cd dockerfiles/database; $(MAKE) restore_postgres_db
# also useful, if we want to restart the db
database-clean:
cd dockerfiles/database; $(MAKE) clean_stack
@@ -66,6 +76,7 @@ python-env-install:
pip3 install -e src/pkg/cjvt-corpusparser/.
pip3 install -e src/pkg/valency/.
pip3 install -e src/pkg/seqparser/.
pip3 install -e src/pkg/luscenje_struktur/.
# from inside python-env container:
data/samples:
@@ -90,7 +101,14 @@ fill-database-kres: data/samples
--chunk-size $(KRES_CHUNK_SIZE) \
--cores $(N_CORES)
fill-database-gigafida: data/samples
python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
--corpus="gigafida" \
--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
--chunk-size $(KRES_CHUNK_SIZE) \
--cores $(N_CORES)
## Frontend
@@ -102,6 +120,9 @@ frontend-dev:
frontend-prod:
cd src/frontend_vue/; $(MAKE) prod
build-frontend-prod:
cd src/frontend_vue/; $(MAKE) build-prod
## Backend
@@ -112,17 +133,23 @@ backend-prepare-db:
--config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--sskj-wordlist $(SSKJ_WORDLIST) \
--appindex-json $(APPINDEX_PATH) \
--prepare-db
backend-dev:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--appindex-json $(APPINDEX_PATH)
backend-prod:
backend-prod-old:
cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/prod_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--appindex-json $(APPINDEX_PATH)
build-backend-flask:
cd ./src/backend_flask; $(MAKE) build
## add sskj senses to db (generated with pkg/seqparser)
sskj-senses:
@@ -132,3 +159,7 @@ sskj-senses:
--dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) \
--dbpass $(DB_USR_PASS)
deploy-prod-stack:
- docker network create val-backend
docker stack deploy -c production.yaml val

127
README.md
View File

@@ -52,8 +52,6 @@ $ make fill-database-ssj
$ make fill-database-kres
# You can detach from the running process using Ctrl-p + Ctrl-q
# this is a long operation
# if running on a remote server, use nohup:
$ nohup make fill-database > fill-database.log &
@@ -74,6 +72,10 @@ $ make python-env-install
# needs to be run once to modify a new database
$ make backend-prepare-db
# if you have the file prepared (sskj_senses.json), you can
# fill the database with some senses
$ make sskj-senses
# with debugger
$ make backend-dev
@@ -100,3 +102,124 @@ $ make frontend-prod
```
App available on: `http://0.0.0.0:8080`.
## Production deployment
Prerequisite: a machine with free ports 80 and 8084.
### Database
Either build the database from scratch (a lengthy process) using the above instructions, or migrate the database from the faculty server (recommended).
Build the my-mongo container:
```bash
# run once and destroy containers
$ make database-service
```
### Backend
Set database connection details in `/src/backend_flask/db_config.py`.
Change `valuser` and `valuserpass` to the database user's credentials.
```bash
mongodb://valuser:valuserpass@my_mongo/valdb
```
In the above line, replace `valuser` with the username and `valuserpass` with the password that was used to create the database tables (the values were set in the root Makefile).
You can also set the number of workers in `/src/backend_flask/entrypoint.sh`.
In the line `gunicorn -t 4 -b 127.0.0.1:8084 app:app`, edit the `-t` parameter.
A rule of thumb is twice the number of available CPU cores.
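For reference, a minimal sketch of what `db_config.py` could contain (placeholder credentials, following the sample `db_config.py` shipped with the backend):
```python
# db_config.py -- loaded by Flask via app.config.from_object("db_config")
# Placeholder credentials; substitute the user/password created in the root Makefile.
MONGO_URI = "mongodb://valuser:valuserpass@my_mongo/valdb"
MONGO_AUTH_SOURCE = "admin"
```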
Build the backend container:
```bash
# From git root
$ make build-backend-flask
```
### Frontend
Set the server address (where the backend will be running) in `src/frontend_vue/config/config_prod.json`.
Build the `/dist` folder that contains the static app (we will be using Nginx to serve it).
```bash
# From git root
$ make build-frontend-prod
```
All set, now run the stack.
Stack configuration in `production.yaml`.
```bash
# From git root
$ make deploy-prod-stack
```
## Uploading a mongo dump
There's a 15 GB mongo dump containing the fully processed kres and ssj data.
We can use that file to deploy our application.
With this database, we will need a minimum of 8 GB of RAM to serve the app.
If the server is struggling, the frontend will throw "Network error" messages.
Check `0.0.0.0:8081` and remove (or back up) the current example database `valdb`.
Run the stack with the mongo port mapped (uncomment these lines in `production.yaml`):
```yml
ports:
- 27017:27017
```
Run a separate my-mongo container with the mounted data:
```bash
$ docker run -it --net host -v <local_dump_path>:/dumps my-mongo /bin/bash
```
Inside the container (edit the username and password):
```bash
$ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0.0:27017
```
After uploading, restart the stack with the `27017` port mapping commented out again.
## Script running
### Environment setup
```bash
pip install -r requirements.txt
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
```
### Running on already setup environment
```bash
make database-service
```
### Setting up environment for running on ramdisk
```bash
# create ramdisk
sudo mount -t tmpfs tmpfs /mnt/tmp
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
# change volumes to /mnt/tmp:/data/db
vim dockerfiles/database/valency-stack.yml
# change Makefile -runStack to mkdir -p /mnt/tmp
vim dockerfiles/database/Makefile
# run service
make database-service
# run ONLY ONCE to create users and restore database
make database-users
make database-restore
# double check if it worked
docker exec -it ef0a /bin/bash
# the following steps run inside the container's bash:
# check if it worked:
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
db.getRoles()
```

View File

@@ -1 +0,0 @@
/home/kristjan/workdir/final_json/

View File

@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/

Binary file not shown.

View File

@@ -1 +0,0 @@
/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
FROM mongo:latest
FROM mongo:4.2.9
WORKDIR /
COPY init_inside_container.sh /.
COPY create.js /.
COPY init_inside_mongo_container.sh /.
COPY create_mongo.js /.

View File

@@ -2,33 +2,62 @@
# collection names: lower case, plural
# user names?
# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"
STACKNAME = dbstack
.PHONY: start_db FORCE
all: build_run create_users
build_run: build_mongo run_stack
build_run: build_mongo run_docker_compose
create.js: FORCE
postgres_create_roles:
echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
echo "create database superdb_small;" >> create_postgres.js
echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
echo "EOSQL" >> create_postgres.js
chmod +x create_postgres.js
FORCE:
echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js
echo 'use valdb' >> create.js
echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js
mongo_create_roles:
echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
echo 'use valdb' >> create_mongo.js
echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js
build_mongo: create.js
build_mongo: mongo_create_roles
docker build . -t my-mongo --no-cache
clean_stack:
docker stack rm $(STACKNAME)
# build_postgres: postgres_create_roles
# docker build . -t my-mongo --no-cache
run_stack:
mkdir -p ${HOME}/mongo_container/data/
docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
run_docker_compose:
mkdir -p ${HOME}/valency_data/mongo_container/data/
#docker kill $(shell ./get_mongo_container_name.sh)
#docker kill $(shell ./get_postgres_container_name.sh)
#docker-compose stop
docker-compose -f valency-stack.yml up -d --force-recreate
# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
create_users: create.js
docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
create_users: create_mongo_users create_postgres_users
create_mongo_users: mongo_create_roles
docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
# rm create.js
create_postgres_users: postgres_create_roles
docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh
restore_db: restore_mongo_db restore_postgres_db
restore_mongo_db:
ifeq (,$(wildcard ./mongo_db.gz))
$(error "mongo_db.gz does not exists. Make sure to have dump of mongo db in 'dockerfiles/database/mongo_db.gz'")
else
docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
endif
restore_postgres_db:
ifeq (,$(wildcard ./postgres_db.tar))
$(error "postgres_db.tar does not exists. Make sure to have dump of postgres db in 'dockerfiles/database/postgres_db.tar'")
else
docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
endif

View File

@@ -0,0 +1,2 @@
#!/bin/bash
docker ps | grep postgres | awk '{print $1}'

View File

@@ -1,3 +0,0 @@
#!/bin/bash
mongo admin < /create.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
mongo admin < /create_mongo.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
/scripts/create_postgres.js

View File

@@ -1,26 +0,0 @@
version: '3.1'
services:
my-mongo:
image: my-mongo
restart: always
ports:
- 27017:27017
environment:
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/mongo_container/data/:/data/db
mongo-express:
image: mongo-express
restart: always
ports:
- 8087:8081
environment:
ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
ME_CONFIG_MONGODB_SERVER: my-mongo

View File

@@ -0,0 +1,27 @@
version: '3.1'
services:
my_mongo:
image: my-mongo
restart: always
ports:
- 127.0.0.1:27017:27017
environment:
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/valency_data/mongo_container/data/:/data/db
- ./:/scripts
my_postgres:
image: postgres
restart: always
ports:
- 127.0.0.1:5432:5432
environment:
POSTGRES_USER: ${DB_ADM_USER}
POSTGRES_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
- ./:/scripts

View File

@@ -1,26 +1,37 @@
FROM ubuntu:16.04
FROM ubuntu:18.04
RUN apt-get update --fix-missing
RUN apt-get install -y \
vim \
python3 \
python3-pip \
sshfs
sshfs \
curl \
locales
RUN pip3 install --upgrade pip
RUN pip3 install \
lxml \
pandas \
sklearn \
argparse \
pyyaml \
pathlib \
flask \
flask_cors \
pymongo \
flask
flask-pymongo \
gunicorn \
SQLAlchemy \
tqdm \
psycopg2-binary
RUN apt-get install -y \
curl
# Set the locale
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
locale-gen
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV PYTHONIOENCODING UTF-8
RUN pip3 install \
pyyaml \
flask_cors

View File

@@ -1,4 +1,4 @@
IMAGE_NAME="cjvt-python-env"
IMAGE_NAME="cjvt-python-env" # don't change, used in backend_flask/Makefile
CNNAME="python-env"
all: build run

View File

@@ -0,0 +1,5 @@
#!/bin/bash
echo "testing entrypoint."
$(exit 1)
exit 0

29
nginx.conf Normal file
View File

@@ -0,0 +1,29 @@
# frontend
server {
listen 80;
server_name _;
location / {
root /srv/dist;
index index.html index.htm;
}
location /home {
return 301 /;
}
}
# backend
server {
listen 8084;
server_name _;
location / {
proxy_set_header X-Forward-For $proxy_add_x_forwarded_for;
proxy_set_header Host $http_host;
proxy_pass http://backend_flask:8084;
}
}
# https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
# https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*

43
production.yaml Normal file
View File

@@ -0,0 +1,43 @@
version: '3.1'
services:
my_mongo:
image: my-mongo
restart: always
# ports:
# - 27017:27017
expose:
- 27017
environment:
MONGO_INITDB_ROOT_USERNAME: valuser
MONGO_INITDB_ROOT_PASSWORD: valuserpass
volumes:
- ${HOME}/mongo_container/data/:/data/db
mongo_express:
image: mongo-express
restart: always
ports:
- 8081:8081
environment:
ME_CONFIG_BASICAUTH_USERNAME: test
ME_CONFIG_BASICAUTH_PASSWORD: test
ME_CONFIG_MONGODB_ADMINUSERNAME: valadmin
ME_CONFIG_MONGODB_ADMINPASSWORD: rolercoaster
ME_CONFIG_MONGODB_SERVER: my_mongo
backend_flask:
image: backend-flask
expose:
- 8084
proxy:
image: nginx
ports:
- 80:80
- 8084:8084
volumes:
- ./nginx.conf:/etc/nginx/conf.d/default.conf
- ./src/frontend_vue/dist:/srv/dist

37
requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
asn1crypto==0.24.0
beautifulsoup4==4.8.0
bs4==0.0.1
cffi==1.12.3
Click==7.0
cryptography==2.1.4
Flask==1.1.1
Flask-Cors==3.0.8
Flask-PyMongo==2.3.0
gunicorn==19.9.0
idna==2.6
itsdangerous==1.1.0
Jinja2==2.10.1
joblib==0.13.2
keyring==10.6.0
keyrings.alt==3.0
lxml==4.4.0
MarkupSafe==1.1.1
numpy==1.17.0
pandas==0.25.0
pathlib==1.0.1
psycopg2==2.8.4
pycparser==2.19
pycrypto==2.6.1
pymongo==3.8.0
python-dateutil==2.8.0
pytz==2019.2
pyxdg==0.25
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
SecretStorage==2.3.1
six==1.11.0
sklearn==0.0
soupsieve==1.9.3
SQLAlchemy==1.3.12
Werkzeug==0.15.5

1708
scripts/create_xml.py Normal file

File diff suppressed because it is too large

189
scripts/extract_keywords.py Normal file
View File

@@ -0,0 +1,189 @@
import copy
import csv
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile
def read_gigafida(path):
words = {}
with open(path) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
for row in reader:
words[row[0]] = int(row[2])
return words
def read_sloleks(path):
words = set()
with open(path) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
for row in reader:
words.add(row[1])
return words
def read_zele(path):
with open(path) as f:
content = f.readlines()
# fix content
content[0] = content[0][1:]
# a = content[2]
# a = content[2].split()
# a = content[2].split()[0].split('<IZT>')[1]
# a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
# content = [x.split() for x in content]
return set(content)
def read_wordlist(path):
with open(path) as f:
content = [line[:-1] for line in f.readlines()]
print(content[-1])
return set(content)
def filter_gigafida(gigafida_raw, min_limit, max_limit):
return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}
def set_list_intersection(gigafida_filtered, sloleks):
intersection = {}
for word, num in gigafida_filtered.items():
if word in sloleks:
intersection[word] = num
return intersection
def list_list_union(list1, list2):
union = copy.copy(list1)
for w, n in list2.items():
if w not in list1:
union[w] = list2[w]
return union
def list_list_subtraction(list1, list2):
subtraction = {}
for w, n in list2.items():
# if w == 'dejati':
# print('here')
if w not in list1:
subtraction[w] = n
return subtraction
def set_set_subtraction(set1, set2):
subtraction = {}
for w in set2:
if w not in set1:
subtraction[w] = -1
return subtraction
def create_document(list1, path):
with open(path, "w") as text_file:
for w, n in list1.items():
text_file.write("%s\t%d\n" % (w, n))
def create_document_set(list1, path):
with open(path, "w") as text_file:
for w in sorted(list(list1)):
text_file.write("%s\n" % w)
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
return sloleks_zele_subtraction
def main(args):
gigafida_raw = read_gigafida(args.gigafida_verb_list)
sloleks = read_sloleks(args.sloleks)
zele = read_zele(args.zele)
if args.wordlist is not None:
sloleks_wordlist = set()
# sloleks_wordlist = set()
for el in sloleks:
if el in gigafida_raw:
sloleks_wordlist.add(el)
filtered_wordlist = read_wordlist(args.wordlist)
# sloleks_wordlist = set()
for el in sloleks:
if el in gigafida_raw:
filtered_wordlist.add(el)
create_document_set(filtered_wordlist, 'wordlist.tsv')
# gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
res = [el[0] for el in nouns_sloleks_gf_intersect]
gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
create_document(gigafida_filtered3, 'gigafida_3+.tsv')
# create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
# gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
# gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
# pass
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Extract keywords from multiple lists.')
parser.add_argument('gigafida_verb_list',
help='Path to gigafida list of verbs in tsv format.')
parser.add_argument('sloleks',
help='Path to Sloleks in tsv format.')
parser.add_argument('--zele',
help='Path to zele valency dictionary.')
parser.add_argument('--wordlist', default=None,
help='Path to filtered wordlist.')
parser.add_argument('--handchecked_words', default=None,
help='Path to handchecked words.')
# parser.add_argument('--min_limit',
# help='Limit min number of ocurrences',
# type=int, default=0)
# parser.add_argument('--max_limit',
# help='Limit max number of ocurrences',
# type=int, default=sys.maxsize)
parser.add_argument('--verbose', help='Enable verbose output to stderr',
choices=["warning", "info", "debug"], default="info",
const="info", nargs='?')
args = parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))

117
scripts/form_csv.py Normal file
View File

@@ -0,0 +1,117 @@
import argparse
import csv
import os
from lxml import etree, objectify, html
def write_general_statistics(path, out_list):
if len(out_list) == 0:
return
with open(path, 'w') as csvfile:
writer = csv.writer(csvfile, delimiter='\t',
quotechar='"')
writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
for line in out_list:
writer.writerow(line)
def write_statistics(path, out_list):
if len(out_list) == 0:
return
with open(path, 'w') as csvfile:
writer = csv.writer(csvfile, delimiter='\t',
quotechar='"')
writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
for line in out_list:
writer.writerow(line)
def main(args):
for file in sorted(os.listdir(args.input)):
path = os.path.join(args.input, file)
tree = etree.parse(path)
gf_output = []
ssj_output = []
head = next(tree.iter('head'))
headword = head.find('headword').find('lemma').text
#for div in root.iterfind('.//div'):
for elem in tree.iter('statisticsContainer'):
# for element in tree.iterfind('statisticsContainer'):
# for element in tree.find('statisticsContainer'):
semRole = elem.find('semanticRole').text
gf_pattern = None
gf_sentence = None
ssj_pattern = None
ssj_sentence = None
measure = elem.find('measureList')
for el in measure:
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_sentence = el.text
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_sentence = el.text
if gf_pattern is not None and gf_sentence is not None:
gf_output.append([semRole, gf_pattern, gf_sentence])
if ssj_pattern is not None and ssj_sentence is not None:
ssj_output.append([semRole, ssj_pattern, ssj_sentence])
print(file)
analyze_output = []
for elem in tree.iter('valencyPattern'):
valency_pattern_id = elem.attrib['id']
# get frequency
measure = ''
for measure_el in elem.find('measureList').findall('measure'):
if measure_el.attrib['source'] == 'Gigafida 2.0':
measure = measure_el.text
# get semantic roles
semantic_roles_list = []
for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
semantic_roles = '_'.join(semantic_roles_list)
# pattern representation
pattern_representation = elem.find('patternRepresentation').text
# corpus example
if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
else:
continue
# ugly postprocessing to remove the xmlns:xsi=... namespace declarations
root = etree.fromstring(corpus_example_text)
# Remove namespace prefixes
for elem in root.getiterator():
elem.tag = etree.QName(elem).localname
# Remove unused namespace declarations
etree.cleanup_namespaces(root)
corpus_example = etree.tostring(root, encoding='unicode')
print(f"Valency pattern {valency_pattern_id}")
analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Form CSV files with statistics from valency inventory XML files.')
arg_parser.add_argument('--input', type=str, help='Input directory')
arg_parser.add_argument('--output', type=str, help='Output directory')
args = arg_parser.parse_args()
main(args)

1
scripts/valency Symbolic link
View File

@@ -0,0 +1 @@
../src/pkg/valency/valency

8
scripts/xsd_checker.py Normal file
View File

@@ -0,0 +1,8 @@
from lxml import etree as lxml
with open('../data/inventory.xsd') as f:
xmlschema_doc = lxml.parse(f)
xmlschema = lxml.XMLSchema(xmlschema_doc)
with open('../data/xmls/output.xml') as op:
doc = lxml.parse(op)
print(xmlschema.validate(doc))

0
src/__init__.py Normal file
View File

View File

@@ -0,0 +1,16 @@
IMG="backend-flask"
CNT="backend_flask"
clean:
- docker rm -f $(CNT)
run: clean build
docker run -d --net host --name $(CNT) $(IMG)
docker logs -f $(CNT)
build: build-cjvt-python-env
# docker build . -f ../../Dockerfile-backend-flask -t $(IMG)
cd ../..; docker build . -f Dockerfile-backend-flask -t $(IMG)
build-cjvt-python-env:
cd ../../dockerfiles/python-env; $(MAKE) build

View File

@@ -26,25 +26,22 @@ from email.mime.text import MIMEText
from copy import deepcopy as DC
from pathlib import Path
from pymongo import MongoClient
from flask_pymongo import PyMongo
import pymongo
import argparse
# some db collections
USERS_COLL = "users"
TOKENS_COLL = "usertokens"
SENSES_COLL = "senses"
SENSEMAP_COLL = "sensemap"
# pre-generated data (gui leftside word index)
CORPORA = ["ssj", "kres"]
app_index = None
sskj_wordlist = None # used by _is_banned(hw)
BANNED_HEADWORDS = ["biti"]
log = logging.getLogger(__name__)
valdb = None
app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
app.config["CORPORA"] = ["gigafida"]
app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000
# when running vuejs via webpack
# CORS(app)
@@ -59,7 +56,7 @@ CORS(app)
@app.route("/api/dev")
def api_dev():
print("DEV")
cur = valdb.kres.find({"headwords": "nagovarjati"})
cur = mongo.db.kres.find({"headwords": "nagovarjati"})
frames = []
for ent in cur:
frames += frames_from_db_entry(ent)
@@ -72,12 +69,12 @@ def api_dev():
@app.route("/api/words/<corpus>")
def api_words(corpus):
return json.dumps({
"sorted_words": app_index[corpus]["words"], # todo - make corpus as arg
"sorted_words": app.config["app_index"][corpus]["words"], # todo - make corpus as arg
})
@app.route("/api/functors/<corpus>")
def api_functors(corpus):
return json.dumps(app_index[corpus]["functors"])
return json.dumps(app.config["app_index"][corpus]["functors"])
# INDEX SELECTION -------------------^
@@ -98,7 +95,7 @@ def api_register():
):
return "ERR"
email_hash = hashlib.sha256(email.encode("utf-8")).hexdigest()
existing = list(valdb[USERS_COLL].find({
existing = list(mongo.db.users.find({
"$or": [{"username": username}, {"email": email_hash}]
}))
if len(existing) > 0:
@@ -109,7 +106,7 @@ def api_register():
password.encode("utf-8")).hexdigest(),
"email": email_hash
}
valdb[USERS_COLL].insert(entry)
mongo.db.users.insert(entry)
return "OK"
@@ -121,7 +118,7 @@ def api_login():
password = data["password"]
hpass = hashlib.sha256(password.encode("utf-8")).hexdigest()
db_user = list(valdb[USERS_COLL].find({
db_user = list(mongo.db.users.find({
"username": username,
"hpass": hpass
}))
@@ -135,7 +132,7 @@ def api_login():
"date": datetime.datetime.utcnow(),
"token": token
}
valdb[TOKENS_COLL].update(
mongo.db.usertokens.update(
{"username": token_entry["username"]},
token_entry,
upsert=True
@@ -178,7 +175,7 @@ def api_new_pass():
username = data["username"]
email = data["email"]
hemail = hashlib.sha256(email.encode("utf-8")).hexdigest()
db_res = list(valdb[USERS_COLL].find({
db_res = list(mongo.db.users.find({
"username": username,
"email": hemail
}))
@@ -190,7 +187,7 @@ def api_new_pass():
string.ascii_letters + string.digits) for i in range(10)])
# update locally
hpass = hashlib.sha256(new_pass.encode("utf-8")).hexdigest()
valdb[USERS_COLL].update(
mongo.db.users.update(
{
"username": username,
"email": hemail
@@ -208,12 +205,12 @@ def token_to_username(token):
key = {
"token": token
}
res = list(valdb[TOKENS_COLL].find(key))
res = list(mongo.db.usertokens.find(key))
if len(res) != 1:
return None
username = res[0]["username"]
# update deletion interval
valdb[TOKENS_COLL].update(
mongo.db.usertokens.update(
key, {"$set": {"date": datetime.datetime.utcnow()}})
return username
@@ -248,22 +245,26 @@ def api_get_frames():
RF = reduce_functions[rf_name]["f"]
corpus = request.args.get("cor")
if corpus not in CORPORA:
if corpus not in app.config["CORPORA"]:
return json.dumps({"error": "cor={kres,ssj}"})
cur = valdb[corpus].find({"headwords": hw})
log.info("Test1")
cur = mongo.db[corpus].find({"headwords": hw})
log.info("Test2")
frames = []
for ent in cur:
for ent in cur[:app.config["QUERY_LIMIT"]]:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
log.info("Test3")
# filter by relevant hw
frames = [x for x in frames if x.hw == hw]
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
ret_frames = RF(frames, mongo.db.sensemap)
log.info("Test3")
json_ret = {"frames": []}
for frame in ret_frames:
json_ret["frames"].append(frame.to_json())
log.info("Test4")
return json.dumps(json_ret)
# return prepare_frames(ret_frames)
@@ -300,19 +301,20 @@ def api_get_functor_frames():
RF = reduce_functions[rf_name]["f"]
corpus = request.args.get("cor")
if corpus not in CORPORA:
if corpus not in app.config["CORPORA"]:
return json.dumps({"error": "cor={kres,ssj}"})
cur = valdb[corpus].find({"functors": functor})
cur = mongo.db[corpus].find({"functors": functor})
frames = []
for ent in cur:
for ent in cur[:app.config["QUERY_LIMIT"]]:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# filter by relevant functor
frames = [x for x in frames if functor in x.get_functors()]
# raw_frames = vallex.functors_index[functor] # TODO
ret_frames = RF(frames, valdb[SENSEMAP_COLL])
ret_frames = RF(frames, mongo.db.sensemap)
ret_frames = _aggregate_by_hw(ret_frames)
json_ret = {"frames": []}
@@ -331,10 +333,10 @@ def api_get_functor_frames():
def api_senses_get():
# returns senses and mapping for hw
hw = request.args.get("hw")
senses = list(valdb[SENSES_COLL].find({
senses = list(mongo.db.senses.find({
"hw": hw
}))
sense_map_query = list(valdb[SENSEMAP_COLL].find({
sense_map_query = list(mongo.db.sensemap.find({
"hw": hw
}))
# aggregation by max date possible on DB side
@@ -414,7 +416,7 @@ def api_senses_update():
print(ns)
# insert into db
valdb[SENSES_COLL].insert(ns)
mongo.db.senses.insert(ns)
# replace tmp_id with mongo's _id
for ssj_id, el in sense_map.items():
@@ -429,7 +431,7 @@ def api_senses_update():
"date": datetime.datetime.utcnow()
}
# vallex.db["v2_sense_map"].update(key, data, upsert=True)
valdb[SENSEMAP_COLL].insert(data)
mongo.db.sensemap.insert(data)
return "OK"
# SENSES ----------------------------^
@@ -438,25 +440,33 @@ def api_senses_update():
# APP PREFLIGHT ---------------------.
def _is_banned(hw):
banned = True
if hw in BANNED_HEADWORDS:
if hw in app.config["BANNED_HEADWORDS"]:
banned = True
elif hw in sskj_wordlist["wordlist"]:
banned = False
elif (hw + " se") in sskj_wordlist["wordlist"]:
banned = False
if hw[-1] == "_":
log.debug("hw: {}, banned: {}".format(hw, banned))
return banned
def prepare_app_index():
def prepare_app_index(appindex_json):
log.info("[*] preparing app_index")
# create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in CORPORA}
for corpus in CORPORA:
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
for corpus in app.config["CORPORA"]:
res_hws = {}
res_fns = {}
for e in valdb[corpus].find({}):
# print('CORPUS...!!...')
# print(corpus)
# a = mongo.db[corpus]
# print('TEST_OK')
# print(a)
# print(mongo.db)
# a = mongo.db.list_collection_names()
# print('TEST_OK2')
nentries = mongo.db[corpus].count()
idx = 0
for e in mongo.db[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
@@ -471,6 +481,10 @@ def prepare_app_index():
res_fns[fn] += 1
else:
res_fns[fn] = 1
idx += 1
if idx % 10000 == 0:
log.debug("indexing {}: {}/{}".format(
corpus, idx, nentries))
alphabetical = {}
for k, e in res_hws.items():
@@ -482,19 +496,46 @@ def prepare_app_index():
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
# filtered_words = [x for x in words]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)
with Path(appindex_json).open("w") as fp:
json.dump(tmp_app_index, fp)
# APP PREFLIGHT ---------------------^
def init_wsgi(app):
print("Initiating wsgi")
config = None
with Path("/project/prod_conf.yaml").open("r") as fp:
config = list(yaml.safe_load_all(fp))[0]
app.debug = False
logfile = config["logfile"]
logging.basicConfig(filename=logfile, level=logging.INFO)
# app index from db
with Path(config["appindex"]).open("r") as fp:
# a dirty hack but ok
app.config["app_index"] = json.load(fp)
# log.info("[*] Starting app.py with config:\n%s".format(config))
log.info("[*] Starting app.py with config:\n{}".format(config))
# if we don't pass arguments, assume production environment (gunicorn)
if "gunicorn" in sys.argv[0]:
init_wsgi(app)
if __name__ == "__main__":
print("Starting app.py main()")
aparser = argparse.ArgumentParser(description="Arguments for app.py")
@@ -504,9 +545,9 @@ if __name__ == "__main__":
aparser.add_argument("--dbpass", type=str)
aparser.add_argument("--dbaddr", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
aparser.add_argument("--appindex-json", type=str)
args = aparser.parse_args()
config = None
with Path(args.config_file).open("r") as fp:
config = list(yaml.safe_load_all(fp))[0]
@@ -517,27 +558,35 @@ if __name__ == "__main__":
else:
logging.basicConfig(filename=logfile, level=logging.INFO)
"""
# db login
client = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authSource="mongo.db",
authMechanism='SCRAM-SHA-1'
)
valdb = client.valdb
valdb = client.mongo.db
"""
if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index()
prepare_app_index(args.appindex_json)
sys.exit()
# app index from db
app_index = (valdb.appindex.find_one({"dockey": "appindex"}))["data"]
with Path(args.appindex_json).open("r") as fp:
app.config["app_index"] = json.load(fp)
# a = app.config["app_index"]
# b = app.config["app_index"]["kres"]
# c = app.config["app_index"]["kres"]["words"]
# print('HERE')
# log.info("[*] Starting app.py with config:\n%s".format(config))
log.info("[*] Starting app.py with config:\n{}".format(config))
app.run(host=str(config["host"]), port=int(config["port"]))

View File

@@ -0,0 +1,106 @@
import argparse
import json
from flask import Flask
from flask_pymongo import PyMongo
from pathlib import Path
app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
app.config["BANNED_HEADWORDS"] = ["biti"]
def _is_banned(hw):
banned = True
if hw in app.config["BANNED_HEADWORDS"]:
banned = True
elif hw in sskj_wordlist["wordlist"]:
banned = False
elif (hw + " se") in sskj_wordlist["wordlist"]:
banned = False
return banned
def prepare_app_index(appindex_json, corporas, previous_json=None):
if previous_json:
with Path(previous_json).open("r") as fp:
tmp_app_index = json.load(fp)
else:
tmp_app_index = {}
# create app_index (used in frontend, left side word index)
for c in corporas:
tmp_app_index[c] = {}
for corpus in corporas:
res_hws = {}
res_fns = {}
# print('CORPUS...!!...')
# print(corpus)
# a = mongo.db[corpus]
# print('TEST_OK')
# print(a)
# print(mongo.db)
# a = mongo.db.list_collection_names()
# print('TEST_OK2')
nentries = mongo.db[corpus].count()
idx = 0
for e in mongo.db[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
if hw in res_hws:
res_hws[hw] += 1
else:
res_hws[hw] = 1
if "functors" not in e:
continue
for fn in e["functors"]:
if fn in res_fns:
res_fns[fn] += 1
else:
res_fns[fn] = 1
idx += 1
if idx % 10000 == 0:
print("indexing {}: {}/{}".format(
corpus, idx, nentries))
alphabetical = {}
for k, e in res_hws.items():
fst = k[0].lower()
if fst in alphabetical:
alphabetical[fst].append((k, e))
else:
alphabetical[fst] = [(k, e)]
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
# filtered_words = [x for x in words]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
with Path(appindex_json).open("w") as fp:
json.dump(tmp_app_index, fp)
if __name__ == "__main__":
print("Starting app.py main()")
aparser = argparse.ArgumentParser(description="Arguments for app.py")
aparser.add_argument("--previous-json", type=str, default=None)
aparser.add_argument("--appindex-json", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
args = aparser.parse_args()
corporas = ['gigafida']
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index(args.appindex_json, corporas, args.previous_json)

View File

@@ -4,3 +4,4 @@ port: 8084
host: localhost
logfile: "/var/log/valency_backend.log"
---

View File

@@ -1,6 +1,5 @@
---
debug: True
port: 8084
host: 0.0.0.0
logfile: "/var/log/valency_backend.log"
---
appindex: /project/data/appindex.json

View File

@@ -0,0 +1,2 @@
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
MONGO_AUTH_SOURCE = 'admin'

View File

@@ -0,0 +1,8 @@
#!/bin/bash
pip3 install -e /project/src/pkg/cjvt-corpusparser/.
pip3 install -e /project/src/pkg/valency/.
pip3 install -e /project/src/pkg/seqparser/.
cd /project/src/backend_flask
gunicorn -t 4 -b 0.0.0.0:8084 app:app

View File

@@ -0,0 +1,18 @@
import json
import os
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"
results = {}
filenames = os.listdir(input_dir)
len(filenames)
for i, filename in enumerate(filenames):
if filename.endswith(".json"):
with open(os.path.join(input_dir, filename)) as json_file:
data = json.load(json_file)
results[filename.split('-')[0]] = list(data.keys())
print('Progress: %.2f %%' % (100 * i / len(filenames)))
with open(output_file, 'w') as f:
json.dump(results, f)

View File

@@ -1,73 +0,0 @@
# Deprecated: headword creation moved to be part of corpusparser,
# index creation moved to app.py as a preprocessing (with exit) step
CORPORA = ["kres", "ssj"]
if __name__ == "__main__":
valdb = None
def helper_tid_to_token(tid, tokens):
for t in tokens:
if t["tid"] == tid:
return t
return None
# update entries (add headwords and functors for indexing)
for corpus in CORPORA:
for e in valdb[corpus].find({}):
if e["srl_links"] is None:
e["headwords"] = []
e["functors"] = []
else:
hw_tids = list(set([x["from"] for x in e["srl_links"]]))
hw_tokens = [helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
e["headwords"] = headwords
functors = list(set([x["afun"] for x in e["srl_links"]]))
e["functors"] = functors
valdb[corpus].save(e)
valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
# create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in CORPORA}
for corpus in CORPORA:
res_hws = {}
res_fns = {}
for e in valdb[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
if hw in res_hws:
res_hws[hw] += 1
else:
res_hws[hw] = 1
if "functors" not in e:
continue
for fn in e["functors"]:
if fn in res_fns:
res_fns[fn] += 1
else:
res_fns[fn] = 1
alphabetical = {}
for k, e in res_hws.items():
fst = k[0].lower()
if fst in alphabetical:
alphabetical[fst].append((k, e))
else:
alphabetical[fst] = [(k, e)]
for k, e in alphabetical.items():
alphabetical[k] = sorted(e, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
valdb.appindex.update({"dockey": "appindex"}, {"dockey": "appindex", "data": tmp_app_index}, upsert=True)

View File

@@ -18,7 +18,12 @@ dev: build-container clean
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/dev.sh
prod: build-container clean
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
docker run --restart always --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
node-env: clean
docker run --name $(CONNAME) -it -p 8080:8080 -v $(shell pwd):/src $(IMGNAME)
build-prod: build-container clean
docker run --rm -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh

View File

@@ -0,0 +1 @@
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>vue_frontend</title><link href=/static/css/app.05a420a551b5bded5dfec6b370d3edca.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.5d3d2fd333c62579d227.js></script><script type=text/javascript src=/static/js/app.8538f7133303d3e391b2.js></script></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,2 @@
!function(r){var n=window.webpackJsonp;window.webpackJsonp=function(e,u,c){for(var f,i,p,a=0,l=[];a<e.length;a++)i=e[a],o[i]&&l.push(o[i][0]),o[i]=0;for(f in u)Object.prototype.hasOwnProperty.call(u,f)&&(r[f]=u[f]);for(n&&n(e,u,c);l.length;)l.shift()();if(c)for(a=0;a<c.length;a++)p=t(t.s=c[a]);return p};var e={},o={2:0};function t(n){if(e[n])return e[n].exports;var o=e[n]={i:n,l:!1,exports:{}};return r[n].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=r,t.c=e,t.d=function(r,n,e){t.o(r,n)||Object.defineProperty(r,n,{configurable:!1,enumerable:!0,get:e})},t.n=function(r){var n=r&&r.__esModule?function(){return r.default}:function(){return r};return t.d(n,"a",n),n},t.o=function(r,n){return Object.prototype.hasOwnProperty.call(r,n)},t.p="/",t.oe=function(r){throw console.error(r),r}}([]);
//# sourceMappingURL=manifest.2ae2e69a05c33dfc65f8.js.map

View File

@@ -0,0 +1 @@
{"version":3,"sources":["webpack:///webpack/bootstrap d176f5affa434246605f"],"names":["parentJsonpFunction","window","chunkIds","moreModules","executeModules","moduleId","chunkId","result","i","resolves","length","installedChunks","push","Object","prototype","hasOwnProperty","call","modules","shift","__webpack_require__","s","installedModules","2","exports","module","l","m","c","d","name","getter","o","defineProperty","configurable","enumerable","get","n","__esModule","object","property","p","oe","err","console","error"],"mappings":"aACA,IAAAA,EAAAC,OAAA,aACAA,OAAA,sBAAAC,EAAAC,EAAAC,GAIA,IADA,IAAAC,EAAAC,EAAAC,EAAAC,EAAA,EAAAC,KACQD,EAAAN,EAAAQ,OAAoBF,IAC5BF,EAAAJ,EAAAM,GACAG,EAAAL,IACAG,EAAAG,KAAAD,EAAAL,GAAA,IAEAK,EAAAL,GAAA,EAEA,IAAAD,KAAAF,EACAU,OAAAC,UAAAC,eAAAC,KAAAb,EAAAE,KACAY,EAAAZ,GAAAF,EAAAE,IAIA,IADAL,KAAAE,EAAAC,EAAAC,GACAK,EAAAC,QACAD,EAAAS,OAAAT,GAEA,GAAAL,EACA,IAAAI,EAAA,EAAYA,EAAAJ,EAAAM,OAA2BF,IACvCD,EAAAY,IAAAC,EAAAhB,EAAAI,IAGA,OAAAD,GAIA,IAAAc,KAGAV,GACAW,EAAA,GAIA,SAAAH,EAAAd,GAGA,GAAAgB,EAAAhB,GACA,OAAAgB,EAAAhB,GAAAkB,QAGA,IAAAC,EAAAH,EAAAhB,IACAG,EAAAH,EACAoB,GAAA,EACAF,YAUA,OANAN,EAAAZ,GAAAW,KAAAQ,EAAAD,QAAAC,IAAAD,QAAAJ,GAGAK,EAAAC,GAAA,EAGAD,EAAAD,QAKAJ,EAAAO,EAAAT,EAGAE,EAAAQ,EAAAN,EAGAF,EAAAS,EAAA,SAAAL,EAAAM,EAAAC,GACAX,EAAAY,EAAAR,EAAAM,IACAhB,OAAAmB,eAAAT,EAAAM,GACAI,cAAA,EACAC,YAAA,EACAC,IAAAL,KAMAX,EAAAiB,EAAA,SAAAZ,GACA,IAAAM,EAAAN,KAAAa,WACA,WAA2B,OAAAb,EAAA,SAC3B,WAAiC,OAAAA,GAEjC,OADAL,EAAAS,EAAAE,EAAA,IAAAA,GACAA,GAIAX,EAAAY,EAAA,SAAAO,EAAAC,GAAsD,OAAA1B,OAAAC,UAAAC,eAAAC,KAAAsB,EAAAC,IAGtDpB,EAAAqB,EAAA,IAGArB,EAAAsB,GAAA,SAAAC,GAA8D,MAApBC,QAAAC,MAAAF,GAAoBA","file":"static/js/manifest.2ae2e69a05c33dfc65f8.js","sourcesContent":[" \t// install a JSONP callback for chunk loading\n \tvar parentJsonpFunction = window[\"webpackJsonp\"];\n \twindow[\"webpackJsonp\"] = function webpackJsonpCallback(chunkIds, moreModules, executeModules) {\n \t\t// add \"moreModules\" to the modules object,\n \t\t// then flag all \"chunkIds\" as loaded and fire callback\n \t\tvar moduleId, chunkId, i = 0, resolves = [], result;\n \t\tfor(;i < chunkIds.length; i++) {\n \t\t\tchunkId = chunkIds[i];\n \t\t\tif(installedChunks[chunkId]) {\n \t\t\t\tresolves.push(installedChunks[chunkId][0]);\n \t\t\t}\n \t\t\tinstalledChunks[chunkId] = 0;\n \t\t}\n \t\tfor(moduleId in moreModules) {\n \t\t\tif(Object.prototype.hasOwnProperty.call(moreModules, moduleId)) {\n \t\t\t\tmodules[moduleId] = moreModules[moduleId];\n \t\t\t}\n \t\t}\n \t\tif(parentJsonpFunction) parentJsonpFunction(chunkIds, moreModules, executeModules);\n \t\twhile(resolves.length) {\n \t\t\tresolves.shift()();\n \t\t}\n \t\tif(executeModules) {\n \t\t\tfor(i=0; i < executeModules.length; i++) {\n \t\t\t\tresult = __webpack_require__(__webpack_require__.s = executeModules[i]);\n \t\t\t}\n \t\t}\n \t\treturn result;\n \t};\n\n \t// The module cache\n \tvar installedModules = {};\n\n \t// objects to store loaded and loading chunks\n \tvar installedChunks = {\n \t\t2: 0\n \t};\n\n \t// The require function\n \tfunction __webpack_require__(moduleId) {\n\n \t\t// Check if module is in cache\n \t\tif(installedModules[moduleId]) {\n \t\t\treturn installedModules[moduleId].exports;\n \t\t}\n \t\t// Create a new module (and put it into the cache)\n \t\tvar module = installedModules[moduleId] = {\n \t\t\ti: moduleId,\n \t\t\tl: false,\n \t\t\texports: {}\n \t\t};\n\n \t\t// Execute the module function\n \t\tmodules[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n \t\t// Flag the module as 
loaded\n \t\tmodule.l = true;\n\n \t\t// Return the exports of the module\n \t\treturn module.exports;\n \t}\n\n\n \t// expose the modules object (__webpack_modules__)\n \t__webpack_require__.m = modules;\n\n \t// expose the module cache\n \t__webpack_require__.c = installedModules;\n\n \t// define getter function for harmony exports\n \t__webpack_require__.d = function(exports, name, getter) {\n \t\tif(!__webpack_require__.o(exports, name)) {\n \t\t\tObject.defineProperty(exports, name, {\n \t\t\t\tconfigurable: false,\n \t\t\t\tenumerable: true,\n \t\t\t\tget: getter\n \t\t\t});\n \t\t}\n \t};\n\n \t// getDefaultExport function for compatibility with non-harmony modules\n \t__webpack_require__.n = function(module) {\n \t\tvar getter = module && module.__esModule ?\n \t\t\tfunction getDefault() { return module['default']; } :\n \t\t\tfunction getModuleExports() { return module; };\n \t\t__webpack_require__.d(getter, 'a', getter);\n \t\treturn getter;\n \t};\n\n \t// Object.prototype.hasOwnProperty.call\n \t__webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };\n\n \t// __webpack_public_path__\n \t__webpack_require__.p = \"/\";\n\n \t// on error function for async loading\n \t__webpack_require__.oe = function(err) { console.error(err); throw err; };\n\n\n\n// WEBPACK FOOTER //\n// webpack/bootstrap d176f5affa434246605f"],"sourceRoot":""}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -4,4 +4,4 @@ cp ./config/config_prod.json ./config/config.json
npm install
npm run build
http-server /src/dist
# http-server /src/dist

File diff suppressed because it is too large

View File

@@ -38,7 +38,7 @@
"friendly-errors-webpack-plugin": "^1.6.1",
"html-webpack-plugin": "^2.30.1",
"node-notifier": "^5.4.0",
"optimize-css-assets-webpack-plugin": "^5.0.1",
"optimize-css-assets-webpack-plugin": "^3.2.0",
"ora": "^1.2.0",
"portfinder": "^1.0.20",
"postcss-import": "^11.0.0",

View File

@@ -6,7 +6,12 @@
<div class="col-sm-7">
<div class="row">
<div class="col-sm-12">
št. povedi: {{ frameData.sentences.length }}
<span v-if="frameData.sentences.length < frameData.sentence_count">
št. povedi: {{ frameData.sentence_count }} (prikazanih {{ frameData.sentences.length }})
</span>
<span v-else>
št. povedi: {{ frameData.sentences.length }}
</span>
</div>
</div>

View File

@@ -1,6 +1,6 @@
<template>
<nav>
<b-navbar toggleable="md" type="light" variant="light">
<b-navbar id="nav-red-bg" toggleable="md" type="light" variant="light">
<b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
<!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
<b-navbar-brand class=cursorpointer v-on:click="goHome">
@@ -62,7 +62,7 @@ export default {
name: "Nav",
props: ["appState"],
data() {return {
optCorpora: ["kres", "ssj"],
optCorpora: ["kres", "ssj", "gigafida"],
optIndexes: [
{key: "besede", val: "words"},
{key: "udeleženske vloge", val: "functors"},
@@ -112,7 +112,7 @@ export default {
</script>
<style>
.bg-light {
#nav-red-bg {
background-color: rgb(183,21,17,0.9) !important;
}
nav a {

0
src/pkg/__init__.py Normal file
View File

View File

@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
log = logging.getLogger(__name__)
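# Builds Frame objects for a single headword from one DB sentence entry; mirrors frames_from_db_entry below.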
def frames_from_db_entry_headword(dbent, headword):
def _full_tid(tid):
return ".".join([dbent["sid"], str(tid)])
token_dict = {str(x["tid"]): x for x in dbent["tokens"]}
frames = []
if "srl_links" not in dbent:
return []
srldict = {}
for srl in dbent["srl_links"]:
key = str(srl["from"])
if enriched_lemma(token_dict[key]) != headword:
continue
if key not in srldict:
srldict[key] = [srl]
else:
srldict[key] += [srl]
for hwtid, srlarr in srldict.items():
frames += [Frame(
hw_lemma=enriched_lemma(token_dict[hwtid]),
tids=[_full_tid(hwtid)],
slots=[
Slot(
functor=srl["afun"],
tids=[_full_tid(srl["to"])]
) for srl in srlarr
],
# sentences=[(dbent["sid"], dbent["tokens"])],
sentences=[
[(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
]
)]
return frames
def frames_from_db_entry(dbent):
def _full_tid(tid):
return ".".join([dbent["sid"], str(tid)])
@@ -37,7 +72,8 @@ def frames_from_db_entry(dbent):
return frames
class Frame():
def __init__(self, tids, deep_links=None, slots=None, hw_lemma=None, sentences=None):
def __init__(self, tids, deep_links=None, slots=None,
hw_lemma=None, sentences=None, sentence_count=None):
self.hw = hw_lemma
self.tids = tids # list of tokens with the same hw_lemma
# Each tid = "S123.t123";
@@ -50,6 +86,8 @@ class Frame():
self.sense_info = {}
self.sentences = sentences
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
self.sentence_count = sentence_count # paging, optimization
def get_functors(self):
return [slot.functor for slot in self.slots]
@@ -62,7 +100,8 @@ class Frame():
"slots": [slot.to_json() for slot in self.slots],
"sentences": self.sentences,
"aggr_sent": self.aggr_sent,
"sense_info": self.sense_info
"sense_info": self.sense_info,
"sentence_count": self.sentence_count
}
return ret

View File

@@ -1,96 +0,0 @@
import logging
log = logging.getLogger(__name__)
class Frame():
def __init__(self, tids, deep_links=None, slots=None, hw=None):
self.hw = hw
self.tids = tids # list of tokens with the same hw_lemma
# Each tid = "S123.t123";
# you can get sentence with vallex.get_sentence(S123)
self.slots = []
if slots is None:
self.slots = self.init_slots(deep_links)
else:
self.slots = slots
self.sense_info = {}
self.sentences = None # Used for passing to view in app.py, get_frames
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
def to_json(self):
ret = {
"hw": self.hw,
"tids": self.tids,
"slots": [slot.to_json() for slot in self.slots],
"sentences": self.sentences,
"aggr_sent": self.aggr_sent,
"sense_info": self.sense_info
}
return ret
def init_slots(self, deep):
slots = []
for link in deep:
slots.append(Slot(
functor=link["functor"],
tids=[link["to"]]
))
return slots
def sort_slots(self):
# ACT, PAT, alphabetically
srt1 = [
x for x in self.slots
if (x.functor == "ACT" or
x.functor == "PAT")
]
srt1 = sorted(srt1, key=lambda x: x.functor)
srt2 = [
x for x in self.slots
if (x.functor != "ACT" and
x.functor != "PAT")
]
srt2 = sorted(srt2, key=lambda x: x.functor)
self.slots = (srt1 + srt2)
def to_string(self):
ret = "Frame:\n"
ret += "sense_info: {}\n".format(str(self.sense_info))
ret += "tids: ["
for t in self.tids:
ret += (str(t) + ", ")
ret += "]\n"
if self.slots is not None:
ret += "slots:\n"
for sl in self.slots:
ret += (sl.to_string() + "\n")
return ret
class Slot():
# Each slot is identified by its functor (ACT, PAT, ...)
# It consists of different tokens.
def __init__(self, functor, tids=None, count=None):
self.functor = functor
self.tids = tids or [] # combining multiple sentences vertically
self.count = count or 1
def to_string(self):
ret = "---- Slot:\n"
ret += "functor: {}\n".format(self.functor)
ret += "tids: ["
for t in self.tids:
ret += (str(t) + ", ")
ret += "]\n"
ret += "]\n"
ret += "----\n"
return ret
def to_json(self):
ret = {
"functor": self.functor,
"tids": self.tids,
"count": self.count
}
return ret

View File

@@ -9,6 +9,7 @@ import logging
log = logging.getLogger(__name__)
SENSE_UNDEFINED = "nedefinirano"
SENTENCE_LIMIT = 10
## TODO: use frame.py
## TODO: build a list of [Frame] with lists of [Slot]
@@ -70,7 +71,10 @@ def reduce_1(frames, valdb_sensemap=None):
for functor in fs[0]:
slots[functor] = Slot(functor=functor)
# Reduce slots from all frames. (Merge ACT from all frames, ...)
sentence_count = len(fs[1])
for frame in fs[1]:
if len(tids) >= SENTENCE_LIMIT:
break
tids += frame.tids
sentences += frame.sentences
for sl in frame.slots:
@@ -78,8 +82,13 @@ def reduce_1(frames, valdb_sensemap=None):
slots_list = []
for k, e in slots.items():
slots_list.append(e)
# TODO does appending hw_lemma of first frame work for functor frames too?
rf = Frame(hw_lemma=fs[1][0].hw, tids=tids, slots=slots_list, sentences=sentences)
rf = Frame(
hw_lemma=fs[1][0].hw,
tids=tids,
slots=slots_list,
sentences=sentences,
sentence_count=sentence_count
)
rf.sort_slots()
ret_frames.append(rf)
return sorted_by_len_tids(ret_frames)
@@ -182,7 +191,11 @@ def frames_from_sense_ids(raw_frames, id_map):
tids = []
reduced_slots = []
sentences = []
sentence_count = len(frames)
for frame in frames:
if len(tids) >= SENTENCE_LIMIT:
break
tids += frame.tids
sentences += frame.sentences
for slot in frame.slots:
@@ -204,7 +217,8 @@ def frames_from_sense_ids(raw_frames, id_map):
hw_lemma="derp",
tids=tids,
slots=reduced_slots,
sentences=sentences
sentences=sentences,
sentence_count=sentence_count,
)
id_map_entry = (
id_map.get(tids[0]) or